Illumotion committed · commit 57c742e
Parent(s): 6a6900d

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .gitignore +4 -0
- CMakeLists.txt +2 -0
- Dockerfile +11 -4
- Makefile +16 -14
- Package.swift +4 -3
- colab.ipynb +1 -1
- common/CMakeLists.txt +2 -0
- common/common.cpp +56 -172
- common/common.h +4 -39
- common/sampling.cpp +166 -0
- common/sampling.h +108 -0
- convert-bloom-hf-to-gguf.py +238 -0
- convert-mpt-hf-to-gguf.py +216 -0
- convert-refact-hf-to-gguf.py +263 -0
- examples/CMakeLists.txt +1 -0
- examples/batched-bench/CMakeLists.txt +5 -0
- examples/batched-bench/README.md +51 -0
- examples/batched-bench/batched-bench.cpp +251 -0
- examples/batched.swift/.gitignore +9 -0
- examples/batched.swift/Makefile +6 -0
- examples/batched.swift/Package.swift +22 -0
- examples/batched.swift/README.md +4 -0
- examples/batched.swift/Sources/main.swift +255 -0
- examples/batched/batched.cpp +1 -1
- examples/embd-input/embd-input-lib.cpp +10 -9
- examples/infill/infill.cpp +800 -0
- examples/main/main.cpp +17 -13
- examples/parallel/parallel.cpp +57 -9
- examples/save-load-state/save-load-state.cpp +3 -2
- examples/server/index.html.hpp +0 -0
- examples/server/public/index.html +133 -58
- examples/server/server.cpp +308 -145
- examples/speculative/speculative.cpp +13 -5
- ggml-alloc.c +62 -107
- ggml-alloc.h +11 -5
- ggml-backend.c +385 -0
- ggml-backend.h +143 -0
- ggml-cuda.cu +500 -78
- ggml-cuda.h +4 -0
- ggml-metal.h +18 -1
- ggml-metal.m +152 -9
- ggml-metal.metal +12 -6
- ggml.c +23 -45
- ggml.h +9 -7
- gguf-py/gguf/gguf.py +70 -42
- gpttype_adapter.cpp +1 -1
- koboldcpp.py +105 -53
- llama.cpp +844 -65
- otherarch/llama_v3.cpp +7 -8
- prompts/mnemonics.txt +93 -0
.gitignore
CHANGED
@@ -45,6 +45,7 @@ models-mnt
 /server
 /simple
 /batched
+/batched-bench
 /export-lora
 /finetune
 /speculative
@@ -106,3 +107,6 @@ tests/test-tokenizer-1-bpe
 rocblas.dll
 hipblas.dll
 koboldcpp_hipblas.so
+
+# Jetbrains idea folder
+.idea/
CMakeLists.txt
CHANGED
@@ -356,6 +356,8 @@ add_library(ggml OBJECT
             ggml.h
             ggml-alloc.c
             ggml-alloc.h
+            ggml-backend.c
+            ggml-backend.h
             k_quants.h
             k_quants.c
             ${GGML_SOURCES_CUDA})
Dockerfile
CHANGED
@@ -2,10 +2,17 @@ FROM python
 WORKDIR /app
 COPY . .
 RUN apt update \
-    && apt install build-essential wget libopenblas-dev make -y \
-    && [removed line truncated in this view]
+    && apt install build-essential wget libopenblas-dev make cmake -y \
+    && mkdir build \
     && wget https://huggingface.co/TheBloke/Pygmalion-2-7B-GGUF/resolve/main/pygmalion-2-7b.Q6_K.gguf \
-    [removed line truncated in this view]
-    && [removed line truncated in this view]
+    https://github.com/mozilla/sccache/releases/download/v0.5.4/sccache-dist-v0.5.4-x86_64-unknown-linux-musl.tar.gz \
+    && tar -vxzf sccache-dist-v0.5.4-x86_64-unknown-linux-musl.tar.gz \
+    && mv sccache-dist-v0.5.4-x86_64-unknown-linux-musl/sccache /usr/bin/sccache\
+    && cd build \
+    && cmake .. -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
+    && cmake --build . \
+    && cd .. \
+    && apt remove build-essential wget make cmake -y \
+    && rm -fr *.bat convert-* ci docs examples otherarchs tests sccache-dist-v0.5.4-x86_64-unknown-linux-musl*
 
 ENTRYPOINT ["python", "koboldcpp.py", "pygmalion-2-7b.Q6_K.gguf", "--port", "7860", "--smartcontext"]
Makefile
CHANGED
@@ -372,6 +372,8 @@ endif # LLAMA_NO_K_QUANTS
 #there's no intrinsics or special gpu ops used here, so we can have a universal object
 ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 	$(CC) $(CFLAGS) -c $< -o $@
+ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
+	$(CC) $(CFLAGS) -c $< -o $@
 
 #version 2 libs
 ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
@@ -402,7 +404,7 @@ ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-ope
 	$(CC) $(CFLAGS) -c $< -o $@
 
 # intermediate objects
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h otherarch/llama-util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 common.o: common/common.cpp common/common.h common/log.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -427,7 +429,7 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
 clean:
 	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so
 
-main: examples/main/main.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o llama.o common.o console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
@@ -438,11 +440,11 @@ gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
 
 
 #generated libraries
-koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
+koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
 	$(DEFAULT_BUILD)
 
 ifdef OPENBLAS_BUILD
-koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
+koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
 	$(OPENBLAS_BUILD)
 else
 koboldcpp_openblas:
@@ -450,7 +452,7 @@ koboldcpp_openblas:
 endif
 
 ifdef FAILSAFE_BUILD
-koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o grammar-parser.o $(OBJS)
+koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
 	$(FAILSAFE_BUILD)
 else
 koboldcpp_failsafe:
@@ -458,7 +460,7 @@ koboldcpp_failsafe:
 endif
 
 ifdef NOAVX2_BUILD
-koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o grammar-parser.o $(OBJS)
+koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
 	$(NOAVX2_BUILD)
 else
 koboldcpp_noavx2:
@@ -466,7 +468,7 @@ koboldcpp_noavx2:
 endif
 
 ifdef CLBLAST_BUILD
-koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o grammar-parser.o $(OBJS)
+koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
 	$(CLBLAST_BUILD)
 else
 koboldcpp_clblast:
@@ -474,7 +476,7 @@ koboldcpp_clblast:
 endif
 
 ifdef CUBLAS_BUILD
-koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
+koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
 	$(CUBLAS_BUILD)
 else
 koboldcpp_cublas:
@@ -482,7 +484,7 @@ koboldcpp_cublas:
 endif
 
 ifdef HIPBLAS_BUILD
-koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o grammar-parser.o $(HIP_OBJS) $(OBJS)
+koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(HIP_OBJS) $(OBJS)
 	$(HIPBLAS_BUILD)
 else
 koboldcpp_hipblas:
@@ -490,15 +492,15 @@ koboldcpp_hipblas:
 endif
 
 # tools
-quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o
+quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
+quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
Package.swift
CHANGED
@@ -1,10 +1,10 @@
-// swift-tools-version:5.[removed line truncated in this view]
+// swift-tools-version:5.5
 
 import PackageDescription
 
 #if arch(arm) || arch(arm64)
 let platforms: [SupportedPlatform]? = [
-    .macOS(.[removed line truncated in this view]
+    .macOS(.v12),
     .iOS(.v14),
     .watchOS(.v4),
     .tvOS(.v14)
@@ -41,12 +41,13 @@ let package = Package(
                 "ggml.c",
                 "llama.cpp",
                 "ggml-alloc.c",
+                "ggml-backend.c",
                 "k_quants.c",
             ] + additionalSources,
             resources: resources,
             publicHeadersPath: "spm-headers",
             cSettings: [
-                .unsafeFlags(["-Wno-shorten-64-to-32"]),
+                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
                 .define("GGML_USE_K_QUANTS"),
                 .define("GGML_USE_ACCELERATE")
                 // NOTE: NEW_LAPACK will required iOS version 16.4+
colab.ipynb
CHANGED
@@ -33,7 +33,7 @@
     "!nohup ./cloudflared-linux-amd64 tunnel --url http://localhost:5001 &\r\n",
     "!sleep 10\r\n",
     "!cat nohup.out\r\n",
-    "!python koboldcpp.py model.ggml --usecublas 0 mmq --gpulayers $Layers[removed line truncated in this view]
+    "!python koboldcpp.py model.ggml --usecublas 0 mmq --gpulayers $Layers\r\n"
   ]
 }
 ],
common/CMakeLists.txt
CHANGED
@@ -5,6 +5,8 @@ set(TARGET common)
 add_library(${TARGET} OBJECT
     common.h
     common.cpp
+    sampling.h
+    sampling.cpp
    console.h
    console.cpp
    grammar-parser.h
common/common.cpp
CHANGED
@@ -107,6 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     std::string arg;
     gpt_params default_params;
     const std::string arg_prefix = "--";
+    llama_sampling_params & sparams = params.sampling_params;
 
     for (int i = 1; i < argc; i++) {
         arg = argv[i];
@@ -184,7 +185,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.top_k = std::stoi(argv[i]);
+            sparams.top_k = std::stoi(argv[i]);
         } else if (arg == "-c" || arg == "--ctx-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -216,73 +217,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.top_p = std::stof(argv[i]);
+            sparams.top_p = std::stof(argv[i]);
         } else if (arg == "--temp") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.temp = std::stof(argv[i]);
+            sparams.temp = std::stof(argv[i]);
         } else if (arg == "--tfs") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.tfs_z = std::stof(argv[i]);
+            sparams.tfs_z = std::stof(argv[i]);
         } else if (arg == "--typical") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.typical_p = std::stof(argv[i]);
+            sparams.typical_p = std::stof(argv[i]);
         } else if (arg == "--repeat-last-n") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_last_n = std::stoi(argv[i]);
+            sparams.repeat_last_n = std::stoi(argv[i]);
         } else if (arg == "--repeat-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.repeat_penalty = std::stof(argv[i]);
+            sparams.repeat_penalty = std::stof(argv[i]);
         } else if (arg == "--frequency-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.frequency_penalty = std::stof(argv[i]);
+            sparams.frequency_penalty = std::stof(argv[i]);
         } else if (arg == "--presence-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.presence_penalty = std::stof(argv[i]);
+            sparams.presence_penalty = std::stof(argv[i]);
         } else if (arg == "--mirostat") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat = std::stoi(argv[i]);
+            sparams.mirostat = std::stoi(argv[i]);
         } else if (arg == "--mirostat-lr") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat_eta = std::stof(argv[i]);
+            sparams.mirostat_eta = std::stof(argv[i]);
         } else if (arg == "--mirostat-ent") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.mirostat_tau = std::stof(argv[i]);
+            sparams.mirostat_tau = std::stof(argv[i]);
         } else if (arg == "--cfg-negative-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.cfg_negative_prompt = argv[i];
+            sparams.cfg_negative_prompt = argv[i];
         } else if (arg == "--cfg-negative-prompt-file") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -294,16 +295,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
-            if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
-                params.cfg_negative_prompt.pop_back();
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt));
+            if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') {
+                sparams.cfg_negative_prompt.pop_back();
             }
         } else if (arg == "--cfg-scale") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            params.cfg_scale = std::stof(argv[i]);
+            sparams.cfg_scale = std::stof(argv[i]);
         } else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -512,7 +513,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         } else if (arg == "--ignore-eos") {
             params.ignore_eos = true;
         } else if (arg == "--no-penalize-nl") {
-            params.penalize_nl = false;
+            sparams.penalize_nl = false;
         } else if (arg == "-l" || arg == "--logit-bias") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -524,7 +525,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             std::string value_str;
             try {
                 if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                    params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+                    sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
                 } else {
                     throw std::exception();
                 }
@@ -627,6 +628,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }
 
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    const llama_sampling_params & sparams = params.sampling_params;
+
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
     printf("options:\n");
@@ -659,19 +662,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
+    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
+    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
+    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
+    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
+    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
+    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
+    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
     printf("  --mirostat N          use Mirostat sampling.\n");
     printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
+    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
+    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
     printf("  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
     printf("                        modifies the likelihood of token appearing in the completion,\n");
     printf("                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
@@ -682,7 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        negative prompt to use for guidance. (default: empty)\n");
     printf("  --cfg-negative-prompt-file FNAME\n");
     printf("                        negative prompt file to use for guidance. (default: empty)\n");
-    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
     printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
     printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
     printf("  --rope-freq-scale N   RoPE frequency linear scaling factor (default: loaded from model)\n");
@@ -690,7 +693,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --no-penalize-nl      do not penalize newline token\n");
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
-    printf("  --temp N              temperature (default: %.1f)\n", (double)params.temp);
+    printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
     printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
     printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
     printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
@@ -840,7 +843,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     }
 
     if (params.ignore_eos) {
-        params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+        params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
     }
 
     {
@@ -932,127 +935,6 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
     return result;
 }
 
-//
-// Sampling utils
-//
-
-llama_token llama_sample_token(
-                  struct llama_context * ctx,
-                  struct llama_context * ctx_guidance,
-                  struct llama_grammar * grammar,
-            const struct gpt_params & params,
-            const std::vector<llama_token> & last_tokens,
-                  std::vector<llama_token_data> & candidates,
-                  int idx) {
-    const int n_ctx   = llama_n_ctx(ctx);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-    const float   temp            = params.temp;
-    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
-    const float   top_p           = params.top_p;
-    const float   tfs_z           = params.tfs_z;
-    const float   typical_p       = params.typical_p;
-    const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-    const float   repeat_penalty  = params.repeat_penalty;
-    const float   alpha_presence  = params.presence_penalty;
-    const float   alpha_frequency = params.frequency_penalty;
-    const int     mirostat        = params.mirostat;
-    const float   mirostat_tau    = params.mirostat_tau;
-    const float   mirostat_eta    = params.mirostat_eta;
-    const bool    penalize_nl     = params.penalize_nl;
-
-    llama_token id = 0;
-
-    float * logits = llama_get_logits_ith(ctx, idx);
-
-    // Apply params.logit_bias map
-    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-        logits[it->first] += it->second;
-    }
-
-    candidates.clear();
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-    }
-
-    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
-
-    if (ctx_guidance) {
-        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
-    }
-
-    // apply penalties
-    if (!last_tokens.empty()) {
-        const float nl_logit = logits[llama_token_nl(ctx)];
-        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
-
-        llama_sample_repetition_penalty(ctx, &cur_p,
-                last_tokens.data() + last_tokens.size() - last_n_repeat,
-                last_n_repeat, repeat_penalty);
-        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
-                last_tokens.data() + last_tokens.size() - last_n_repeat,
-                last_n_repeat, alpha_frequency, alpha_presence);
-
-        if (!penalize_nl) {
-            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
-                    cur_p.data[idx].logit = nl_logit;
-                    break;
-                }
-            }
-        }
-    }
-
-    if (grammar != NULL) {
-        llama_sample_grammar(ctx, &cur_p, grammar);
-    }
-
-    if (temp <= 0) {
-        // Greedy sampling
-        id = llama_sample_token_greedy(ctx, &cur_p);
-    } else {
-        if (mirostat == 1) {
-            static float mirostat_mu = 2.0f * mirostat_tau;
-            const int mirostat_m = 100;
-            llama_sample_temp(ctx, &cur_p, temp);
-            id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-        } else if (mirostat == 2) {
-            static float mirostat_mu = 2.0f * mirostat_tau;
-            llama_sample_temp(ctx, &cur_p, temp);
-            id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-        } else {
-            // Temperature sampling
-            size_t min_keep = std::max(1, params.n_probs);
-            llama_sample_top_k    (ctx, &cur_p, top_k, min_keep);
-            llama_sample_tail_free(ctx, &cur_p, tfs_z, min_keep);
-            llama_sample_typical  (ctx, &cur_p, typical_p, min_keep);
-            llama_sample_top_p    (ctx, &cur_p, top_p, min_keep);
-            llama_sample_temp(ctx, &cur_p, temp);
-
-            {
-                const int n_top = 10;
-                LOG("top %d candidates:\n", n_top);
-
-                for (int i = 0; i < n_top; i++) {
-                    const llama_token id = cur_p.data[i].id;
-                    LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
-                }
-            }
-
-            id = llama_sample_token(ctx, &cur_p);
-
-            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
-        }
-    }
-    // printf("`%d`", candidates_p.size);
-
-    if (grammar != NULL) {
-        llama_grammar_accept_token(ctx, grammar, id);
-    }
-
-    return id;
-}
-
 //
 // YAML utils
 //
@@ -1204,6 +1086,8 @@ std::string get_sortable_timestamp() {
 
 void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
                                const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
+    const llama_sampling_params & sparams = params.sampling_params;
+
     fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
     fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
     fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
@@ -1250,21 +1134,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 
     fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
     fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
-    fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
+    dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
+    fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
     fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
     fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
     fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
     fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
     fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
+    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
     dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
     fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
     fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
     fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
 
-    const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
-    const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
+    const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
+    const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
     fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
 
     dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
@@ -1277,7 +1161,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
 
     fprintf(stream, "logit_bias:\n");
-    for (std::pair<llama_token, float> lb : params.logit_bias) {
+    for (std::pair<llama_token, float> lb : sparams.logit_bias) {
         if (ignore_eos && lb.first == logit_bias_eos->first) {
             continue;
         }
@@ -1301,30 +1185,30 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
-    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
-    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
-    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
+    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
+    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
+    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
     fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
     fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
     fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
-    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
+    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
     fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
     fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
-    fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
+    fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
     fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
     fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
     fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
+    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
     dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
     fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
     fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
     dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
     fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
+    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
 
     fprintf(stream, "reverse_prompt:\n");
     for (std::string ap : params.antiprompt) {
@@ -1342,15 +1226,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
+    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
 
     const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
 
-    fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
+    fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
     fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
-    fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
-    fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
-    fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
+    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
+    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
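Taken together, the common.cpp hunks are a mechanical move of the sampler settings from gpt_params itself onto its new sampling_params member; call sites only change where they read the values. A minimal sketch of the call-site impact (illustrative only, not part of the commit):

    // Before this commit, sampler knobs lived directly on gpt_params:
    //     const int32_t top_k = params.top_k;
    // After it, they live on the nested llama_sampling_params struct:
    const llama_sampling_params & sparams = params.sampling_params;
    const int32_t top_k = sparams.top_k;   // same value, new home

    // The --ignore-eos bias moves the same way (line taken from the hunk above):
    params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;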
common/common.h
CHANGED
@@ -4,6 +4,8 @@
 
 #include "llama.h"
 
+#include "sampling.h"
+
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
 
@@ -49,7 +51,6 @@ struct gpt_params {
     int32_t n_gpu_layers_draft              = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
     float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
-    int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
     int32_t n_beams                         = 0;    // if non-zero then use beam search of given width.
     float   rope_freq_base                  = 0.0f; // RoPE base frequency
     float   rope_freq_scale                 = 0.0f; // RoPE frequency scaling factor
@@ -67,13 +68,8 @@ struct gpt_params {
     int32_t mirostat     = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float   mirostat_tau = 5.00f; // target entropy
     float   mirostat_eta = 0.10f; // learning rate
-    [three removed lines truncated in this view]
-    // Classifier-Free Guidance
-    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt; // string to help guidance
-    float       cfg_scale = 1.f;     // How strong is guidance
+    // // sampling parameters
+    struct llama_sampling_params sampling_params;
 
     std::string model       = "models/7B/ggml-model-f16.gguf"; // model path
     std::string model_draft = "";                              // draft model for speculative decoding
@@ -115,7 +111,6 @@ struct gpt_params {
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos       = false; // ignore generated EOS tokens
     bool instruct         = false; // instruction mode (used for Alpaca models)
-    bool penalize_nl      = true;  // consider newlines as a repeatable token
     bool logits_all       = false; // return logits for all tokens in the batch
     bool use_mmap         = true;  // use mmap for faster loads
     bool use_mlock        = false; // use mlock to keep model in memory
@@ -180,36 +175,6 @@ std::string llama_detokenize_bpe(
         llama_context * ctx,
         const std::vector<llama_token> & tokens);
 
-//
-// Sampling utils
-//
-
-// this is a common sampling function used across the examples for convenience
-// it can serve as a starting point for implementing your own sampling function
-//
-// required:
-//  - ctx:    context to use for sampling
-//  - params: sampling parameters
-//
-// optional:
-//  - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
-//  - grammar:      grammar to use for sampling, ignore if NULL
-//  - last_tokens:  needed for repetition penalty, ignore if empty
-//  - idx:          sample from llama_get_logits_ith(ctx, idx)
-//
-// returns:
-//  - token:      sampled token
-//  - candidates: vector of candidate tokens
-//
-llama_token llama_sample_token(
-                  struct llama_context * ctx,
-                  struct llama_context * ctx_guidance,
-                  struct llama_grammar * grammar,
-            const struct gpt_params & params,
-            const std::vector<llama_token> & last_tokens,
-                  std::vector<llama_token_data> & candidates,
-                  int idx = 0);
-
 //
 // YAML utils
 //
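common/sampling.h (+108 lines) is part of this commit but falls outside the truncated view above. Judging only from how common.h, common.cpp and sampling.cpp use it, it plausibly declares something along these lines; the field order, defaults and container types below are inferences from this diff, not the actual header:

    // Inferred sketch of common/sampling.h -- NOT the real header from this commit.
    #include "llama.h"
    #include <string>
    #include <map>
    #include <unordered_map>

    struct llama_sampling_params {
        int32_t top_k             = 40;    // defaults echo the YAML-dump comments above
        float   top_p             = 0.95f;
        float   tfs_z             = 1.00f;
        float   typical_p         = 1.00f;
        float   temp              = 0.80f;
        int32_t repeat_last_n;             // default not visible in this truncated view
        float   repeat_penalty    = 1.10f;
        float   presence_penalty  = 0.00f;
        float   frequency_penalty = 0.00f;
        int     mirostat          = 0;     // 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0
        float   mirostat_tau      = 5.00f;
        float   mirostat_eta      = 0.10f;
        bool    penalize_nl       = true;
        int32_t n_probs           = 0;     // probabilities of top n_probs tokens (server)
        std::string cfg_negative_prompt;   // classifier-free guidance
        float       cfg_scale     = 1.0f;
        std::unordered_map<llama_token, float> logit_bias; // some map-like container
    };

    struct llama_sampler_sequence_context {
        float           mirostat_mu; // initialized to 2 * mirostat_tau in sampling.cpp
        llama_grammar * grammar;     // per-sequence grammar copy (may be NULL)
    };

    struct llama_sampling_context {
        llama_sampling_params params;
        llama_grammar *       grammar;
        std::map<llama_seq_id, llama_sampler_sequence_context> sequence_contexts;
        ~llama_sampling_context();   // frees any per-sequence grammar copies
    };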
common/sampling.cpp
ADDED
@@ -0,0 +1,166 @@
+#include "sampling.h"
+
+llama_sampling_context::~llama_sampling_context() {
+    for (auto & it : sequence_contexts) {
+        if (it.second.grammar != NULL) {
+            llama_grammar_free(it.second.grammar);
+            it.second.grammar = NULL;
+        }
+    }
+}
+
+llama_sampling_context llama_sampling_context_init(
+        const struct gpt_params & params,
+                  llama_grammar * grammar) {
+    llama_sampling_context result;
+
+    result.params = params.sampling_params;
+    result.grammar = grammar;
+    return result;
+}
+
+// Note: Creates the context if it doesn't exist, so this always return something.
+llama_sampler_sequence_context & llama_sampling_get_sequence_context(
+        llama_sampling_context & ctx_sampling,
+        const llama_seq_id seq) {
+    const auto it = ctx_sampling.sequence_contexts.find(seq);
+    if (it != ctx_sampling.sequence_contexts.end()) {
+        return it->second;
+    }
+    llama_sampler_sequence_context new_ctx = {
+        2.0f * ctx_sampling.params.mirostat_tau,
+        ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL,
+    };
+    return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second;
+}
+
+bool llama_sampling_context_reset(
+        llama_sampling_context & ctx_sampling,
+        const llama_seq_id seq) {
+    const auto it = ctx_sampling.sequence_contexts.find(seq);
+    if (it == ctx_sampling.sequence_contexts.end()) return false;
+    if (it->second.grammar != NULL) {
+        llama_grammar_free(it->second.grammar);
+        it->second.grammar = NULL;
+    }
+    ctx_sampling.sequence_contexts.erase(it);
+    return true;
+}
+
+llama_token llama_sampling_sample(
+                  struct llama_context * ctx,
+                  struct llama_context * ctx_guidance,
+                  struct llama_sampling_context & ctx_sampling,
+            const std::vector<llama_token> & last_tokens,
+                  std::vector<llama_token_data> & candidates,
+            const int idx,
+                  llama_seq_id seq) {
+    const int n_ctx   = llama_n_ctx(ctx);
+    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+    const llama_sampling_params & params = ctx_sampling.params;
+    const float   temp            = params.temp;
+    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
+    const float   top_p           = params.top_p;
+    const float   tfs_z           = params.tfs_z;
+    const float   typical_p       = params.typical_p;
+    const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
+    const float   repeat_penalty  = params.repeat_penalty;
+    const float   alpha_presence  = params.presence_penalty;
+    const float   alpha_frequency = params.frequency_penalty;
+    const int     mirostat        = params.mirostat;
+    const float   mirostat_tau    = params.mirostat_tau;
+    const float   mirostat_eta    = params.mirostat_eta;
+    const bool    penalize_nl     = params.penalize_nl;
+
+    llama_token id = 0;
+
+    float * logits = llama_get_logits_ith(ctx, idx);
+
+    // Apply params.logit_bias map
+    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
+        logits[it->first] += it->second;
+    }
+
+    candidates.clear();
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+    }
+
+    llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
+
+    if (ctx_guidance) {
+        llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
+    }
+
+    // apply penalties
+    if (!last_tokens.empty()) {
+        const float nl_logit = logits[llama_token_nl(ctx)];
+        const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
+
+        llama_sample_repetition_penalty(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, repeat_penalty);
+        llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
+                last_tokens.data() + last_tokens.size() - last_n_repeat,
+                last_n_repeat, alpha_frequency, alpha_presence);
+
+        if (!penalize_nl) {
+            for (size_t idx = 0; idx < cur_p.size; idx++) {
+                if (cur_p.data[idx].id == llama_token_nl(ctx)) {
+                    cur_p.data[idx].logit = nl_logit;
+                    break;
+                }
+            }
+        }
+    }
+
+    llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq);
+
[The remainder of the 166 added lines is cut off in this view.]
|
119 |
+
|
120 |
+
if (ctx_seq.grammar != NULL) {
|
121 |
+
llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar);
|
122 |
+
}
|
123 |
+
|
124 |
+
if (temp <= 0) {
|
125 |
+
// Greedy sampling
|
126 |
+
id = llama_sample_token_greedy(ctx, &cur_p);
|
127 |
+
} else {
|
128 |
+
if (mirostat == 1) {
|
129 |
+
const int mirostat_m = 100;
|
130 |
+
llama_sample_temp(ctx, &cur_p, temp);
|
131 |
+
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu);
|
132 |
+
} else if (mirostat == 2) {
|
133 |
+
llama_sample_temp(ctx, &cur_p, temp);
|
134 |
+
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu);
|
135 |
+
} else {
|
136 |
+
// Temperature sampling
|
137 |
+
size_t min_keep = std::max(1, params.n_probs);
|
138 |
+
llama_sample_top_k (ctx, &cur_p, top_k, min_keep);
|
139 |
+
llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep);
|
140 |
+
llama_sample_typical (ctx, &cur_p, typical_p, min_keep);
|
141 |
+
llama_sample_top_p (ctx, &cur_p, top_p, min_keep);
|
142 |
+
llama_sample_temp(ctx, &cur_p, temp);
|
143 |
+
|
144 |
+
{
|
145 |
+
const int n_top = 10;
|
146 |
+
LOG("top %d candidates:\n", n_top);
|
147 |
+
|
148 |
+
for (int i = 0; i < n_top; i++) {
|
149 |
+
const llama_token id = cur_p.data[i].id;
|
150 |
+
(void)id; // To avoid a warning that id is unused when logging is disabled.
|
151 |
+
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
|
152 |
+
}
|
153 |
+
}
|
154 |
+
|
155 |
+
id = llama_sample_token(ctx, &cur_p);
|
156 |
+
|
157 |
+
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
|
158 |
+
}
|
159 |
+
}
|
160 |
+
|
161 |
+
if (ctx_seq.grammar != NULL) {
|
162 |
+
llama_grammar_accept_token(ctx, ctx_seq.grammar, id);
|
163 |
+
}
|
164 |
+
|
165 |
+
return id;
|
166 |
+
}
|
common/sampling.h
ADDED
@@ -0,0 +1,108 @@
#pragma once

#include "llama.h"

#include <string>
#include <vector>
#include <unordered_map>

// sampling parameters
typedef struct llama_sampling_params {
    int32_t top_k             = 40;    // <= 0 to use vocab size
    float   top_p             = 0.95f; // 1.0 = disabled
    float   tfs_z             = 1.00f; // 1.0 = disabled
    float   typical_p         = 1.00f; // 1.0 = disabled
    float   temp              = 0.80f; // 1.0 = disabled
    float   repeat_penalty    = 1.10f; // 1.0 = disabled
    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float   frequency_penalty = 0.00f; // 0.0 = disabled
    float   presence_penalty  = 0.00f; // 0.0 = disabled
    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   mirostat_tau      = 5.00f; // target entropy
    float   mirostat_eta      = 0.10f; // learning rate

    bool    penalize_nl       = true;  // consider newlines as a repeatable token

    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.

    // Classifier-Free Guidance
    // https://arxiv.org/abs/2306.17806
    std::string cfg_negative_prompt; // string to help guidance
    float       cfg_scale = 1.f;     // How strong is guidance

    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

} llama_sampling_params;

// per-sequence sampler context
typedef struct llama_sampler_sequence_context {
    float mirostat_mu; // mirostat sampler state
    llama_grammar * grammar;
} llama_sampler_sequence_context;

// general sampler context
typedef struct llama_sampling_context {
    ~llama_sampling_context();

    // parameters that will be used for sampling and when creating
    // new llama_sampler_sequence_context instances
    llama_sampling_params params;

    // map of sequence ids to sampler contexts
    std::unordered_map<llama_seq_id, llama_sampler_sequence_context> sequence_contexts;

    // when non-NULL, new instances of llama_sampler_sequence_context
    // will get a copy of the grammar here
    // note: only the pointer is stored here, it is not a copy of
    // the grammar and shouldn't be freed
    llama_grammar * grammar;
} llama_sampling_context;

#include "common.h"

// Create a new sampling context instance.
llama_sampling_context llama_sampling_context_init(
        const struct gpt_params & params,
                  llama_grammar * grammar = NULL);

// Fetches the sampler context for the specified sequence id (defaults to 0).
// If the context for that sequence id doesn't already exist, it will be created with
// default values based on the parameters in the ctx_sampling argument.
llama_sampler_sequence_context & llama_sampling_get_sequence_context(
        llama_sampling_context & ctx_sampling,
        const llama_seq_id seq = 0);

// Reset the sampler context for the supplied sequence id (defaults to 0).
// This is necessary to reuse a sequence id or free memory used by sequences
// that are no longer required.
bool llama_sampling_context_reset(
        llama_sampling_context & ctx_sampling,
        const llama_seq_id seq = 0);

// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call
//       llama_sampling_context_reset when a sequence ends
//
// required:
//  - ctx:          context to use for sampling
//  - ctx_sampling: sampling-specific context
//
// optional:
//  - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
//  - last_tokens:  needed for repetition penalty, ignore if empty
//  - idx:          sample from llama_get_logits_ith(ctx, idx)
//  - seq:          sequence id to associate sampler state with
//
// returns:
//  - token:      sampled token
//  - candidates: vector of candidate tokens
//
llama_token llama_sampling_sample(
        struct llama_context * ctx,
        struct llama_context * ctx_guidance,
        struct llama_sampling_context & ctx_sampling,
        const std::vector<llama_token> & last_tokens,
        std::vector<llama_token_data> & candidates,
        const int idx = 0,
        llama_seq_id seq = 0);
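A minimal sketch of how the API declared above might be driven from a caller's generation loop. This is not code from this commit: `generate_n_tokens` and the elided `llama_decode` feedback step are hypothetical, and only the functions declared in `common/sampling.h` are used.

```cpp
// Hypothetical caller-side sketch: single-sequence generation using common/sampling.h.
#include "common.h"
#include "sampling.h"

static void generate_n_tokens(llama_context * ctx, const gpt_params & params, int n_gen) {
    // one sampling context shared across the whole generation
    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, /*grammar =*/ NULL);

    std::vector<llama_token>      last_tokens; // history consumed by the repetition penalty
    std::vector<llama_token_data> candidates;  // scratch buffer reused between calls

    for (int i = 0; i < n_gen; ++i) {
        // sample from the logits of the last decoded token (idx = 0, seq = 0 defaults)
        const llama_token id = llama_sampling_sample(ctx, /*ctx_guidance =*/ NULL,
                                                     ctx_sampling, last_tokens, candidates);

        last_tokens.push_back(id);

        // ... the caller would feed `id` back into llama_decode() here ...
    }

    // when reusing a sequence id for a new, unrelated generation, drop its sampler state
    llama_sampling_context_reset(ctx_sampling, 0);
}
```

With multiple sequences, the same `ctx_sampling` can be shared and a distinct `seq` id passed per stream; as the header notes, the caller must reset each sequence's context when that sequence ends.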
convert-bloom-hf-to-gguf.py
ADDED
@@ -0,0 +1,238 @@
#!/usr/bin/env python3
# HF bloom --> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import re
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


# Supported Models:
# https://huggingface.co/bigscience/bloom-1b7
# https://huggingface.co/bigscience/bloom-3b
# https://huggingface.co/bigscience/bloom-7b1
# https://huggingface.co/Langboat/bloom-1b4-zh
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
    parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
    parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model "+dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "BloomForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])
    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH=gguf.MODEL_ARCH.BLOOM
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layer"]

gguf_writer.add_name("Bloom")
n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
gguf_writer.add_embedding_length(n_embed)
gguf_writer.add_feed_forward_length(4 * n_embed)
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(n_head)
gguf_writer.add_head_count_kv(n_head)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)  # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
n_head_kv = hparams.get("n_head_kv", n_head)
head_dim = n_embed // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    has_lm_head = True
    if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
        has_lm_head = False

    for original_name in model_part.keys():
        data = model_part[original_name]
        name = re.sub(r'transformer\.', '', original_name)

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
            # Map bloom-style qkv_linear to gpt-style qkv_linear
            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
            qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
            data = np.concatenate(
                (qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
                 qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
                 qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
                axis=0
            )
            print("re-format attention.linear_qkv.weight")
        elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
            qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
            data = np.concatenate(
                (qkv_bias[:, 0, :].reshape((n_embed,)),
                 qkv_bias[:, 1, :].reshape((n_embed,)),
                 qkv_bias[:, 2, :].reshape((n_embed,))),
                axis=0
            )
            print("re-format attention.linear_qkv.bias")

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)

        if not has_lm_head and name == "word_embeddings.weight":
            gguf_writer.add_tensor("output.weight", data)
            print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))  # noqa


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
convert-mpt-hf-to-gguf.py
ADDED
@@ -0,0 +1,216 @@
#!/usr/bin/env python3
# HF mpt--> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf


def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model", type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()

args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

print("gguf: loading model "+dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "MPTForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit()

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH=gguf.MODEL_ARCH.MPT
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

block_count = hparams["n_layers"]

gguf_writer.add_name(dir_model.name)
gguf_writer.add_context_length(hparams["max_seq_len"])
gguf_writer.add_embedding_length(hparams["d_model"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
gguf_writer.add_head_count(hparams["n_heads"])
gguf_writer.add_layer_norm_eps(1e-05)
if hparams["attn_config"]["clip_qkv"] is not None:
    gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
# accomodate some "reserved" tokens; this is causing problems down the line in
# llama.cpp, so we pad the vocab with dummy tokens:

vocab_size = hparams["vocab_size"]

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)  # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
        if new_name is None:
            print("Cannot map tensor '" + name + "'")
            continue  # for the sake of compatibility with some old published models, don't quit
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))

        gguf_writer.add_tensor(new_name, data)

        # note: MPT output is tied to (same as) wte in original model;
        # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
        if new_name == "token_embd.weight":
            gguf_writer.add_tensor("output.weight", data)

print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
convert-refact-hf-to-gguf.py
ADDED
@@ -0,0 +1,263 @@
#!/usr/bin/env python3
# HF refact--> gguf conversion

from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path

import numpy as np
import torch
from transformers import AutoTokenizer  # type: ignore[import]

if "NO_LOCAL_GGUF" not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
import gguf

def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
            num_parts += 1

    if num_parts > 0:
        print("gguf: found " + str(num_parts) + " model parts")
    return num_parts


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a Refact model to a GGML compatible file"
    )
    parser.add_argument(
        "--vocab-only",
        action="store_true",
        help="extract only the vocab",
    )
    parser.add_argument(
        "--outfile",
        type=Path,
        help="path to write to; default: based on input",
    )
    parser.add_argument(
        "model",
        type=Path,
        help="directory containing model file, or model file itself (*.bin)",
    )
    parser.add_argument(
        "ftype",
        type=int,
        choices=[0, 1],
        default=1,
        nargs="?",
        help="output format - use 0 for float32, 1 for float16",
    )
    return parser.parse_args()


args = parse_args()

dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
    print(f"Error: {args.model} is not a directory", file=sys.stderr)
    sys.exit(1)

# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16

# map from ftype to string
ftype_str = ["f32", "f16"]

if args.outfile is not None:
    fname_out = args.outfile
else:
    # output in the same directory as the model by default
    fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"

print("gguf: loading model " + dir_model.name)

with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

if hparams["architectures"][0] != "GPTRefactForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

    sys.exit(1)

# get number of model parts
num_parts = count_model_parts(dir_model)

ARCH = gguf.MODEL_ARCH.REFACT
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

print("gguf: get model metadata")

# Get refact feed forward dimension
hidden_dim = hparams["n_embd"]
inner_dim = 4 * hidden_dim
hidden_dim = int(2 * inner_dim / 3)
multiple_of = 256
ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)

block_count = hparams["n_layer"]

gguf_writer.add_name("Refact")
# refact uses Alibi. So this is from config.json which might be used by training.
gguf_writer.add_context_length(hparams["n_positions"])
gguf_writer.add_embedding_length(hparams["n_embd"])

gguf_writer.add_feed_forward_length(ff_dim)
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"])
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)

# TOKENIZATION

print("gguf: get tokenizer metadata")

tokens: list[bytearray] = []
scores: list[float] = []
toktypes: list[int] = []

# gpt2 tokenizer
gguf_writer.add_tokenizer_model("gpt2")

print("gguf: get gpt2 tokenizer vocab")

# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

for i in range(vocab_size):
    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
    scores.append(0.0)  # dummy
    toktypes.append(gguf.TokenType.NORMAL)

gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)

# TENSORS

tensor_map = gguf.get_tensor_name_map(ARCH, block_count)

# params for qkv transform
n_head = hparams["n_head"]
n_head_kv = 1

head_dim = hparams["n_embd"] // n_head

# tensor info
print("gguf: get tensor metadata")

if num_parts == 0:
    part_names = iter(("pytorch_model.bin",))
else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )
for part_name in part_names:
    if args.vocab_only:
        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(dir_model / part_name, map_location="cpu")

    for i in range(block_count):
        if f"transformer.h.{i}.attn.kv.weight" in model_part:
            data = model_part[f"transformer.h.{i}.attn.kv.weight"]
            model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[
                : n_head_kv * head_dim
            ]
            model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[
                n_head_kv * head_dim :
            ]
            del model_part[f"transformer.h.{i}.attn.kv.weight"]
        if f"transformer.h.{i}.attn.q.weight" in model_part:
            model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[
                f"transformer.h.{i}.attn.q.weight"
            ]
            del model_part[f"transformer.h.{i}.attn.q.weight"]
        if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part:
            data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
            model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim]
            model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:]
            del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]

    for name in model_part.keys():
        data = model_part[name]

        old_dtype = data.dtype

        # convert any unsupported data types to float32
        if data.dtype != torch.float16 and data.dtype != torch.float32:
            data = data.to(torch.float32)

        data = data.squeeze().numpy()

        # map tensor names
        new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

        n_dims = len(data.shape)
        data_dtype = data.dtype

        # if f32 desired, convert any float16 to float32
        if ftype == 0 and data_dtype == np.float16:
            data = data.astype(np.float32)

        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
            data = data.astype(np.float32)

        # if f16 desired, convert any float32 2-dim weight tensors to float16
        if (
            ftype == 1
            and data_dtype == np.float32
            and name.endswith(".weight")
            and n_dims == 2
        ):
            data = data.astype(np.float16)

        print(
            new_name
            + ", n_dims = "
            + str(n_dims)
            + ", "
            + str(old_dtype)
            + " --> "
            + str(data.dtype)
        )

        gguf_writer.add_tensor(new_name, data)


print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
    print("gguf: write tensors")
    gguf_writer.write_tensors_to_file()

gguf_writer.close()

print(f"gguf: model successfully exported to '{fname_out}'")
print("")
examples/CMakeLists.txt
CHANGED
@@ -25,6 +25,7 @@ else()
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(simple)
     add_subdirectory(batched)
+    add_subdirectory(batched-bench)
     add_subdirectory(speculative)
     add_subdirectory(parallel)
     add_subdirectory(embd-input)
examples/batched-bench/CMakeLists.txt
ADDED
@@ -0,0 +1,5 @@
set(TARGET batched-bench)
add_executable(${TARGET} batched-bench.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/batched-bench/README.md
ADDED
@@ -0,0 +1,51 @@
# llama.cpp/example/batched-bench

Benchmark the batched decoding performance of `llama.cpp`

## Usage

There are 2 modes of operation:

- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)

```bash
./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>

# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99

# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99

# custom set of batches
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
```

## Sample results

- `PP` - prompt tokens per batch
- `TG` - generated tokens per batch
- `B` - number of batches
- `N_KV` - required KV cache size
- `T_PP` - prompt processing time (i.e. time to first token)
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
- `T_TG` - time to generate all batches
- `S_TG` - text generation speed (`(B*TG)/T_TG`)
- `T` - total time
- `S` - total speed (i.e. all tokens / total time)

| PP    | TG     | B    | N_KV   | T_PP s   | S_PP t/s | T_TG s   | S_TG t/s | T s      | S t/s    |
|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
|   128 |    128 |    1 |    256 |    0.108 |  1186.64 |    3.079 |    41.57 |    3.187 |    80.32 |
|   128 |    128 |    2 |    512 |    0.198 |  1295.19 |    5.029 |    50.90 |    5.227 |    97.95 |
|   128 |    128 |    4 |   1024 |    0.373 |  1373.96 |    6.878 |    74.44 |    7.251 |   141.23 |
|   128 |    128 |    8 |   2048 |    0.751 |  1363.27 |    7.344 |   139.43 |    8.095 |   252.99 |
|   128 |    128 |   16 |   4096 |    1.570 |  1304.68 |    8.455 |   242.23 |   10.024 |   408.60 |
|   128 |    128 |   32 |   8192 |    3.408 |  1201.73 |    8.801 |   465.40 |   12.209 |   670.96 |
|   128 |    256 |    1 |    384 |    0.107 |  1196.70 |    6.329 |    40.45 |    6.436 |    59.67 |
|   128 |    256 |    2 |    768 |    0.194 |  1317.45 |   10.239 |    50.00 |   10.433 |    73.61 |
|   128 |    256 |    4 |   1536 |    0.366 |  1399.03 |   13.960 |    73.35 |   14.326 |   107.22 |
|   128 |    256 |    8 |   3072 |    0.751 |  1363.92 |   15.110 |   135.54 |   15.861 |   193.69 |
|   128 |    256 |   16 |   6144 |    1.569 |  1304.93 |   18.073 |   226.64 |   19.642 |   312.80 |
|   128 |    256 |   32 |  12288 |    3.409 |  1201.35 |   19.223 |   426.15 |   22.633 |   542.93 |
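As a quick sanity check of the `N_KV` column above, here is a small stand-alone sketch (not part of this commit; the `required_kv` helper is hypothetical) that reproduces the two KV-cache sizing formulas from the README; the non-shared value matches, for example, the `PP=128, TG=128, B=8` row (`N_KV = 2048`).

```cpp
// Hypothetical helper: compute the KV cache size required by the benchmark,
// mirroring the formulas N_KV = B*(PP + TG) and N_KV = PP + B*TG from the README.
#include <cstdio>

static int required_kv(int pp, int tg, int b, bool pp_shared) {
    // shared prompt: one copy of the prompt plus B generated tails
    // separate prompts: B copies of (prompt + tail)
    return pp_shared ? pp + b * tg : b * (pp + tg);
}

int main() {
    std::printf("not shared: %d\n", required_kv(128, 128, 8, false)); // 8*(128+128) = 2048
    std::printf("shared:     %d\n", required_kv(128, 128, 8, true));  // 128 + 8*128 = 1152
    return 0;
}
```

A configuration is skipped by the benchmark whenever this value exceeds `N_KV_MAX`, which is exactly the `n_ctx_req > n_kv_max` check in `batched-bench.cpp` below.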
examples/batched-bench/batched-bench.cpp
ADDED
@@ -0,0 +1,251 @@
#include "common.h"
#include "llama.h"

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <string>
#include <vector>

// mutates the input string
static std::vector<int> parse_list(char * p) {
    std::vector<int> ret;

    char * q = p;

    while (*p) {
        if (*p == ',') {
            *p = '\0';
            ret.push_back(std::atoi(q));
            q = p + 1;
        }

        ++p;
    }

    ret.push_back(std::atoi(q));

    return ret;
}

int main(int argc, char ** argv) {
    gpt_params params;

    if (argc == 1 || argv[1][0] == '-') {
        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
        printf("  example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
        return 1 ;
    }

    int n_kv_max     = 2048;
    int is_pp_shared = 0;
    int n_gpu_layers = 0;
    int mmq          = 0;

    std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
    std::vector<int> n_tg = { 128, 256, };
    std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
    //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };

    if (argc >= 2) {
        params.model = argv[1];
    }

    if (argc >= 3) {
        n_kv_max = std::atoi(argv[2]);
    }

    if (argc >= 4) {
        is_pp_shared = std::atoi(argv[3]);
    }

    if (argc >= 5) {
        n_gpu_layers = std::atoi(argv[4]);
    }

    if (argc >= 6) {
        mmq = std::atoi(argv[5]);
    }

    if (argc >= 7) {
        n_pp = parse_list(argv[6]);
    }

    if (argc >= 8) {
        n_tg = parse_list(argv[7]);
    }

    if (argc >= 9) {
        n_pl = parse_list(argv[8]);
    }

    // init LLM

    llama_backend_init(params.numa);

    // initialize the model

    llama_model_params model_params = llama_model_default_params();

    model_params.n_gpu_layers = n_gpu_layers;

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

    llama_context_params ctx_params = llama_context_default_params();

    ctx_params.seed      = 1234;
    ctx_params.n_ctx     = n_kv_max;
    ctx_params.n_batch   = 512;
    ctx_params.mul_mat_q = mmq;

    ctx_params.n_threads       = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

    llama_batch batch = llama_batch_init(n_kv_max, 0);

    // decode in batches of ctx_params.n_batch tokens
    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));

            llama_batch batch_view = {
                n_tokens,
                batch.token  + i,
                nullptr,
                batch.pos    + i,
                batch.seq_id + i,
                batch.logits + i,
                0, 0, 0, // unused
            };

            const int ret = llama_decode(ctx, batch_view);
            if (ret != 0) {
                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
                return false;
            }
        }

        return true;
    };

    // warm up
    {
        batch.n_tokens = 16;

        for (int i = 0; i < batch.n_tokens; ++i) {
            batch.token[i]  = 0;
            batch.pos[i]    = i;
            batch.seq_id[i] = 0;
            batch.logits[i] = false;
        }

        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
            LOG_TEE("%s: llama_decode() failed\n", __func__);
            return 1;
        }
    }

    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");

    for (    int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
        for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
            for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
                const int pp = n_pp[i_pp];
                const int tg = n_tg[i_tg];
                const int pl = n_pl[i_pl];

                const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);

                if (n_ctx_req > n_kv_max) {
                    continue;
                }

                batch.n_tokens = is_pp_shared ? pp : pl*pp;

                for (int i = 0; i < batch.n_tokens; ++i) {
                    batch.token[i]  = 0;
                    batch.pos[i]    = i;
                    batch.seq_id[i] = 0;
                    batch.logits[i] = false;
                }
                batch.logits[batch.n_tokens - 1] = true;

                const auto t_pp_start = ggml_time_us();

                llama_kv_cache_tokens_rm(ctx, -1, -1);

                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                    LOG_TEE("%s: llama_decode() failed\n", __func__);
                    return 1;
                }

                if (is_pp_shared) {
                    for (int32_t i = 1; i < pl; ++i) {
                        llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
                    }
                }

                const auto t_pp_end = ggml_time_us();

                const auto t_tg_start = ggml_time_us();

                for (int i = 0; i < tg; ++i) {
                    batch.n_tokens = pl;

                    for (int j = 0; j < pl; ++j) {
                        batch.token[j]  = 0;
                        batch.pos[j]    = pp + i;
                        batch.seq_id[j] = j;
                        batch.logits[j] = true;
                    }

                    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                        LOG_TEE("%s: llama_decode() failed\n", __func__);
                        return 1;
                    }
                }

                const auto t_tg_end = ggml_time_us();

                const int32_t n_kv = n_ctx_req;

                const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
                const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
                const float t    = t_pp + t_tg;

                const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
                const float speed_tg = pl*tg / t_tg;
                const float speed    = n_kv / t;

                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
            }
        }
    }

    llama_print_timings(ctx);

    llama_batch_free(batch);

    llama_free(ctx);
    llama_free_model(model);

    llama_backend_free();

    fprintf(stderr, "\n\n");

    return 0;
}
examples/batched.swift/.gitignore
ADDED
@@ -0,0 +1,9 @@
.DS_Store
/.build
/Packages
xcuserdata/
DerivedData/
.swiftpm/configuration/registries.json
.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
.netrc
batched_swift
examples/batched.swift/Makefile
ADDED
@@ -0,0 +1,6 @@
.PHONY: build

build:
	xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
	rm -f ./batched_swift
	ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
examples/batched.swift/Package.swift
ADDED
@@ -0,0 +1,22 @@
// swift-tools-version: 5.5
// The swift-tools-version declares the minimum version of Swift required to build this package.

import PackageDescription

let package = Package(
    name: "batched_swift",
    platforms: [.macOS(.v12)],
    dependencies: [
        .package(name: "llama", path: "../../"),
    ],
    targets: [
        // Targets are the basic building blocks of a package, defining a module or a test suite.
        // Targets can depend on other targets in this package and products from dependencies.
        .executableTarget(
            name: "batched_swift",
            dependencies: ["llama"],
            path: "Sources",
            linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
        ),
    ]
)
examples/batched.swift/README.md
ADDED
@@ -0,0 +1,4 @@
This is a swift clone of `examples/batched`.

$ `make`
$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
examples/batched.swift/Sources/main.swift
ADDED
@@ -0,0 +1,255 @@
import Foundation
import llama

let arguments = CommandLine.arguments

// Check that we have at least one argument (the model path)
guard arguments.count > 1 else {
    print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
    exit(1)
}

let modelPath: String = arguments[1]
let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1

// total length of the sequences including the prompt
let n_len: Int = 32

// init LLM
llama_backend_init(false)
defer {
    llama_backend_free()
}

let model_params = llama_model_default_params()
guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
    print("Failed to load model")
    exit(1)
}

defer {
    llama_free_model(model)
}

var tokens = tokenize(text: prompt, add_bos: true)

let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)

var context_params = llama_context_default_params()
context_params.seed = 1234
context_params.n_ctx = n_kv_req
context_params.n_batch = UInt32(max(n_len, n_parallel))
context_params.n_threads = 8
context_params.n_threads_batch = 8

let context = llama_new_context_with_model(model, context_params)
guard context != nil else {
    print("Failed to initialize context")
    exit(1)
}

defer {
    llama_free(context)
}

let n_ctx = llama_n_ctx(context)

print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")

if n_kv_req > n_ctx {
    print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
    exit(1)
}

var buffer: [CChar] = []
for id: llama_token in tokens {
    print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
}

print("\n")

var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0)
defer {
    llama_batch_free(batch)
}

// evaluate the initial prompt
batch.n_tokens = Int32(tokens.count)

for (i, token) in tokens.enumerated() {
    batch.token[i] = token
    batch.pos[i] = Int32(i)
    batch.seq_id[i] = 0
    batch.logits[i] = 0
}

// llama_decode will output logits only for the last token of the prompt
batch.logits[Int(batch.n_tokens) - 1] = 1

if llama_decode(context, batch) != 0 {
    print("llama_decode() failed")
    exit(1)
}

for i in 1 ..< n_parallel {
    llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
}

if n_parallel > 1 {
    print("generating \(n_parallel) sequences ...\n")
}

var streams: [String] = .init(repeating: "", count: n_parallel)
var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)

var n_cur = batch.n_tokens
var n_decode = 0

let t_main_start = ggml_time_us()

while n_cur <= n_len {
    // prepare the next batch
    batch.n_tokens = 0

    // sample the next token for each parallel sequence / stream
    for i in 0 ..< n_parallel {
        if i_batch[i] < 0 {
            // the stream has already finished
            continue
        }

        var n_vocab = llama_n_vocab(model)
        var logits = llama_get_logits_ith(context, i_batch[i])

        var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))

        for token_id in 0 ..< n_vocab {
            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
        }

        var candidates_p: llama_token_data_array = .init(
            data: &candidates,
            size: candidates.count,
            sorted: false
        )

        let top_k: Int32 = 40
        let top_p: Float = 0.9
        let temp: Float = 0.4

        llama_sample_top_k(context, &candidates_p, top_k, 1)
        llama_sample_top_p(context, &candidates_p, top_p, 1)
        llama_sample_temp(context, &candidates_p, temp)

        let new_token_id = llama_sample_token(context, &candidates_p)

        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

        // is it an end of stream? -> mark the stream as finished
        if new_token_id == llama_token_eos(context) || n_cur == n_len {
            i_batch[i] = -1
            // print("")
            if n_parallel > 1 {
                print("stream \(i) finished at n_cur = \(n_cur)")
            }

            continue
        }

        let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""

        // if there is only one stream, we print immediately to stdout
        if n_parallel == 1 {
            print(nextStringPiece, terminator: "")
        }
        streams[i] += nextStringPiece

        // push this new token for next evaluation
        batch.token[Int(batch.n_tokens)] = new_token_id
        batch.pos[Int(batch.n_tokens)] = n_cur
        batch.seq_id[Int(batch.n_tokens)] = Int32(i)
        batch.logits[Int(batch.n_tokens)] = 1

        i_batch[i] = batch.n_tokens

        batch.n_tokens += 1

        n_decode += 1
    }

    // all streams are finished
    if batch.n_tokens == 0 {
        break
    }

    n_cur += 1

    // evaluate the current batch with the transformer model
    if llama_decode(context, batch) != 0 {
        print("llama_decode() failed")
        exit(1)
    }
}

if n_parallel > 1 {
    print("\n")
    for (i, stream) in streams.enumerated() {
        print("sequence \(i):\n\n\(prompt)\(stream)\n")
    }
}

let t_main_end = ggml_time_us()

print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")

llama_print_timings(context)

private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
    let n_tokens = text.count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
    }
    tokens.deallocate()
    return swiftTokens
}

private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
    var result = [CChar](repeating: 0, count: 8)
    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
    if nTokens < 0 {
        if result.count >= -Int(nTokens) {
            result.removeLast(-Int(nTokens))
        } else {
            result.removeAll()
        }
        let check = llama_token_to_piece(
            model,
            token,
            &result,
            Int32(result.count)
        )
        assert(check == nTokens)
    } else {
        result.removeLast(result.count - Int(nTokens))
    }
    if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
        return utfString
    } else {
        buffer.append(contentsOf: result)
        let data = Data(buffer.map { UInt8(bitPattern: $0) })
        if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
            buffer = []
        }
        guard let bufferString = String(data: data, encoding: .utf8) else {
            return nil
        }
        buffer = []
        return bufferString
    }
    return nil
}
examples/batched/batched.cpp
CHANGED
@@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
     ctx_params.seed    = 1234;
     ctx_params.n_ctx   = n_kv_req;
     ctx_params.n_batch = std::max(n_len, n_parallel);
-    ctx_params.n_threads
+    ctx_params.n_threads       = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

     llama_context * ctx = llama_new_context_with_model(model, ctx_params);
examples/embd-input/embd-input-lib.cpp
CHANGED
@@ -128,21 +128,22 @@ bool eval_string(struct MyModel * mymodel,const char* str){
 llama_token sampling_id(struct MyModel* mymodel) {
     llama_context* ctx = mymodel->ctx;
     gpt_params params = mymodel->params;
+    llama_sampling_params & sparams = params.sampling_params;
     // int n_ctx = llama_n_ctx(ctx);

     // out of user input, sample next token
-    const float   temp      =
-    const int32_t top_k     =
-    const float   top_p     =
-    const float   tfs_z     =
-    const float   typical_p =
+    const float   temp      = sparams.temp;
+    const int32_t top_k     = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
+    const float   top_p     = sparams.top_p;
+    const float   tfs_z     = sparams.tfs_z;
+    const float   typical_p = sparams.typical_p;
     // const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
     // const float   repeat_penalty  = params.repeat_penalty;
     // const float   alpha_presence  = params.presence_penalty;
     // const float   alpha_frequency = params.frequency_penalty;
-    const int     mirostat     =
-    const float   mirostat_tau =
-    const float   mirostat_eta =
+    const int     mirostat     = sparams.mirostat;
+    const float   mirostat_tau = sparams.mirostat_tau;
+    const float   mirostat_eta = sparams.mirostat_eta;
     // const bool    penalize_nl    = params.penalize_nl;

     llama_token id = 0;
@@ -151,7 +152,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
     auto n_vocab = llama_n_vocab(llama_get_model(ctx));

     // Apply params.logit_bias map
-    for (auto it =
+    for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
         logits[it->first] += it->second;
     }

examples/infill/infill.cpp
ADDED
@@ -0,0 +1,800 @@
#include "common.h"

#include "console.h"
#include "llama.h"
#include "build-info.h"
#include "grammar-parser.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

static llama_context           ** g_ctx;
static llama_model             ** g_model;
static gpt_params               * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream       * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;


static void write_logfile(
    const llama_context * ctx, const gpt_params & params, const llama_model * model,
    const std::vector<llama_token> & input_tokens, const std::string & output,
    const std::vector<llama_token> & output_tokens
) {
    if (params.logdir.empty()) {
        return;
    }

    const std::string timestamp = get_sortable_timestamp();

    const bool success = create_directory_with_parents(params.logdir);
    if (!success) {
        fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
            __func__, params.logdir.c_str());
        return;
    }

    const std::string logfile_path = params.logdir + timestamp + ".yml";
    FILE * logfile = fopen(logfile_path.c_str(), "w");

    if (logfile == NULL) {
        fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
        return;
    }

    fprintf(logfile, "binary: infill\n");
    char model_desc[128];
    llama_model_desc(model, model_desc, sizeof(model_desc));
    dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);

    fprintf(logfile, "\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "# Generation Results #\n");
    fprintf(logfile, "######################\n");
    fprintf(logfile, "\n");

    dump_string_yaml_multiline(logfile, "output", output.c_str());
    dump_vector_int_yaml(logfile, "output_tokens", output_tokens);

    llama_dump_timing_info_yaml(logfile, ctx);
    fclose(logfile);
}

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
static void sigint_handler(int signo) {
    if (signo == SIGINT) {
        if (!is_interacting) {
            is_interacting = true;
        } else {
            console::cleanup();
            printf("\n");
            llama_print_timings(*g_ctx);
            write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
            _exit(130);
        }
    }
}
#endif

int main(int argc, char ** argv) {
    gpt_params params;
    llama_sampling_params & sparams = params.sampling_params;
    g_params = &params;

    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

#ifndef LOG_DISABLE_LOGS
    log_set_target(log_filename_generator("infill", "log"));
    LOG_TEE("Log start\n");
    log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS

    console::init(params.simple_io, params.use_color);
    atexit([]() { console::cleanup(); });

    if (params.logits_all) {
        printf("\n************\n");
        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
        printf("************\n\n");

        return 0;
    }

    if (params.embedding) {
        printf("\n************\n");
        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
        printf("************\n\n");

        return 0;
    }

    if (params.n_ctx != 0 && params.n_ctx < 8) {
        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
        params.n_ctx = 8;
    }
    if (params.instruct) {
        printf("\n************\n");
        printf("%s: please use the 'main' tool for instruct mode\n", __func__);
        printf("************\n\n");

        return 0;
    }
    if (!params.antiprompt.empty()) {
        printf("\n************\n");
        printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
        printf("************\n\n");

        return 0;
    }
    if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
        printf("\n************\n");
        printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
        printf("************\n\n");

        return 0;
    }
    if (params.random_prompt) {
        printf("\n************\n");
        printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
        printf("************\n\n");

        return 0;
    }
    if (!params.path_prompt_cache.empty()) {
        printf("\n************\n");
        printf("%s: infill does not support prompt caching\n", __func__);
        printf("************\n\n");

        return 0;
    }

    if (params.rope_freq_base != 0.0) {
        LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
    }

    if (params.rope_freq_scale != 0.0) {
        LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale);
    }

    LOG_TEE("%s: build = %d (%s)\n",      __func__, BUILD_NUMBER, BUILD_COMMIT);
    LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);

    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }

    LOG_TEE("%s: seed  = %u\n", __func__, params.seed);

    std::mt19937 rng(params.seed);

    LOG("%s: llama backend init\n", __func__);
    llama_backend_init(params.numa);

    llama_model * model;
    llama_context * ctx;
    llama_context * ctx_guidance = NULL;
    g_model = &model;
    g_ctx = &ctx;

    // load the model and apply lora adapter, if any
    LOG("%s: load the model and apply lora adapter, if any\n", __func__);
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (sparams.cfg_scale > 1.f) {
        struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
        ctx_guidance = llama_new_context_with_model(model, lparams);
    }

    if (model == NULL) {
        LOG_TEE("%s: error: unable to load model\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_n_ctx_train(model);
    const int n_ctx = llama_n_ctx(ctx);
    LOG("n_ctx: %d\n", n_ctx);

    if (n_ctx > n_ctx_train) {
        LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, n_ctx);
    }

    // print system information
    {
        LOG_TEE("\n");
        LOG_TEE("%s\n", get_system_info(params).c_str());
    }
    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
    LOG("add_bos: %d\n", add_bos);

    bool suff_rm_leading_spc = params.escape;
    if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
        params.input_suffix.erase(0, 1);
        suff_rm_leading_spc = false;
    }
    std::vector<llama_token> embd_inp;
    std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
    std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
    const int space_token = 29871;
    if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
        inp_sfx.erase(inp_sfx.begin());
    }
    inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
    if (add_bos) {
        inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
    }
    inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
    embd_inp = inp_pfx;
    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
    embd_inp.push_back(llama_token_middle(ctx));

    LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
    LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
    LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));

    // Should not run without any tokens
    if (embd_inp.empty()) {
        embd_inp.push_back(llama_token_bos(ctx));
        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
    }

    // Tokenize negative prompt
    std::vector<llama_token> guidance_inp;
    int guidance_offset = 0;
    int original_prompt_len = 0;
    if (ctx_guidance) {
        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));

        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));

        original_prompt_len = original_inp.size();
        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
        LOG("guidance_offset:     %s", log_tostr(guidance_offset));
    }

    if ((int) embd_inp.size() > n_ctx - 4) {
        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
        return 1;
    }

    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
        params.n_keep = (int)embd_inp.size();
    }

    LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
    LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));


    // enable interactive mode if interactive start is specified
    if (params.interactive_first) {
        params.interactive = true;
    }

    if (params.verbose_prompt) {
        LOG_TEE("\n");
        LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
        }

        if (ctx_guidance) {
            LOG_TEE("\n");
            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
            for (int i = 0; i < (int) guidance_inp.size(); i++) {
                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
            }
        }

        if (params.n_keep > 0) {
            LOG_TEE("%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
                LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
            }
            LOG_TEE("'\n");
        }
        LOG_TEE("\n");
    }

    if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;
        sigint_action.sa_handler = sigint_handler;
        sigemptyset (&sigint_action.sa_mask);
        sigint_action.sa_flags = 0;
        sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
        };
        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif

        LOG_TEE("%s: interactive mode on.\n", __func__);

        if (params.input_prefix_bos) {
            LOG_TEE("Input prefix with BOS\n");
        }

        if (!params.input_prefix.empty()) {
            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
        }

        if (!params.input_suffix.empty()) {
            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
        }
    }
    LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
            sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");

    struct llama_grammar * grammar = NULL;
    grammar_parser::parse_state parsed_grammar;

    if (!params.grammar.empty()) {
        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
        // will be empty (default) if there are parse errors
        if (parsed_grammar.rules.empty()) {
            return 1;
        }
        LOG_TEE("%s: grammar:\n", __func__);
        grammar_parser::print_grammar(stderr, parsed_grammar);
        LOG_TEE("\n");

        {
            auto it = sparams.logit_bias.find(llama_token_eos(ctx));
            if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
                LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
            }
        }

        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
        grammar = llama_grammar_init(
            grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
    }

    // TODO: replace with ring-buffer
    std::vector<llama_token> last_tokens(n_ctx);
    std::fill(last_tokens.begin(), last_tokens.end(), 0);
    LOG_TEE("\n#####  Infill mode  #####\n\n");
    if (params.infill) {
        printf("\n************\n");
        printf("no need to specify '--infill', always running infill\n");
        printf("************\n\n");
    }
    if (params.interactive) {
        const char *control_message;
        if (params.multiline_input) {
            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
                              " - To return control without starting a new line, end your input with '/'.\n";
        } else {
            control_message = " - Press Return to return control to LLaMa.\n"
                              " - To return control without starting a new line, end your input with '/'.\n"
                              " - If you want to submit another line, end your input with '\\'.\n";
        }
        LOG_TEE("== Running in interactive mode. ==\n");
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
        LOG_TEE(" - Press Ctrl+C to interject at any time.\n");
#endif
        LOG_TEE("%s\n", control_message);

        is_interacting = params.interactive_first;
    }

    bool input_echo = true;

    int n_past          = 0;
    int n_remain        = params.n_predict;
    int n_consumed      = 0;
    int n_past_guidance = 0;

    std::vector<int>   input_tokens;  g_input_tokens  = &input_tokens;
    std::vector<int>   output_tokens; g_output_tokens = &output_tokens;
    std::ostringstream output_ss;     g_output_ss     = &output_ss;

    // the first thing we will do is to output the prompt, so set color accordingly
    console::set_display(console::prompt);

    std::vector<llama_token> embd;
    std::vector<llama_token> embd_guidance;

    const int n_vocab = llama_n_vocab(model);

    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);

    while (n_remain != 0 || params.interactive) {
        // predict
        if (!embd.empty()) {
            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
            // --prompt or --file which uses the same value.
            int max_embd_size = n_ctx - 4;

            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
            if ((int) embd.size() > max_embd_size) {
                const int skipped_tokens = (int) embd.size() - max_embd_size;
                embd.resize(max_embd_size);

                console::set_display(console::error);
                printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
                console::set_display(console::reset);
                fflush(stdout);
            }

            // infinite text generation via context swapping
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
                if (params.n_predict == -2) {
                    LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                    break;
                }

                const int n_left    = n_past - params.n_keep - 1;
                const int n_discard = n_left/2;

                LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                    n_past, n_left, n_ctx, params.n_keep, n_discard);

                llama_kv_cache_seq_rm   (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
                llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);

                n_past -= n_discard;

                if (ctx_guidance) {
                    n_past_guidance -= n_discard;
                }

                LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);

                LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));

            }

            // evaluate tokens in batches
            // embd is typically prepared beforehand to fit within a batch, but not always

            if (ctx_guidance) {
                int input_size = 0;
                llama_token * input_buf = NULL;

                if (n_past_guidance < (int) guidance_inp.size()) {
                    // Guidance context should have the same data with these modifications:
                    //
                    // * Replace the initial prompt
                    // * Shift everything by guidance_offset
                    embd_guidance = guidance_inp;
                    if (embd.begin() + original_prompt_len < embd.end()) {
                        embd_guidance.insert(
                            embd_guidance.end(),
                            embd.begin() + original_prompt_len,
                            embd.end()
                        );
                    }

                    input_buf  = embd_guidance.data();
                    input_size = embd_guidance.size();

                    LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
                } else {
                    input_buf  = embd.data();
                    input_size = embd.size();
                }

                for (int i = 0; i < input_size; i += params.n_batch) {
                    int n_eval = std::min(input_size - i, params.n_batch);
                    if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
                        LOG_TEE("%s : failed to eval\n", __func__);
                        return 1;
                    }

                    n_past_guidance += n_eval;
                }
            }

            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
                int n_eval = (int) embd.size() - i;
                if (n_eval > params.n_batch) {
                    n_eval = params.n_batch;
                }

                LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));

                if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) {
                    LOG_TEE("%s : failed to eval\n", __func__);
                    return 1;
                }

                n_past += n_eval;

                LOG("n_past = %d\n", n_past);
            }

        }

        embd.clear();
        embd_guidance.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {

            const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);

            last_tokens.erase(last_tokens.begin());
            last_tokens.push_back(id);

            LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));

            embd.push_back(id);

            // echo this to console
            input_echo = true;

            // decrement remaining sampling budget
            --n_remain;

            LOG("n_remain: %d\n", n_remain);
        } else {
            // some user input remains from prompt or interaction, forward it to processing
            LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
            while ((int) embd_inp.size() > n_consumed) {
                embd.push_back(embd_inp[n_consumed]);
                last_tokens.erase(last_tokens.begin());
                last_tokens.push_back(embd_inp[n_consumed]);
                ++n_consumed;
                if ((int) embd.size() >= params.n_batch) {
                    break;
                }
            }
        }

        // display text
        if (input_echo) {
            for (auto id : embd) {
                const std::string token_str = llama_token_to_piece(ctx, id);
                printf("%s", token_str.c_str());

                if (embd.size() > 1) {
                    input_tokens.push_back(id);
                } else {
                    output_tokens.push_back(id);
                    output_ss << token_str;
                }
            }
            fflush(stdout);
        }
        // reset color to default if we there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
            console::set_display(console::reset);
        }

        // if not currently processing queued inputs;
        if ((int) embd_inp.size() <= n_consumed) {

            // deal with eot token in infill mode
            if ((last_tokens.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
                if(is_interacting && !params.interactive_first) {
                    // print an eot token
                    printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
                }
                fflush(stdout);
                printf("\n");
                console::set_display(console::user_input);
                std::string buffer;
                std::string line;
                bool another_line=true;
                // set a new prefix via stdin
                do {
                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);
                // check if we got an empty line, if so we use the old input
                if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_prefix = buffer;
                }
                buffer.clear();
                // set a new suffix via stdin
                do {
                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);
                // check if we got an empty line
                if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
                    params.input_suffix = buffer;
                }
                buffer.clear();
                // done taking input, reset color
                console::set_display(console::reset);

                if (params.escape) {
                    //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
                    process_escapes(params.input_prefix);
                    process_escapes(params.input_suffix);
                }
                suff_rm_leading_spc = params.escape;
                if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
                    params.input_suffix.erase(0, 1);
                    suff_rm_leading_spc = false;
                }
                // tokenize new prefix and suffix
                std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
                std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
                if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
                    inp_sfx.erase(inp_sfx.begin());
                }
                inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
                if (add_bos) {
                    inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
                }
                inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
                embd_inp = inp_pfx;
                embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                embd_inp.push_back(llama_token_middle(ctx));
                embd.clear();
                embd_guidance.clear();
                n_remain = params.n_predict;
                n_past = 0;
                n_consumed = 0;
                // LOG_TEE("took new input\n");
                is_interacting = false;
            }
            // deal with end of text token in interactive mode
            else if (last_tokens.back() == llama_token_eos(ctx)) {
                LOG("found EOS token\n");

                if (params.interactive) {

                    is_interacting = true;
                    printf("\n");
                    console::set_display(console::user_input);
                    fflush(stdout);
                }
            }

            if (n_past > 0 && is_interacting && !params.interactive) {
                LOG("waiting for user input\n");

                if (params.input_prefix_bos) {
                    LOG("adding input prefix BOS token\n");
                    embd_inp.push_back(llama_token_bos(ctx));
                }

                std::string buffer;
                if (!params.input_prefix.empty()) {
                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                    buffer += params.input_prefix;
                    printf("%s", buffer.c_str());
                }

                std::string line;
                bool another_line = true;
                do {
                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);

                // done taking input, reset color
                console::set_display(console::reset);

                // Add tokens to embd only if the input buffer is non-empty
                // Entering a empty line lets the user pass control back
                if (buffer.length() > 1) {
                    // append input suffix if any
                    if (!params.input_suffix.empty()) {
                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                        buffer += params.input_suffix;
                        printf("%s", params.input_suffix.c_str());
                    }

                    LOG("buffer: '%s'\n", buffer.c_str());

                    const size_t original_size = embd_inp.size();

                    const auto line_inp = ::llama_tokenize(ctx, buffer, false);
                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));

                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

                    for (size_t i = original_size; i < embd_inp.size(); ++i) {
                        const llama_token token = embd_inp[i];
                        output_tokens.push_back(token);
                        output_ss << llama_token_to_piece(ctx, token);
                    }

                    n_remain -= line_inp.size();
                    LOG("n_remain: %d\n", n_remain);
                } else {
                    LOG("empty line, passing control back\n");
                }

                input_echo = false; // do not echo this again
            }

            if (n_past > 0) {
                if (is_interacting) {
                    // reset grammar state if we're restarting generation
                    if (grammar != NULL) {
                        llama_grammar_free(grammar);

                        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
                        grammar = llama_grammar_init(
                            grammar_rules.data(), grammar_rules.size(),
                            parsed_grammar.symbol_ids.at("root"));
                    }
                }
                is_interacting = false;
            }
        }

        // end of text token
        if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) {
            break;
        }

        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
        // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size).
        if (params.interactive && n_remain <= 0 && params.n_predict >= 0) {
            n_remain = params.n_predict;
            is_interacting = true;
        }
    }
    if (!params.interactive && n_remain <= 0) {
        printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
        fflush(stdout);
    }

    llama_print_timings(ctx);
    write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);

    if (ctx_guidance) { llama_free(ctx_guidance); }
    llama_free(ctx);
    llama_free_model(model);

    if (grammar != NULL) {
        llama_grammar_free(grammar);
    }
    llama_backend_free();

#ifndef LOG_DISABLE_LOGS
    LOG_TEE("Log end\n");
#endif // LOG_DISABLE_LOGS

    return 0;
}
examples/main/main.cpp
CHANGED
@@ -109,6 +109,7 @@ int main(int argc, char ** argv) {
     if (!gpt_params_parse(argc, argv, params)) {
         return 1;
     }
+    llama_sampling_params & sparams = params.sampling_params;

 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("main", "log"));
@@ -179,7 +180,7 @@
     // load the model and apply lora adapter, if any
     LOG("%s: load the model and apply lora adapter, if any\n", __func__);
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (
+    if (sparams.cfg_scale > 1.f) {
         struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
         ctx_guidance = llama_new_context_with_model(model, lparams);
     }
@@ -257,9 +258,9 @@
     int guidance_offset = 0;
     int original_prompt_len = 0;
     if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(
+        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));

-        guidance_inp = ::llama_tokenize(ctx_guidance,
+        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos);
         LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));

         std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
@@ -296,6 +297,9 @@
             LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
                 __func__, n_matching_session_tokens, embd_inp.size());
         }
+
+        // remove any "future" tokens that we might have inherited from the previous session
+        llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
     }

     LOGLN(
@@ -343,7 +347,7 @@

     if (ctx_guidance) {
         LOG_TEE("\n");
-        LOG_TEE("%s: negative prompt: '%s'\n", __func__,
+        LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
         LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
         for (int i = 0; i < (int) guidance_inp.size(); i++) {
             LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
@@ -395,7 +399,7 @@
         }
     }
     LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
-
+            sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     LOG_TEE("\n\n");

@@ -413,8 +417,8 @@
         LOG_TEE("\n");

         {
-            auto it =
-            if (it !=
+            auto it = sparams.logit_bias.find(llama_token_eos(ctx));
+            if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
                 LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
             }
         }
@@ -469,6 +473,7 @@

     const int n_vocab = llama_n_vocab(model);

+    llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar);
     std::vector<llama_token_data> candidates;
     candidates.reserve(n_vocab);

@@ -622,7 +627,7 @@
                 LOG("saved session to %s\n", path_session.c_str());
             }

-            const llama_token id =
+            const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates);

             last_tokens.erase(last_tokens.begin());
             last_tokens.push_back(id);
@@ -667,7 +672,7 @@
             }
             fflush(stdout);
         }
-        // reset color to default if
+        // reset color to default if there is no pending user input
         if (input_echo && (int) embd_inp.size() == n_consumed) {
             console::set_display(console::reset);
         }
@@ -694,10 +699,8 @@
                 if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
                     if (params.interactive) {
                         is_interacting = true;
-                        console::set_display(console::user_input);
                     }
                     is_antiprompt = true;
-                    fflush(stdout);
                     break;
                 }
             }
@@ -721,8 +724,6 @@

                 is_interacting = true;
                 printf("\n");
-                console::set_display(console::user_input);
-                fflush(stdout);
             } else if (params.instruct) {
                 is_interacting = true;
             }
@@ -747,6 +748,9 @@
                 printf("%s", buffer.c_str());
             }

+            // color user input only
+            console::set_display(console::user_input);
+
             std::string line;
             bool another_line = true;
             do {
examples/parallel/parallel.cpp
CHANGED
@@ -10,6 +10,7 @@
|
|
10 |
#include <cstdio>
|
11 |
#include <string>
|
12 |
#include <vector>
|
|
|
13 |
|
14 |
// trim whitespace from the beginning and end of a string
|
15 |
static std::string trim(const std::string & str) {
|
@@ -70,6 +71,26 @@ struct client {
|
|
70 |
std::vector<llama_token> tokens_prev;
|
71 |
};
|
72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
int main(int argc, char ** argv) {
|
74 |
srand(1234);
|
75 |
|
@@ -104,6 +125,25 @@ int main(int argc, char ** argv) {
|
|
104 |
params.logits_all = true;
|
105 |
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
106 |
|
|
|
|
|
107 |
fprintf(stderr, "\n\n");
|
108 |
fflush(stderr);
|
109 |
|
@@ -129,7 +169,7 @@ int main(int argc, char ** argv) {
|
|
129 |
|
130 |
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
|
131 |
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
|
132 |
-
llama_batch batch = llama_batch_init(
|
133 |
|
134 |
int32_t n_total_prompt = 0;
|
135 |
int32_t n_total_gen = 0;
|
@@ -233,7 +273,7 @@ int main(int argc, char ** argv) {
|
|
233 |
client.n_decoded = 0;
|
234 |
client.i_batch = batch.n_tokens - 1;
|
235 |
|
236 |
-
LOG_TEE("\033[
|
237 |
|
238 |
g_seq_id += 1;
|
239 |
|
@@ -301,7 +341,7 @@ int main(int argc, char ** argv) {
|
|
301 |
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
|
302 |
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
|
303 |
|
304 |
-
const llama_token id =
|
305 |
|
306 |
if (client.n_decoded == 1) {
|
307 |
// start measuring generation time after the first token to make sure all concurrent clients
|
@@ -332,12 +372,12 @@ int main(int argc, char ** argv) {
|
|
332 |
}
|
333 |
|
334 |
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
|
335 |
-
llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system,
|
336 |
|
337 |
const auto t_main_end = ggml_time_us();
|
338 |
|
339 |
-
LOG_TEE("\033[
|
340 |
-
client.id, client.seq_id, client.n_prompt, client.n_decoded,
|
341 |
(t_main_end - client.t_start_prompt) / 1e6,
|
342 |
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
|
343 |
n_cache_miss,
|
@@ -346,7 +386,7 @@ int main(int argc, char ** argv) {
|
|
346 |
|
347 |
n_total_prompt += client.n_prompt;
|
348 |
n_total_gen += client.n_decoded;
|
349 |
-
|
350 |
client.seq_id = -1;
|
351 |
}
|
352 |
|
@@ -357,13 +397,21 @@ int main(int argc, char ** argv) {
|
|
357 |
|
358 |
const auto t_main_end = ggml_time_us();
|
359 |
|
360 |
-
|
|
|
|
|
361 |
LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
|
362 |
LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
|
363 |
LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
|
364 |
LOG_TEE("Cache misses: %6d\n", n_cache_miss);
|
365 |
|
366 |
-
LOG_TEE("\n
|
367 |
|
368 |
llama_print_timings(ctx);
|
369 |
|
|
|
10 |
#include <cstdio>
|
11 |
#include <string>
|
12 |
#include <vector>
|
13 |
+
#include <ctime>
|
14 |
|
15 |
// trim whitespace from the beginning and end of a string
|
16 |
static std::string trim(const std::string & str) {
|
|
|
71 |
std::vector<llama_token> tokens_prev;
|
72 |
};
|
73 |
|
74 |
+
static void print_date_time() {
|
75 |
+
std::time_t current_time = std::time(nullptr);
|
76 |
+
std::tm* local_time = std::localtime(¤t_time);
|
77 |
+
char buffer[80];
|
78 |
+
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", local_time);
|
79 |
+
|
80 |
+
printf("\n\033[35mrun parameters as at %s\033[0m\n", buffer);
|
81 |
+
}
|
82 |
+
|
83 |
+
// Define a split string function to ...
|
84 |
+
static std::vector<std::string> split_string(const std::string& input, char delimiter) {
|
85 |
+
std::vector<std::string> tokens;
|
86 |
+
std::istringstream stream(input);
|
87 |
+
std::string token;
|
88 |
+
while (std::getline(stream, token, delimiter)) {
|
89 |
+
tokens.push_back(token);
|
90 |
+
}
|
91 |
+
return tokens;
|
92 |
+
}
|
93 |
+
|
94 |
int main(int argc, char ** argv) {
|
95 |
srand(1234);
|
96 |
|
|
|
125 |
params.logits_all = true;
|
126 |
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
127 |
|
128 |
+
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL);
|
129 |
+
|
130 |
+
// load the prompts from an external file if there are any
|
131 |
+
if (params.prompt.empty()) {
|
132 |
+
printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n");
|
133 |
+
} else {
|
134 |
+
// Output each line of the input params.prompts vector and copy to k_prompts
|
135 |
+
int index = 0;
|
136 |
+
printf("\n\033[32mNow printing the external prompt file %s\033[0m\n\n", params.prompt_file.c_str());
|
137 |
+
|
138 |
+
std::vector<std::string> prompts = split_string(params.prompt, '\n');
|
139 |
+
for (const auto& prompt : prompts) {
|
140 |
+
k_prompts.resize(index + 1);
|
141 |
+
k_prompts[index] = prompt;
|
142 |
+
index++;
|
143 |
+
printf("%3d prompt: %s\n", index, prompt.c_str());
|
144 |
+
}
|
145 |
+
}
|
146 |
+
|
147 |
fprintf(stderr, "\n\n");
|
148 |
fflush(stderr);
|
149 |
|
|
|
169 |
|
170 |
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
|
171 |
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
|
172 |
+
llama_batch batch = llama_batch_init(n_ctx, 0);
|
173 |
|
174 |
int32_t n_total_prompt = 0;
|
175 |
int32_t n_total_gen = 0;
|
|
|
273 |
client.n_decoded = 0;
|
274 |
client.i_batch = batch.n_tokens - 1;
|
275 |
|
276 |
+
LOG_TEE("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
|
277 |
|
278 |
g_seq_id += 1;
|
279 |
|
|
|
341 |
//printf("client %d, seq %d, token %d, pos %d, batch %d\n",
|
342 |
// client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
|
343 |
|
344 |
+
const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id);
|
345 |
|
346 |
if (client.n_decoded == 1) {
|
347 |
// start measuring generation time after the first token to make sure all concurrent clients
|
|
|
372 |
}
|
373 |
|
374 |
// delete only the generated part of the sequence, i.e. keep the system prompt in the cache
|
375 |
+
llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1);
|
376 |
|
377 |
const auto t_main_end = ggml_time_us();
|
378 |
|
379 |
+
LOG_TEE("\033[31mClient %3d, seq %3d/%3d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \nInput: %s\n\033[35mResponse: %s\033[0m\n\n",
|
380 |
+
client.id, client.seq_id, n_seq, client.n_prompt, client.n_decoded,
|
381 |
(t_main_end - client.t_start_prompt) / 1e6,
|
382 |
(double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
|
383 |
n_cache_miss,
|
|
|
386 |
|
387 |
n_total_prompt += client.n_prompt;
|
388 |
n_total_gen += client.n_decoded;
|
389 |
+
llama_sampling_context_reset(ctx_sampling, client.seq_id);
|
390 |
client.seq_id = -1;
|
391 |
}
|
392 |
|
|
|
397 |
|
398 |
const auto t_main_end = ggml_time_us();
|
399 |
|
400 |
+
print_date_time();
|
401 |
+
|
402 |
+
LOG_TEE("\n%s: n_parallel = %d, n_sequences = %d, cont_batching = %d, system tokens = %d\n", __func__, n_clients, n_seq, cont_batching, n_tokens_system);
|
403 |
+
if (params.prompt_file.empty()) {
|
404 |
+
params.prompt_file = "used built-in defaults";
|
405 |
+
}
|
406 |
+
LOG_TEE("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
|
407 |
+
LOG_TEE("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
|
408 |
+
|
409 |
LOG_TEE("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
|
410 |
LOG_TEE("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
|
411 |
LOG_TEE("Total speed (AVG): %6s speed: %5.2f t/s\n", "", (double) (n_total_prompt + n_total_gen) / (t_main_end - t_main_start) * 1e6);
|
412 |
LOG_TEE("Cache misses: %6d\n", n_cache_miss);
|
413 |
|
414 |
+
LOG_TEE("\n");
|
415 |
|
416 |
llama_print_timings(ctx);
|
417 |
|
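Note: the split_string helper added to parallel.cpp above is plain standard C++, so the prompt-file handling can be exercised in isolation. A self-contained sketch (the prompt text is invented for illustration):

```cpp
#include <cstdio>
#include <sstream>
#include <string>
#include <vector>

// Same helper as in the parallel example: split on a single-character delimiter.
static std::vector<std::string> split_string(const std::string & input, char delimiter) {
    std::vector<std::string> tokens;
    std::istringstream stream(input);
    std::string token;
    while (std::getline(stream, token, delimiter)) {
        tokens.push_back(token);
    }
    return tokens;
}

int main() {
    // Stands in for params.prompt loaded from an external prompt file.
    const std::string prompt = "What is the capital of France?\nExplain quicksort.\nWrite a haiku about autumn.";

    const std::vector<std::string> prompts = split_string(prompt, '\n');
    for (size_t i = 0; i < prompts.size(); ++i) {
        printf("%3zu prompt: %s\n", i + 1, prompts[i].c_str());
    }
    return 0;
}
```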
examples/save-load-state/save-load-state.cpp
CHANGED
@@ -8,9 +8,10 @@
|
|
8 |
|
9 |
int main(int argc, char ** argv) {
|
10 |
gpt_params params;
|
|
|
11 |
params.seed = 42;
|
12 |
params.n_threads = 4;
|
13 |
-
|
14 |
params.prompt = "The quick brown fox";
|
15 |
|
16 |
if (!gpt_params_parse(argc, argv, params)) {
|
@@ -24,7 +25,7 @@ int main(int argc, char ** argv) {
|
|
24 |
}
|
25 |
|
26 |
auto n_past = 0;
|
27 |
-
auto last_n_tokens_data = std::vector<llama_token>(
|
28 |
|
29 |
// init
|
30 |
llama_model * model;
|
|
|
8 |
|
9 |
int main(int argc, char ** argv) {
|
10 |
gpt_params params;
|
11 |
+
llama_sampling_params & sparams = params.sampling_params;
|
12 |
params.seed = 42;
|
13 |
params.n_threads = 4;
|
14 |
+
sparams.repeat_last_n = 64;
|
15 |
params.prompt = "The quick brown fox";
|
16 |
|
17 |
if (!gpt_params_parse(argc, argv, params)) {
|
|
|
25 |
}
|
26 |
|
27 |
auto n_past = 0;
|
28 |
+
auto last_n_tokens_data = std::vector<llama_token>(sparams.repeat_last_n, 0);
|
29 |
|
30 |
// init
|
31 |
llama_model * model;
|
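Note: the two added lines above are the smallest instance of this commit's parameter split, with sampling options moving into a nested llama_sampling_params. A hedged fragment of the access pattern (field names taken from elsewhere in this diff; not a complete program):

```cpp
#include "common.h"  // assumed to define gpt_params with a sampling_params member

static void configure(gpt_params & params) {
    // generation-level options stay on gpt_params
    params.seed   = 42;
    params.prompt = "The quick brown fox";

    // sampling options now live under params.sampling_params
    llama_sampling_params & sparams = params.sampling_params;
    sparams.repeat_last_n = 64;
    sparams.temp          = 0.8f;  // same field the other examples log as sparams.temp
}
```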
examples/server/index.html.hpp
CHANGED
The diff for this file is too large to render; see the raw diff.
|
|
examples/server/public/index.html
CHANGED
@@ -136,6 +136,11 @@
|
|
136 |
display: block;
|
137 |
}
|
138 |
|
|
|
|
|
|
|
|
|
|
|
139 |
header, footer {
|
140 |
text-align: center;
|
141 |
}
|
@@ -145,6 +150,14 @@
|
|
145 |
color: #888;
|
146 |
}
|
147 |
|
|
|
|
|
|
148 |
|
149 |
@keyframes loading-bg-wipe {
|
150 |
0% {
|
@@ -187,7 +200,7 @@
|
|
187 |
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
|
188 |
historyTemplate: "{{name}}: {{message}}",
|
189 |
transcript: [],
|
190 |
-
type: "chat",
|
191 |
char: "Llama",
|
192 |
user: "User",
|
193 |
})
|
@@ -365,13 +378,44 @@
|
|
365 |
return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
|
366 |
}
|
367 |
|
|
|
|
|
|
|
|
|
368 |
// send message to server
|
369 |
const chat = async (msg) => {
|
370 |
if (controller.value) {
|
371 |
console.log('already running...');
|
372 |
return;
|
373 |
}
|
374 |
-
controller.value = new AbortController();
|
375 |
|
376 |
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
|
377 |
|
@@ -391,55 +435,41 @@
|
|
391 |
).join("\n"),
|
392 |
});
|
393 |
|
394 |
-
|
395 |
-
const history = session.value.transcript
|
396 |
-
|
397 |
-
const llamaParams = {
|
398 |
...params.value,
|
399 |
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
|
400 |
-
}
|
401 |
-
|
402 |
-
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
|
403 |
-
const data = chunk.data;
|
404 |
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
}
|
418 |
|
419 |
-
|
420 |
-
|
421 |
-
|
|
|
|
|
422 |
}
|
|
|
423 |
|
424 |
-
|
|
|
|
|
425 |
}
|
426 |
|
427 |
function MessageInput() {
|
428 |
const message = useSignal("")
|
429 |
|
430 |
-
const stop = (e) => {
|
431 |
-
e.preventDefault();
|
432 |
-
if (controller.value) {
|
433 |
-
controller.value.abort();
|
434 |
-
controller.value = null;
|
435 |
-
}
|
436 |
-
}
|
437 |
-
|
438 |
-
const reset = (e) => {
|
439 |
-
stop(e);
|
440 |
-
transcriptUpdate([]);
|
441 |
-
}
|
442 |
-
|
443 |
const submit = (e) => {
|
444 |
stop(e);
|
445 |
chat(message.value);
|
@@ -474,6 +504,19 @@
|
|
474 |
`
|
475 |
}
|
476 |
|
|
|
|
|
|
|
477 |
const ChatLog = (props) => {
|
478 |
const messages = session.value.transcript;
|
479 |
const container = useRef(null)
|
@@ -497,7 +540,11 @@
|
|
497 |
data;
|
498 |
message = html`<${Markdownish} text=${template(text)} />`
|
499 |
}
|
500 |
-
|
|
|
|
|
|
|
|
|
501 |
};
|
502 |
|
503 |
return html`
|
@@ -574,18 +621,31 @@
|
|
574 |
userTemplateAutosave()
|
575 |
}, [session.value, params.value])
|
576 |
|
577 |
-
|
578 |
-
|
579 |
-
<
|
580 |
-
|
581 |
-
|
|
|
|
|
|
|
|
|
|
|
582 |
|
583 |
-
|
584 |
-
|
585 |
-
|
586 |
-
|
587 |
-
</
|
588 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
589 |
|
590 |
<fieldset class="two">
|
591 |
<div>
|
@@ -609,15 +669,30 @@
|
|
609 |
<label for="template">Chat history template</label>
|
610 |
<textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
|
611 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
612 |
|
|
|
|
|
|
|
|
|
613 |
<div>
|
614 |
-
<label
|
615 |
-
<
|
616 |
-
<input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
617 |
-
<button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
|
618 |
</div>
|
619 |
</fieldset>
|
620 |
|
|
|
|
|
621 |
<fieldset class="two">
|
622 |
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
|
623 |
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
|
@@ -851,7 +926,7 @@
|
|
851 |
function App(props) {
|
852 |
|
853 |
return html`
|
854 |
-
<div>
|
855 |
<header>
|
856 |
<h1>llama.cpp</h1>
|
857 |
</header>
|
@@ -861,7 +936,7 @@
|
|
861 |
</main>
|
862 |
|
863 |
<section id="write">
|
864 |
-
<${MessageInput} />
|
865 |
</section>
|
866 |
|
867 |
<footer>
|
|
|
136 |
display: block;
|
137 |
}
|
138 |
|
139 |
+
fieldset label.slim {
|
140 |
+
margin: 0 0.5em;
|
141 |
+
display: inline;
|
142 |
+
}
|
143 |
+
|
144 |
header, footer {
|
145 |
text-align: center;
|
146 |
}
|
|
|
150 |
color: #888;
|
151 |
}
|
152 |
|
153 |
+
.mode-chat textarea[name=prompt] {
|
154 |
+
height: 4.5em;
|
155 |
+
}
|
156 |
+
|
157 |
+
.mode-completion textarea[name=prompt] {
|
158 |
+
height: 10em;
|
159 |
+
}
|
160 |
+
|
161 |
|
162 |
@keyframes loading-bg-wipe {
|
163 |
0% {
|
|
|
200 |
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
|
201 |
historyTemplate: "{{name}}: {{message}}",
|
202 |
transcript: [],
|
203 |
+
type: "chat", // "chat" | "completion"
|
204 |
char: "Llama",
|
205 |
user: "User",
|
206 |
})
|
|
|
378 |
return String(str).replaceAll(/\{\{(.*?)\}\}/g, (_, key) => template(settings[key]));
|
379 |
}
|
380 |
|
381 |
+
async function runLlama(prompt, llamaParams, char) {
|
382 |
+
const currentMessages = [];
|
383 |
+
const history = session.value.transcript;
|
384 |
+
if (controller.value) {
|
385 |
+
throw new Error("already running");
|
386 |
+
}
|
387 |
+
controller.value = new AbortController();
|
388 |
+
for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
|
389 |
+
const data = chunk.data;
|
390 |
+
|
391 |
+
if (data.stop) {
|
392 |
+
while (
|
393 |
+
currentMessages.length > 0 &&
|
394 |
+
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
|
395 |
+
) {
|
396 |
+
currentMessages.pop();
|
397 |
+
}
|
398 |
+
transcriptUpdate([...history, [char, currentMessages]])
|
399 |
+
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
|
400 |
+
} else {
|
401 |
+
currentMessages.push(data);
|
402 |
+
transcriptUpdate([...history, [char, currentMessages]])
|
403 |
+
}
|
404 |
+
|
405 |
+
if (data.timings) {
|
406 |
+
llamaStats.value = data.timings;
|
407 |
+
}
|
408 |
+
}
|
409 |
+
|
410 |
+
controller.value = null;
|
411 |
+
}
|
412 |
+
|
413 |
// send message to server
|
414 |
const chat = async (msg) => {
|
415 |
if (controller.value) {
|
416 |
console.log('already running...');
|
417 |
return;
|
418 |
}
|
|
|
419 |
|
420 |
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
|
421 |
|
|
|
435 |
).join("\n"),
|
436 |
});
|
437 |
|
438 |
+
await runLlama(prompt, {
|
|
|
|
|
|
|
439 |
...params.value,
|
440 |
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
|
441 |
+
}, "{{char}}");
|
442 |
+
}
|
|
|
|
|
443 |
|
444 |
+
const runCompletion = async () => {
|
445 |
+
if (controller.value) {
|
446 |
+
console.log('already running...');
|
447 |
+
return;
|
448 |
+
}
|
449 |
+
const {prompt} = session.value;
|
450 |
+
transcriptUpdate([...session.value.transcript, ["", prompt]]);
|
451 |
+
await runLlama(prompt, {
|
452 |
+
...params.value,
|
453 |
+
stop: [],
|
454 |
+
}, "");
|
455 |
+
}
|
|
|
456 |
|
457 |
+
const stop = (e) => {
|
458 |
+
e.preventDefault();
|
459 |
+
if (controller.value) {
|
460 |
+
controller.value.abort();
|
461 |
+
controller.value = null;
|
462 |
}
|
463 |
+
}
|
464 |
|
465 |
+
const reset = (e) => {
|
466 |
+
stop(e);
|
467 |
+
transcriptUpdate([]);
|
468 |
}
|
469 |
|
470 |
function MessageInput() {
|
471 |
const message = useSignal("")
|
472 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
473 |
const submit = (e) => {
|
474 |
stop(e);
|
475 |
chat(message.value);
|
|
|
504 |
`
|
505 |
}
|
506 |
|
507 |
+
function CompletionControls() {
|
508 |
+
const submit = (e) => {
|
509 |
+
stop(e);
|
510 |
+
runCompletion();
|
511 |
+
}
|
512 |
+
return html`
|
513 |
+
<div>
|
514 |
+
<button onclick=${submit} type="button" disabled=${generating.value}>Start</button>
|
515 |
+
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
|
516 |
+
<button onclick=${reset}>Reset</button>
|
517 |
+
</div>`;
|
518 |
+
}
|
519 |
+
|
520 |
const ChatLog = (props) => {
|
521 |
const messages = session.value.transcript;
|
522 |
const container = useRef(null)
|
|
|
540 |
data;
|
541 |
message = html`<${Markdownish} text=${template(text)} />`
|
542 |
}
|
543 |
+
if(user) {
|
544 |
+
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
545 |
+
} else {
|
546 |
+
return html`<p key=${index}>${message}</p>`
|
547 |
+
}
|
548 |
};
|
549 |
|
550 |
return html`
|
|
|
621 |
userTemplateAutosave()
|
622 |
}, [session.value, params.value])
|
623 |
|
624 |
+
const GrammarControl = () => (
|
625 |
+
html`
|
626 |
+
<div>
|
627 |
+
<label for="template">Grammar</label>
|
628 |
+
<textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
629 |
+
<input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
630 |
+
<button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
|
631 |
+
</div>
|
632 |
+
`
|
633 |
+
);
|
634 |
|
635 |
+
const PromptControlFieldSet = () => (
|
636 |
+
html`
|
637 |
+
<fieldset>
|
638 |
+
<div>
|
639 |
+
<label htmlFor="prompt">Prompt</label>
|
640 |
+
<textarea type="text" name="prompt" value="${session.value.prompt}" oninput=${updateSession}/>
|
641 |
+
</div>
|
642 |
+
</fieldset>
|
643 |
+
`
|
644 |
+
);
|
645 |
+
|
646 |
+
const ChatConfigForm = () => (
|
647 |
+
html`
|
648 |
+
${PromptControlFieldSet()}
|
649 |
|
650 |
<fieldset class="two">
|
651 |
<div>
|
|
|
669 |
<label for="template">Chat history template</label>
|
670 |
<textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
|
671 |
</div>
|
672 |
+
${GrammarControl()}
|
673 |
+
</fieldset>
|
674 |
+
`
|
675 |
+
);
|
676 |
+
|
677 |
+
const CompletionConfigForm = () => (
|
678 |
+
html`
|
679 |
+
${PromptControlFieldSet()}
|
680 |
+
<fieldset>${GrammarControl()}</fieldset>
|
681 |
+
`
|
682 |
+
);
|
683 |
|
684 |
+
return html`
|
685 |
+
<form>
|
686 |
+
<fieldset class="two">
|
687 |
+
<${UserTemplateResetButton}/>
|
688 |
<div>
|
689 |
+
<label class="slim"><input type="radio" name="type" value="chat" checked=${session.value.type === "chat"} oninput=${updateSession} /> Chat</label>
|
690 |
+
<label class="slim"><input type="radio" name="type" value="completion" checked=${session.value.type === "completion"} oninput=${updateSession} /> Completion</label>
|
|
|
|
|
691 |
</div>
|
692 |
</fieldset>
|
693 |
|
694 |
+
${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
|
695 |
+
|
696 |
<fieldset class="two">
|
697 |
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
|
698 |
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
|
|
|
926 |
function App(props) {
|
927 |
|
928 |
return html`
|
929 |
+
<div class="mode-${session.value.type}">
|
930 |
<header>
|
931 |
<h1>llama.cpp</h1>
|
932 |
</header>
|
|
|
936 |
</main>
|
937 |
|
938 |
<section id="write">
|
939 |
+
<${session.value.type === 'chat' ? MessageInput : CompletionControls} />
|
940 |
</section>
|
941 |
|
942 |
<footer>
|
examples/server/server.cpp
CHANGED
@@ -200,6 +200,7 @@ struct llama_server_context
|
|
200 |
llama_model *model = nullptr;
|
201 |
llama_context *ctx = nullptr;
|
202 |
gpt_params params;
|
|
|
203 |
int n_ctx;
|
204 |
|
205 |
grammar_parser::parse_state parsed_grammar;
|
@@ -254,6 +255,7 @@ struct llama_server_context
|
|
254 |
if (grammar != nullptr) {
|
255 |
llama_grammar_free(grammar);
|
256 |
grammar = nullptr;
|
|
|
257 |
}
|
258 |
}
|
259 |
|
@@ -329,8 +331,8 @@ struct llama_server_context
|
|
329 |
grammar_parser::print_grammar(stderr, parsed_grammar);
|
330 |
|
331 |
{
|
332 |
-
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
333 |
-
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
334 |
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
|
335 |
}
|
336 |
}
|
@@ -339,9 +341,89 @@ struct llama_server_context
|
|
339 |
grammar = llama_grammar_init(
|
340 |
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
341 |
}
|
|
|
342 |
return true;
|
343 |
}
|
344 |
|
|
|
|
|
|
|
|
|
|
|
|
|
345 |
void loadPrompt()
|
346 |
{
|
347 |
auto prompt_tokens = tokenize(prompt, true); // always add BOS
|
@@ -383,9 +465,6 @@ struct llama_server_context
|
|
383 |
// compare the evaluated prompt with the new prompt
|
384 |
n_past = common_part(embd, prompt_tokens);
|
385 |
|
386 |
-
// since #3228 we now have to manually manage the KV cache
|
387 |
-
llama_kv_cache_seq_rm(ctx, 0, n_past, params.n_ctx);
|
388 |
-
|
389 |
embd = prompt_tokens;
|
390 |
if (n_past == num_prompt_tokens)
|
391 |
{
|
@@ -393,6 +472,9 @@ struct llama_server_context
|
|
393 |
n_past--;
|
394 |
}
|
395 |
|
|
|
|
|
|
|
396 |
LOG_VERBOSE("prompt ingested", {
|
397 |
{"n_past", n_past},
|
398 |
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
@@ -440,9 +522,11 @@ struct llama_server_context
|
|
440 |
});
|
441 |
}
|
442 |
|
|
|
443 |
while (n_past < embd.size())
|
444 |
{
|
445 |
int n_eval = (int)embd.size() - n_past;
|
|
|
446 |
if (n_eval > params.n_batch)
|
447 |
{
|
448 |
n_eval = params.n_batch;
|
@@ -468,98 +552,20 @@ struct llama_server_context
|
|
468 |
return result;
|
469 |
}
|
470 |
|
471 |
-
// out of user input, sample next token
|
472 |
-
const float temp = params.temp;
|
473 |
-
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(model) : params.top_k;
|
474 |
-
const float top_p = params.top_p;
|
475 |
-
const float tfs_z = params.tfs_z;
|
476 |
-
const float typical_p = params.typical_p;
|
477 |
-
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
478 |
-
const float repeat_penalty = params.repeat_penalty;
|
479 |
-
const float alpha_presence = params.presence_penalty;
|
480 |
-
const float alpha_frequency = params.frequency_penalty;
|
481 |
-
const int mirostat = params.mirostat;
|
482 |
-
const float mirostat_tau = params.mirostat_tau;
|
483 |
-
const float mirostat_eta = params.mirostat_eta;
|
484 |
-
const bool penalize_nl = params.penalize_nl;
|
485 |
-
const int32_t n_probs = params.n_probs;
|
486 |
-
|
487 |
{
|
488 |
-
|
489 |
-
auto n_vocab = llama_n_vocab(model);
|
490 |
-
|
491 |
-
// Apply params.logit_bias map
|
492 |
-
for (const auto &it : params.logit_bias)
|
493 |
-
{
|
494 |
-
logits[it.first] += it.second;
|
495 |
-
}
|
496 |
-
|
497 |
std::vector<llama_token_data> candidates;
|
498 |
-
candidates.reserve(
|
499 |
-
for (llama_token token_id = 0; token_id < n_vocab; token_id++)
|
500 |
-
{
|
501 |
-
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
502 |
-
}
|
503 |
|
504 |
-
|
505 |
-
|
506 |
-
// Apply penalties
|
507 |
-
float nl_logit = logits[llama_token_nl(ctx)];
|
508 |
-
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
|
509 |
-
llama_sample_repetition_penalty(ctx, &candidates_p,
|
510 |
-
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
511 |
-
last_n_repeat, repeat_penalty);
|
512 |
-
llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
513 |
-
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
514 |
-
last_n_repeat, alpha_frequency, alpha_presence);
|
515 |
-
if (!penalize_nl)
|
516 |
-
{
|
517 |
-
logits[llama_token_nl(ctx)] = nl_logit;
|
518 |
-
}
|
519 |
|
520 |
-
|
521 |
-
llama_sample_grammar(ctx, &candidates_p, grammar);
|
522 |
-
}
|
523 |
|
524 |
-
|
|
|
525 |
{
|
526 |
-
//
|
527 |
-
|
528 |
-
if (n_probs > 0)
|
529 |
-
{
|
530 |
-
llama_sample_softmax(ctx, &candidates_p);
|
531 |
-
}
|
532 |
-
}
|
533 |
-
else
|
534 |
-
{
|
535 |
-
if (mirostat == 1)
|
536 |
-
{
|
537 |
-
static float mirostat_mu = 2.0f * mirostat_tau;
|
538 |
-
const int mirostat_m = 100;
|
539 |
-
llama_sample_temp(ctx, &candidates_p, temp);
|
540 |
-
result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
541 |
-
}
|
542 |
-
else if (mirostat == 2)
|
543 |
-
{
|
544 |
-
static float mirostat_mu = 2.0f * mirostat_tau;
|
545 |
-
llama_sample_temp(ctx, &candidates_p, temp);
|
546 |
-
result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
547 |
-
}
|
548 |
-
else
|
549 |
-
{
|
550 |
-
// Temperature sampling
|
551 |
-
size_t min_keep = std::max(1, n_probs);
|
552 |
-
llama_sample_top_k(ctx, &candidates_p, top_k, min_keep);
|
553 |
-
llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep);
|
554 |
-
llama_sample_typical(ctx, &candidates_p, typical_p, min_keep);
|
555 |
-
llama_sample_top_p(ctx, &candidates_p, top_p, min_keep);
|
556 |
-
llama_sample_temp(ctx, &candidates_p, temp);
|
557 |
-
result.tok = llama_sample_token(ctx, &candidates_p);
|
558 |
-
}
|
559 |
-
}
|
560 |
-
|
561 |
-
if (grammar != nullptr) {
|
562 |
-
llama_grammar_accept_token(ctx, grammar, result.tok);
|
563 |
}
|
564 |
|
565 |
for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
|
@@ -569,7 +575,9 @@ struct llama_server_context
|
|
569 |
|
570 |
last_n_tokens.erase(last_n_tokens.begin());
|
571 |
last_n_tokens.push_back(result.tok);
|
572 |
-
|
|
|
|
|
573 |
}
|
574 |
|
575 |
// add it to the context
|
@@ -629,7 +637,7 @@ struct llama_server_context
|
|
629 |
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
|
630 |
generated_text += token_text;
|
631 |
|
632 |
-
if (params.n_probs > 0)
|
633 |
{
|
634 |
generated_token_probs.push_back(token_with_probs);
|
635 |
}
|
@@ -710,15 +718,16 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|
710 |
printf("usage: %s [options]\n", argv0);
|
711 |
printf("\n");
|
712 |
printf("options:\n");
|
713 |
-
printf(" -h, --help
|
714 |
-
printf(" -v, --verbose
|
715 |
-
printf(" -t N,
|
716 |
-
printf(" -
|
717 |
-
printf(" --
|
718 |
-
printf(" --rope-freq-
|
719 |
-
printf("
|
720 |
-
printf(" --
|
721 |
-
printf("
|
|
|
722 |
if (llama_mlock_supported())
|
723 |
{
|
724 |
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
@@ -863,6 +872,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|
863 |
}
|
864 |
params.n_threads = std::stoi(argv[i]);
|
865 |
}
|
|
|
|
|
|
866 |
else if (arg == "-b" || arg == "--batch-size")
|
867 |
{
|
868 |
if (++i >= argc)
|
@@ -947,7 +965,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|
947 |
invalid_param = true;
|
948 |
break;
|
949 |
}
|
950 |
-
params.lora_adapter.push_back(
|
951 |
params.use_mmap = false;
|
952 |
}
|
953 |
else if (arg == "--lora-scaled")
|
@@ -963,7 +981,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|
963 |
invalid_param = true;
|
964 |
break;
|
965 |
}
|
966 |
-
params.lora_adapter.push_back(
|
967 |
params.use_mmap = false;
|
968 |
}
|
969 |
else if (arg == "--lora-base")
|
@@ -1017,34 +1035,35 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|
1017 |
|
1018 |
static json format_generation_settings(llama_server_context &llama)
|
1019 |
{
|
1020 |
-
const auto
|
1021 |
-
const
|
|
|
1022 |
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
1023 |
|
1024 |
return json{
|
1025 |
{"n_ctx", llama.n_ctx},
|
1026 |
{"model", llama.params.model_alias},
|
1027 |
{"seed", llama.params.seed},
|
1028 |
-
{"temp",
|
1029 |
-
{"top_k",
|
1030 |
-
{"top_p",
|
1031 |
-
{"tfs_z",
|
1032 |
-
{"typical_p",
|
1033 |
-
{"repeat_last_n",
|
1034 |
-
{"repeat_penalty",
|
1035 |
-
{"presence_penalty",
|
1036 |
-
{"frequency_penalty",
|
1037 |
-
{"mirostat",
|
1038 |
-
{"mirostat_tau",
|
1039 |
-
{"mirostat_eta",
|
1040 |
-
{"penalize_nl",
|
1041 |
{"stop", llama.params.antiprompt},
|
1042 |
{"n_predict", llama.params.n_predict},
|
1043 |
{"n_keep", llama.params.n_keep},
|
1044 |
{"ignore_eos", ignore_eos},
|
1045 |
{"stream", llama.stream},
|
1046 |
-
{"logit_bias",
|
1047 |
-
{"n_probs",
|
1048 |
{"grammar", llama.params.grammar},
|
1049 |
};
|
1050 |
}
|
@@ -1060,8 +1079,6 @@ static json format_timings(llama_server_context &llama)
|
|
1060 |
{
|
1061 |
const auto timings = llama_get_timings(llama.ctx);
|
1062 |
|
1063 |
-
assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
|
1064 |
-
|
1065 |
return json{
|
1066 |
{"prompt_n", timings.n_p_eval},
|
1067 |
{"prompt_ms", timings.t_p_eval_ms},
|
@@ -1095,7 +1112,7 @@ static json format_final_response(llama_server_context &llama, const std::string
|
|
1095 |
{"timings", format_timings(llama)},
|
1096 |
};
|
1097 |
|
1098 |
-
if (llama.params.n_probs > 0)
|
1099 |
{
|
1100 |
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
1101 |
}
|
@@ -1111,7 +1128,7 @@ static json format_partial_response(
|
|
1111 |
{"stop", false},
|
1112 |
};
|
1113 |
|
1114 |
-
if (llama.params.n_probs > 0)
|
1115 |
{
|
1116 |
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
1117 |
}
|
@@ -1143,26 +1160,28 @@ static T json_value(const json &body, const std::string &key, const T &default_v
|
|
1143 |
static void parse_options_completion(const json &body, llama_server_context &llama)
|
1144 |
{
|
1145 |
gpt_params default_params;
|
|
|
|
|
1146 |
|
1147 |
llama.stream = json_value(body, "stream", false);
|
1148 |
llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
|
1149 |
-
|
1150 |
-
|
1151 |
-
|
1152 |
-
|
1153 |
-
|
1154 |
-
|
1155 |
-
|
1156 |
-
|
1157 |
-
|
1158 |
-
|
1159 |
-
|
1160 |
-
|
1161 |
-
|
1162 |
llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
|
1163 |
llama.params.seed = json_value(body, "seed", default_params.seed);
|
1164 |
llama.params.grammar = json_value(body, "grammar", default_params.grammar);
|
1165 |
-
|
1166 |
|
1167 |
if (body.count("prompt") != 0)
|
1168 |
{
|
@@ -1173,10 +1192,10 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
|
1173 |
llama.prompt = "";
|
1174 |
}
|
1175 |
|
1176 |
-
|
1177 |
if (json_value(body, "ignore_eos", false))
|
1178 |
{
|
1179 |
-
|
1180 |
}
|
1181 |
|
1182 |
const auto &logit_bias = body.find("logit_bias");
|
@@ -1192,11 +1211,11 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
|
1192 |
{
|
1193 |
if (el[1].is_number())
|
1194 |
{
|
1195 |
-
|
1196 |
}
|
1197 |
else if (el[1].is_boolean() && !el[1].get<bool>())
|
1198 |
{
|
1199 |
-
|
1200 |
}
|
1201 |
}
|
1202 |
}
|
@@ -1216,9 +1235,32 @@ static void parse_options_completion(const json &body, llama_server_context &lla
|
|
1216 |
}
|
1217 |
}
|
1218 |
|
|
|
|
|
1219 |
LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
|
1220 |
}
|
1221 |
|
|
|
|
|
|
|
|
1222 |
static void log_server_request(const Request &req, const Response &res)
|
1223 |
{
|
1224 |
LOG_INFO("request", {
|
@@ -1403,7 +1445,7 @@ int main(int argc, char **argv)
|
|
1403 |
}
|
1404 |
|
1405 |
auto probs = llama.generated_token_probs;
|
1406 |
-
if (llama.params.n_probs > 0 && llama.stopped_word) {
|
1407 |
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
|
1408 |
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
|
1409 |
}
|
@@ -1455,7 +1497,7 @@ int main(int argc, char **argv)
|
|
1455 |
|
1456 |
std::vector<completion_token_output> probs_output = {};
|
1457 |
|
1458 |
-
if (llama.params.n_probs > 0) {
|
1459 |
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
1460 |
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
1461 |
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
@@ -1519,6 +1561,127 @@ int main(int argc, char **argv)
|
|
1519 |
res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
|
1520 |
} });
|
1521 |
|
|
|
|
|
|
|
|
|
|
1522 |
svr.Get("/model.json", [&llama](const Request &, Response &res)
|
1523 |
{
|
1524 |
const json data = format_generation_settings(llama);
|
|
|
200 |
llama_model *model = nullptr;
|
201 |
llama_context *ctx = nullptr;
|
202 |
gpt_params params;
|
203 |
+
llama_sampling_context ctx_sampling;
|
204 |
int n_ctx;
|
205 |
|
206 |
grammar_parser::parse_state parsed_grammar;
|
|
|
255 |
if (grammar != nullptr) {
|
256 |
llama_grammar_free(grammar);
|
257 |
grammar = nullptr;
|
258 |
+
ctx_sampling = llama_sampling_context_init(params, NULL);
|
259 |
}
|
260 |
}
|
261 |
|
|
|
331 |
grammar_parser::print_grammar(stderr, parsed_grammar);
|
332 |
|
333 |
{
|
334 |
+
auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx));
|
335 |
+
if (it != params.sampling_params.logit_bias.end() && it->second == -INFINITY) {
|
336 |
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
|
337 |
}
|
338 |
}
|
|
|
341 |
grammar = llama_grammar_init(
|
342 |
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
343 |
}
|
344 |
+
ctx_sampling = llama_sampling_context_init(params, grammar);
|
345 |
return true;
|
346 |
}
|
347 |
|
348 |
+
void loadInfill()
|
349 |
+
{
|
350 |
+
bool suff_rm_leading_spc = true;
|
351 |
+
if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
|
352 |
+
params.input_suffix.erase(0, 1);
|
353 |
+
suff_rm_leading_spc = false;
|
354 |
+
}
|
355 |
+
|
356 |
+
auto prefix_tokens = tokenize(params.input_prefix, false);
|
357 |
+
auto suffix_tokens = tokenize(params.input_suffix, false);
|
358 |
+
const int space_token = 29871;
|
359 |
+
if (suff_rm_leading_spc && suffix_tokens[0] == space_token) {
|
360 |
+
suffix_tokens.erase(suffix_tokens.begin());
|
361 |
+
}
|
362 |
+
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx));
|
363 |
+
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS
|
364 |
+
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx));
|
365 |
+
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
366 |
+
prefix_tokens.push_back(llama_token_middle(ctx));
|
367 |
+
auto prompt_tokens = prefix_tokens;
|
368 |
+
|
369 |
+
num_prompt_tokens = prompt_tokens.size();
|
370 |
+
|
371 |
+
if (params.n_keep < 0)
|
372 |
+
{
|
373 |
+
params.n_keep = (int)num_prompt_tokens;
|
374 |
+
}
|
375 |
+
params.n_keep = std::min(params.n_ctx - 4, params.n_keep);
|
376 |
+
|
377 |
+
// if input prompt is too big, truncate like normal
|
378 |
+
if (num_prompt_tokens >= (size_t)params.n_ctx)
|
379 |
+
{
|
380 |
+
printf("Input prompt is too big, truncating. Can only take %d tokens but got %zu\n", params.n_ctx, num_prompt_tokens);
|
381 |
+
// todo we probably want to cut from both sides
|
382 |
+
const int n_left = (params.n_ctx - params.n_keep) / 2;
|
383 |
+
std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
|
384 |
+
const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
|
385 |
+
new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
|
386 |
+
std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
|
387 |
+
|
388 |
+
LOG_VERBOSE("input truncated", {
|
389 |
+
{"n_ctx", params.n_ctx},
|
390 |
+
{"n_keep", params.n_keep},
|
391 |
+
{"n_left", n_left},
|
392 |
+
{"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
|
393 |
+
});
|
394 |
+
|
395 |
+
truncated = true;
|
396 |
+
prompt_tokens = new_tokens;
|
397 |
+
}
|
398 |
+
else
|
399 |
+
{
|
400 |
+
const size_t ps = num_prompt_tokens;
|
401 |
+
std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
|
402 |
+
std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
|
403 |
+
}
|
404 |
+
|
405 |
+
// compare the evaluated prompt with the new prompt
|
406 |
+
n_past = common_part(embd, prompt_tokens);
|
407 |
+
embd = prompt_tokens;
|
408 |
+
|
409 |
+
if (n_past == num_prompt_tokens)
|
410 |
+
{
|
411 |
+
// we have to evaluate at least 1 token to generate logits.
|
412 |
+
printf("we have to evaluate at least 1 token to generate logits\n");
|
413 |
+
n_past--;
|
414 |
+
}
|
415 |
+
|
416 |
+
// since #3228 we now have to manually manage the KV cache
|
417 |
+
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
418 |
+
|
419 |
+
LOG_VERBOSE("prompt ingested", {
|
420 |
+
{"n_past", n_past},
|
421 |
+
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
422 |
+
{"to_eval", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())},
|
423 |
+
});
|
424 |
+
|
425 |
+
has_next_token = true;
|
426 |
+
}
|
427 |
void loadPrompt()
|
428 |
{
|
429 |
auto prompt_tokens = tokenize(prompt, true); // always add BOS
|
|
|
465 |
// compare the evaluated prompt with the new prompt
|
466 |
n_past = common_part(embd, prompt_tokens);
|
467 |
|
|
|
|
|
|
|
468 |
embd = prompt_tokens;
|
469 |
if (n_past == num_prompt_tokens)
|
470 |
{
|
|
|
472 |
n_past--;
|
473 |
}
|
474 |
|
475 |
+
// since #3228 we now have to manually manage the KV cache
|
476 |
+
llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
|
477 |
+
|
478 |
LOG_VERBOSE("prompt ingested", {
|
479 |
{"n_past", n_past},
|
480 |
{"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
|
|
|
522 |
});
|
523 |
}
|
524 |
|
525 |
+
bool tg = true;
|
526 |
while (n_past < embd.size())
|
527 |
{
|
528 |
int n_eval = (int)embd.size() - n_past;
|
529 |
+
tg = n_eval == 1;
|
530 |
if (n_eval > params.n_batch)
|
531 |
{
|
532 |
n_eval = params.n_batch;
|
|
|
552 |
return result;
|
553 |
}
|
554 |
|
|
|
|
|
|
555 |
{
|
556 |
+
// out of user input, sample next token
|
|
|
|
|
|
557 |
std::vector<llama_token_data> candidates;
|
558 |
+
candidates.reserve(llama_n_vocab(model));
|
|
|
|
|
|
|
|
|
559 |
|
560 |
+
result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates);
|
|
|
|
|
|
561 |
|
562 |
+
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
|
|
|
|
563 |
|
564 |
+
const int32_t n_probs = params.sampling_params.n_probs;
|
565 |
+
if (params.sampling_params.temp <= 0 && n_probs > 0)
|
566 |
{
|
567 |
+
// For llama_sample_token_greedy we need to sort candidates
|
568 |
+
llama_sample_softmax(ctx, &candidates_p);
|
|
|
|
|
|
|
|
|
|
569 |
}
|
570 |
|
571 |
for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
|
|
|
575 |
|
576 |
last_n_tokens.erase(last_n_tokens.begin());
|
577 |
last_n_tokens.push_back(result.tok);
|
578 |
+
if (tg) {
|
579 |
+
num_tokens_predicted++;
|
580 |
+
}
|
581 |
}
|
582 |
|
583 |
// add it to the context
|
|
|
637 |
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
|
638 |
generated_text += token_text;
|
639 |
|
640 |
+
if (params.sampling_params.n_probs > 0)
|
641 |
{
|
642 |
generated_token_probs.push_back(token_with_probs);
|
643 |
}
|
|
|
718 |
printf("usage: %s [options]\n", argv0);
|
719 |
printf("\n");
|
720 |
printf("options:\n");
|
721 |
+
printf(" -h, --help show this help message and exit\n");
|
722 |
+
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
723 |
+
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
724 |
+
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
725 |
+
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
726 |
+
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
727 |
+
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
728 |
+
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
729 |
+
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
730 |
+
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
731 |
if (llama_mlock_supported())
|
732 |
{
|
733 |
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
|
|
872 |
}
|
873 |
params.n_threads = std::stoi(argv[i]);
|
874 |
}
|
875 |
+
else if (arg == "--threads-batch" || arg == "-tb")
|
876 |
+
{
|
877 |
+
if (++i >= argc)
|
878 |
+
{
|
879 |
+
invalid_param = true;
|
880 |
+
break;
|
881 |
+
}
|
882 |
+
params.n_threads_batch = std::stoi(argv[i]);
|
883 |
+
}
|
884 |
else if (arg == "-b" || arg == "--batch-size")
|
885 |
{
|
886 |
if (++i >= argc)
|
|
|
965 |
invalid_param = true;
|
966 |
break;
|
967 |
}
|
968 |
+
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
|
969 |
params.use_mmap = false;
|
970 |
}
|
971 |
else if (arg == "--lora-scaled")
|
|
|
981 |
invalid_param = true;
|
982 |
break;
|
983 |
}
|
984 |
+
params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
|
985 |
params.use_mmap = false;
|
986 |
}
|
987 |
else if (arg == "--lora-base")
|
|
|
1035 |
|
1036 |
static json format_generation_settings(llama_server_context &llama)
|
1037 |
{
|
1038 |
+
const auto & sparams = llama.params.sampling_params;
|
1039 |
+
const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx));
|
1040 |
+
const bool ignore_eos = eos_bias != sparams.logit_bias.end() &&
|
1041 |
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
|
1042 |
|
1043 |
return json{
|
1044 |
{"n_ctx", llama.n_ctx},
|
1045 |
{"model", llama.params.model_alias},
|
1046 |
{"seed", llama.params.seed},
|
1047 |
+
{"temp", sparams.temp},
|
1048 |
+
{"top_k", sparams.top_k},
|
1049 |
+
{"top_p", sparams.top_p},
|
1050 |
+
{"tfs_z", sparams.tfs_z},
|
1051 |
+
{"typical_p", sparams.typical_p},
|
1052 |
+
{"repeat_last_n", sparams.repeat_last_n},
|
1053 |
+
{"repeat_penalty", sparams.repeat_penalty},
|
1054 |
+
{"presence_penalty", sparams.presence_penalty},
|
1055 |
+
{"frequency_penalty", sparams.frequency_penalty},
|
1056 |
+
{"mirostat", sparams.mirostat},
|
1057 |
+
{"mirostat_tau", sparams.mirostat_tau},
|
1058 |
+
{"mirostat_eta", sparams.mirostat_eta},
|
1059 |
+
{"penalize_nl", sparams.penalize_nl},
|
1060 |
{"stop", llama.params.antiprompt},
|
1061 |
{"n_predict", llama.params.n_predict},
|
1062 |
{"n_keep", llama.params.n_keep},
|
1063 |
{"ignore_eos", ignore_eos},
|
1064 |
{"stream", llama.stream},
|
1065 |
+
{"logit_bias", sparams.logit_bias},
|
1066 |
+
{"n_probs", sparams.n_probs},
|
1067 |
{"grammar", llama.params.grammar},
|
1068 |
};
|
1069 |
}
|
|
|
1079 |
{
|
1080 |
const auto timings = llama_get_timings(llama.ctx);
|
1081 |
|
|
|
|
|
1082 |
return json{
|
1083 |
{"prompt_n", timings.n_p_eval},
|
1084 |
{"prompt_ms", timings.t_p_eval_ms},
|
|
|
1112 |
{"timings", format_timings(llama)},
|
1113 |
};
|
1114 |
|
1115 |
+
if (llama.params.sampling_params.n_probs > 0)
|
1116 |
{
|
1117 |
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
1118 |
}
|
|
|
1128 |
{"stop", false},
|
1129 |
};
|
1130 |
|
1131 |
+
if (llama.params.sampling_params.n_probs > 0)
|
1132 |
{
|
1133 |
res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs);
|
1134 |
}
|
|
|
1160 |
static void parse_options_completion(const json &body, llama_server_context &llama)
|
1161 |
{
|
1162 |
gpt_params default_params;
|
1163 |
+
const auto & default_sparams = default_params.sampling_params;
|
1164 |
+
auto & sparams = llama.params.sampling_params;
|
1165 |
|
1166 |
llama.stream = json_value(body, "stream", false);
|
1167 |
llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
|
1168 |
+
sparams.top_k = json_value(body, "top_k", default_sparams.top_k);
|
1169 |
+
sparams.top_p = json_value(body, "top_p", default_sparams.top_p);
|
1170 |
+
sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z);
|
1171 |
+
sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p);
|
1172 |
+
sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n);
|
1173 |
+
sparams.temp = json_value(body, "temperature", default_sparams.temp);
|
1174 |
+
sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty);
|
1175 |
+
sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty);
|
1176 |
+
sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty);
|
1177 |
+
sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat);
|
1178 |
+
sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
|
1179 |
+
sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
|
1180 |
+
sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl);
|
1181 |
llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
|
1182 |
llama.params.seed = json_value(body, "seed", default_params.seed);
|
1183 |
llama.params.grammar = json_value(body, "grammar", default_params.grammar);
|
1184 |
+
sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs);
|
1185 |
|
1186 |
if (body.count("prompt") != 0)
|
1187 |
{
|
|
|
1192 |
llama.prompt = "";
|
1193 |
}
|
1194 |
|
1195 |
+
sparams.logit_bias.clear();
|
1196 |
if (json_value(body, "ignore_eos", false))
|
1197 |
{
|
1198 |
+
sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
|
1199 |
}
|
1200 |
|
1201 |
const auto &logit_bias = body.find("logit_bias");
|
|
|
1211 |
{
|
1212 |
if (el[1].is_number())
|
1213 |
{
|
1214 |
+
sparams.logit_bias[tok] = el[1].get<float>();
|
1215 |
}
|
1216 |
else if (el[1].is_boolean() && !el[1].get<bool>())
|
1217 |
{
|
1218 |
+
sparams.logit_bias[tok] = -INFINITY;
|
1219 |
}
|
1220 |
}
|
1221 |
}
|
|
|
1235 |
}
|
1236 |
}
|
1237 |
|
1238 |
+
llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar);
|
1239 |
+
|
1240 |
LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
|
1241 |
}
|
1242 |
|
1243 |
+
static void parse_options_infill(const json &body, llama_server_context &llama)
|
1244 |
+
{
|
1245 |
+
if (body.count("input_prefix") != 0)
|
1246 |
+
{
|
1247 |
+
llama.params.input_prefix = body["input_prefix"];
|
1248 |
+
}
|
1249 |
+
else
|
1250 |
+
{
|
1251 |
+
llama.params.input_prefix = "";
|
1252 |
+
}
|
1253 |
+
if (body.count("input_suffix") != 0)
|
1254 |
+
{
|
1255 |
+
llama.params.input_suffix = body["input_suffix"];
|
1256 |
+
}
|
1257 |
+
else
|
1258 |
+
{
|
1259 |
+
llama.params.input_suffix = "";
|
1260 |
+
}
|
1261 |
+
parse_options_completion(body, llama);
|
1262 |
+
}
|
1263 |
+
|
1264 |
static void log_server_request(const Request &req, const Response &res)
|
1265 |
{
|
1266 |
LOG_INFO("request", {
|
|
|
1445 |
}
|
1446 |
|
1447 |
auto probs = llama.generated_token_probs;
|
1448 |
+
if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
|
1449 |
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
|
1450 |
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
|
1451 |
}
|
|
|
1497 |
|
1498 |
std::vector<completion_token_output> probs_output = {};
|
1499 |
|
1500 |
+
if (llama.params.sampling_params.n_probs > 0) {
|
1501 |
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
1502 |
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
1503 |
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
|
|
1561 |
res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
|
1562 |
} });
|
1563 |
|
1564 |
+
svr.Post("/infill", [&llama](const Request &req, Response &res)
|
1565 |
+
{
|
1566 |
+
auto lock = llama.lock();
|
1567 |
+
|
1568 |
+
llama.rewind();
|
1569 |
+
|
1570 |
+
llama_reset_timings(llama.ctx);
|
1571 |
+
|
1572 |
+
parse_options_infill(json::parse(req.body), llama);
|
1573 |
+
|
1574 |
+
if (!llama.loadGrammar())
|
1575 |
+
{
|
1576 |
+
res.status = 400;
|
1577 |
+
return;
|
1578 |
+
}
|
1579 |
+
llama.loadInfill();
|
1580 |
+
llama.beginCompletion();
|
1581 |
+
const auto chunked_content_provider = [&](size_t, DataSink & sink) {
|
1582 |
+
size_t sent_count = 0;
|
1583 |
+
size_t sent_token_probs_index = 0;
|
1584 |
+
|
1585 |
+
while (llama.has_next_token) {
|
1586 |
+
const completion_token_output token_with_probs = llama.doCompletion();
|
1587 |
+
if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
|
1588 |
+
continue;
|
1589 |
+
}
|
1590 |
+
const std::string token_text = llama_token_to_piece(llama.ctx, token_with_probs.tok);
|
1591 |
+
|
1592 |
+
size_t pos = std::min(sent_count, llama.generated_text.size());
|
1593 |
+
|
1594 |
+
const std::string str_test = llama.generated_text.substr(pos);
|
1595 |
+
bool is_stop_full = false;
|
1596 |
+
size_t stop_pos =
|
1597 |
+
llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
|
1598 |
+
if (stop_pos != std::string::npos) {
|
1599 |
+
is_stop_full = true;
|
1600 |
+
llama.generated_text.erase(
|
1601 |
+
llama.generated_text.begin() + pos + stop_pos,
|
1602 |
+
llama.generated_text.end());
|
1603 |
+
pos = std::min(sent_count, llama.generated_text.size());
|
1604 |
+
} else {
|
1605 |
+
is_stop_full = false;
|
1606 |
+
stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
|
1607 |
+
STOP_PARTIAL);
|
1608 |
+
}
|
1609 |
+
|
1610 |
+
if (
|
1611 |
+
stop_pos == std::string::npos ||
|
1612 |
+
// Send rest of the text if we are at the end of the generation
|
1613 |
+
(!llama.has_next_token && !is_stop_full && stop_pos > 0)
|
1614 |
+
) {
|
1615 |
+
const std::string to_send = llama.generated_text.substr(pos, std::string::npos);
|
1616 |
+
|
1617 |
+
sent_count += to_send.size();
|
1618 |
+
|
1619 |
+
std::vector<completion_token_output> probs_output = {};
|
1620 |
+
|
1621 |
+
if (llama.params.sampling_params.n_probs > 0) {
|
1622 |
+
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
1623 |
+
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
1624 |
+
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
1625 |
+
if (probs_pos < probs_stop_pos) {
|
1626 |
+
probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
|
1627 |
+
}
|
1628 |
+
sent_token_probs_index = probs_stop_pos;
|
1629 |
+
}
|
1630 |
+
|
1631 |
+
const json data = format_partial_response(llama, to_send, probs_output);
|
1632 |
+
|
1633 |
+
const std::string str =
|
1634 |
+
"data: " +
|
1635 |
+
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
1636 |
+
"\n\n";
|
1637 |
+
|
1638 |
+
LOG_VERBOSE("data stream", {
|
1639 |
+
{ "to_send", str }
|
1640 |
+
});
|
1641 |
+
|
1642 |
+
if (!sink.write(str.data(), str.size())) {
|
1643 |
+
LOG_VERBOSE("stream closed", {});
|
1644 |
+
llama_print_timings(llama.ctx);
|
1645 |
+
return false;
|
1646 |
+
}
|
1647 |
+
}
|
1648 |
+
|
1649 |
+
if (!llama.has_next_token) {
|
1650 |
+
// Generation is done, send extra information.
|
1651 |
+
const json data = format_final_response(
|
1652 |
+
llama,
|
1653 |
+
"",
|
1654 |
+
std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index)
|
1655 |
+
);
|
1656 |
+
|
1657 |
+
const std::string str =
|
1658 |
+
"data: " +
|
1659 |
+
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
1660 |
+
"\n\n";
|
1661 |
+
|
1662 |
+
LOG_VERBOSE("data stream", {
|
1663 |
+
{ "to_send", str }
|
1664 |
+
});
|
1665 |
+
|
1666 |
+
if (!sink.write(str.data(), str.size())) {
|
1667 |
+
LOG_VERBOSE("stream closed", {});
|
1668 |
+
llama_print_timings(llama.ctx);
|
1669 |
+
return false;
|
1670 |
+
}
|
1671 |
+
}
|
1672 |
+
}
|
1673 |
+
|
1674 |
+
llama_print_timings(llama.ctx);
|
1675 |
+
sink.done();
|
1676 |
+
return true;
|
1677 |
+
};
|
1678 |
+
const auto on_complete = [&](bool) {
|
1679 |
+
llama.mutex.unlock();
|
1680 |
+
};
|
1681 |
+
lock.release();
|
1682 |
+
res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
|
1683 |
+
});
|
1684 |
+
|
1685 |
svr.Get("/model.json", [&llama](const Request &, Response &res)
|
1686 |
{
|
1687 |
const json data = format_generation_settings(llama);
|
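Note: the new /infill route boils down to the token layout assembled by loadInfill above. A sketch of just that layout, using the special-token accessors referenced in the diff (tokenization of the prefix/suffix strings is assumed to have been done already):

```cpp
#include "llama.h"

#include <vector>

// Builds: BOS, FIM-prefix, <prefix tokens>, FIM-suffix, <suffix tokens>, FIM-middle —
// the same order loadInfill produces; the model then generates the "middle" part.
static std::vector<llama_token> build_infill_prompt(llama_context * ctx,
                                                    const std::vector<llama_token> & prefix,
                                                    const std::vector<llama_token> & suffix) {
    std::vector<llama_token> out;
    out.push_back(llama_token_bos(ctx));     // always add BOS
    out.push_back(llama_token_prefix(ctx));  // FIM prefix marker
    out.insert(out.end(), prefix.begin(), prefix.end());
    out.push_back(llama_token_suffix(ctx));  // FIM suffix marker
    out.insert(out.end(), suffix.begin(), suffix.end());
    out.push_back(llama_token_middle(ctx));  // generation continues from here
    return out;
}
```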
examples/speculative/speculative.cpp
CHANGED
@@ -125,6 +125,8 @@ int main(int argc, char ** argv) {
|
|
125 |
grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
126 |
}
|
127 |
|
|
|
|
|
128 |
const auto t_dec_start = ggml_time_us();
|
129 |
|
130 |
while (true) {
|
@@ -134,7 +136,7 @@ int main(int argc, char ** argv) {
|
|
134 |
|
135 |
while (true) {
|
136 |
// sample from the target model
|
137 |
-
llama_token id =
|
138 |
|
139 |
// remember which tokens were sampled - used for repetition penalties during sampling
|
140 |
last_tokens.erase(last_tokens.begin());
|
@@ -172,7 +174,7 @@ int main(int argc, char ** argv) {
|
|
172 |
LOG("out of drafted tokens\n");
|
173 |
}
|
174 |
|
175 |
-
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft,
|
176 |
llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
|
177 |
++n_past_dft;
|
178 |
|
@@ -211,7 +213,13 @@ int main(int argc, char ** argv) {
|
|
211 |
if (grammar_dft) {
|
212 |
llama_grammar_free(grammar_dft);
|
213 |
}
|
214 |
-
|
|
|
|
|
215 |
|
216 |
LOG("copied target grammar to draft grammar\n");
|
217 |
}
|
@@ -257,7 +265,7 @@ int main(int argc, char ** argv) {
|
|
257 |
}
|
258 |
|
259 |
// evaluate the drafted token on the draft model
|
260 |
-
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur,
|
261 |
llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0));
|
262 |
++n_past_cur;
|
263 |
|
@@ -267,7 +275,7 @@ int main(int argc, char ** argv) {
|
|
267 |
}
|
268 |
|
269 |
// evaluate the target model on the drafted tokens
|
270 |
-
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt,
|
271 |
llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
|
272 |
++n_past_tgt;
|
273 |
|
|
|
125 |
grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
126 |
}
|
127 |
|
128 |
+
llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt);
|
129 |
+
|
130 |
const auto t_dec_start = ggml_time_us();
|
131 |
|
132 |
while (true) {
|
|
|
136 |
|
137 |
while (true) {
|
138 |
// sample from the target model
|
139 |
+
llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft);
|
140 |
|
141 |
// remember which tokens were sampled - used for repetition penalties during sampling
|
142 |
last_tokens.erase(last_tokens.begin());
|
|
|
174 |
LOG("out of drafted tokens\n");
|
175 |
}
|
176 |
|
177 |
+
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
178 |
llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
|
179 |
++n_past_dft;
|
180 |
|
|
|
213 |
if (grammar_dft) {
|
214 |
llama_grammar_free(grammar_dft);
|
215 |
}
|
216 |
+
// Note: Hardcoded to sequence id 0, if this ever supports parallel generation
|
217 |
+
// that will need to change.
|
218 |
+
auto it = ctx_sampling.sequence_contexts.find(0);
|
219 |
+
GGML_ASSERT(it != ctx_sampling.sequence_contexts.end());
|
220 |
+
// This is necessary because each sequence id in sequence_contexts
|
221 |
+
// uses a copy of the original grammar.
|
222 |
+
grammar_dft = llama_grammar_copy(it->second.grammar);
|
223 |
|
224 |
LOG("copied target grammar to draft grammar\n");
|
225 |
}
|
|
|
265 |
}
|
266 |
|
267 |
// evaluate the drafted token on the draft model
|
268 |
+
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_cur, -1);
|
269 |
llama_decode(ctx_dft, llama_batch_get_one(&drafted.back(), 1, n_past_cur, 0));
|
270 |
++n_past_cur;
|
271 |
|
|
|
275 |
}
|
276 |
|
277 |
// evaluate the target model on the drafted tokens
|
278 |
+
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past_tgt, -1);
|
279 |
llama_decode(ctx_tgt, llama_batch_get_one(drafted.data(), drafted.size(), n_past_tgt, 0));
|
280 |
++n_past_tgt;
|
281 |
|
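For context on the llama_kv_cache_seq_rm() changes above: the call removes a sequence's cached cells in the position range [p0, p1), and a negative bound is treated as open-ended in llama.h (this interpretation of the -1 argument is an assumption from the public API, not stated in the diff). A minimal sketch of the rollback pattern the hunks use, with the surrounding variable names taken from speculative.cpp:

    // roll the draft model's KV cache back to n_past_dft, then re-decode one token
    llama_kv_cache_seq_rm(ctx_dft, /*seq_id=*/0, /*p0=*/n_past_dft, /*p1=*/-1);
    llama_decode(ctx_dft, llama_batch_get_one(&id, 1, n_past_dft, 0));
    ++n_past_dft;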
ggml-alloc.c
CHANGED
@@ -1,4 +1,5 @@
 #include "ggml-alloc.h"
+#include "ggml-backend.h"
 #include "ggml.h"
 #include <assert.h>
 #include <stdarg.h>
@@ -6,25 +7,6 @@
 #include <stdlib.h>
 #include <string.h>
 
-#ifdef __has_include
-#if __has_include(<unistd.h>)
-#include <unistd.h>
-#if defined(_POSIX_MAPPED_FILES)
-#include <sys/types.h>
-#include <sys/mman.h>
-#endif
-#endif
-#endif
-
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <memoryapi.h>
-#endif
-
 
 #define UNUSED(x) (void)(x)
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -80,8 +62,9 @@ struct free_block {
 #define MAX_FREE_BLOCKS 256
 
 struct ggml_allocr {
+    struct ggml_backend_buffer * buffer;
+    bool buffer_owned;
     void * data;
-    size_t size;
     size_t alignment;
     int n_free_blocks;
     struct free_block free_blocks[MAX_FREE_BLOCKS];
@@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
 }
 #endif
 
-static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    return ggml_nbytes(tensor);
-
-    UNUSED(alloc);
-}
-
 // check if a tensor is allocated by this buffer
 static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
-
-    return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+    return tensor->buffer == alloc->buffer;
 }
 
 static bool ggml_is_view(struct ggml_tensor * t) {
@@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) {
 }
 
 void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-#ifdef GGML_ALLOCATOR_DEBUG
     GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
     GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-
-    size_t size =
+
+    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
 
     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
@@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
     tensor->data = addr;
     AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
+    tensor->buffer = alloc->buffer;
+    ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     add_allocated_tensor(alloc, tensor);
@@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
 
 // this is a very naive implementation, but for our case the number of free blocks should be very small
 static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-    void * ptr = tensor->data;
-
     if (ggml_allocr_is_own(alloc, tensor) == false) {
         // the tensor was not allocated in this buffer
         // this can happen because the graph allocator will try to free weights and other tensors from different buffers
         // the easiest way to deal with this is just to ignore it
+        AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
         return;
     }
 
-
+    void * ptr = tensor->data;
+
+    size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
     size = aligned_offset(NULL, size, alloc->alignment);
     AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
-
+
+    ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
 
 #ifdef GGML_ALLOCATOR_DEBUG
     remove_allocated_tensor(alloc, tensor);
@@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) {
     alloc->n_free_blocks = 1;
     size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
     alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
-    alloc->free_blocks[0].size = alloc->
+    alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
 }
 
 struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
-    struct
+    struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
     *alloc = (struct ggml_allocr){
-        /*.
-        /*.
+        /*.buffer        = */ buffer,
+        /*.buffer_owned  = */ true,
+        /*.base          = */ ggml_backend_buffer_get_base(buffer),
         /*.alignment     = */ alignment,
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
@@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
     return alloc;
 }
 
-// OS specific functions to allocate and free uncommitted virtual memory
-static void * alloc_vmem(size_t size) {
-#if defined(_WIN32)
-    return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
-#elif defined(_POSIX_MAPPED_FILES)
-    void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
-    if (ptr == MAP_FAILED) {
-        return NULL;
-    }
-    return ptr;
-#else
-    // use a fixed address for other platforms
-    uintptr_t base_addr = (uintptr_t)-size - 0x100;
-    return (void *)base_addr;
-#endif
-}
-
-static void free_vmem(void * base_addr, size_t size) {
-#if defined(_WIN32)
-    VirtualFree(base_addr, 0, MEM_RELEASE);
-    UNUSED(size);
-#elif defined(_POSIX_MAPPED_FILES)
-    munmap(base_addr, size);
-#else
-    // nothing to do
-    UNUSED(base_addr);
-    UNUSED(size);
-#endif
-}
-
-// allocate uncommitted virtual memory to measure the size of the graph
-static void alloc_measure_vmem(void ** base_addr, size_t * size) {
-    // 128GB for 64-bit, 1GB for 32-bit
-    *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
-    do {
-        *base_addr = alloc_vmem(*size);
-        if (*base_addr != NULL) {
-            AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
-            return;
-        }
-        // try again with half the size
-        *size /= 2;
-    } while (*size > 0);
-
-    GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
-}
-
-static void free_measure_vmem(void * base_addr, size_t size) {
-    free_vmem(base_addr, size);
-}
-
 struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
-    struct ggml_allocr * alloc = (
-
-
-
+    struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment);
+    alloc->measure = true;
+
+    return alloc;
+}
+
+struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr));
 
     *alloc = (struct ggml_allocr){
-        /*.
-        /*.
-        /*.
+        /*.buffer        = */ buffer,
+        /*.buffer_owned  = */ false,
+        /*.base          = */ ggml_backend_buffer_get_base(buffer),
+        /*.alignment     = */ ggml_backend_buffer_get_alignment(buffer),
         /*.n_free_blocks = */ 0,
         /*.free_blocks   = */ {{0}},
         /*.hash_table    = */ {{0}},
         /*.max_size      = */ 0,
-        /*.measure       = */
+        /*.measure       = */ false,
         /*.parse_seq     = */ {0},
         /*.parse_seq_len = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
@@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
 }
 
 void ggml_allocr_free(struct ggml_allocr * alloc) {
-    if (alloc->
-
+    if (alloc->buffer_owned) {
+        ggml_backend_buffer_free(alloc->buffer);
     }
     free(alloc);
 }
@@ -437,7 +371,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
         case GGML_OP_ROPE:
         case GGML_OP_RMS_NORM:
         case GGML_OP_SOFT_MAX:
-        case GGML_OP_CONT:
             return true;
 
         default:
@@ -445,12 +378,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
     }
 }
 
+static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+    assert(view->view_src != NULL && view->view_src->data != NULL);
+    view->backend = view->view_src->backend;
+    view->buffer  = view->view_src->buffer;
+    view->data    = (char *)view->view_src->data + view->view_offs;
+
+    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
+    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
+    assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+    ggml_backend_buffer_init_tensor(alloc->buffer, view);
+}
+
 static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
     struct hash_node * ht = alloc->hash_table;
     if (node->data == NULL) {
         if (ggml_is_view(node)) {
-
-            node->data = (char *)node->view_src->data + node->view_offs;
+            init_view(alloc, node);
         } else {
             // see if we can reuse a parent's buffer (inplace)
             if (ggml_op_can_inplace(node->op)) {
@@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                         // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
                         // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
                         AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                        node->
+                        node->view_src = view_src;
+                        view_src_hn->n_views += 1;
+                        init_view(alloc, node);
                         return;
                     }
                 }
                 else {
                     AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
-                    node->
+                    node->view_src = parent;
+                    p_hn->n_views += 1;
+                    init_view(alloc, node);
                     return;
                 }
             }
@@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
     }
 }
 
-
+size_t ggml_allocr_alloc_graph_n(
     struct ggml_allocr * alloc,
     struct ggml_cgraph ** graphs, int n_graphs,
     struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
             if (ggml_is_view(node)) {
                 struct ggml_tensor * view_src = node->view_src;
                 hash_get(ht, view_src)->n_views += 1;
+                if (node->buffer == NULL && node->data != NULL) {
+                    // view of a pre-allocated tensor, didn't call init_view() yet
+                    init_view(alloc, node);
+                }
             }
 
             for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
                     break;
                 }
                 hash_get(ht, parent)->n_children += 1;
+                if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
+                    init_view(alloc, parent);
+                }
             }
         }
     }
@@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
 }
 
 size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
-    return
+    return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
 }
 
 size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
ggml-alloc.h
CHANGED
@@ -6,21 +6,27 @@
 extern "C" {
 #endif
 
+struct ggml_backend_buffer;
 
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
 
 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
 GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
 
-GGML_API void   ggml_allocr_free(struct ggml_allocr * alloc);
-GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);
-GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);
-GGML_API void   ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+GGML_API void   ggml_allocr_free       (struct ggml_allocr * alloc);
+GGML_API bool   ggml_allocr_is_measure (struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_reset      (struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_alloc      (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
 GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
-GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
+GGML_API size_t ggml_allocr_max_size   (struct ggml_allocr * alloc);
 
+GGML_API size_t ggml_allocr_alloc_graph_n(
+                    struct ggml_allocr * alloc,
+                    struct ggml_cgraph ** graphs, int n_graphs,
+                    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
 
 #ifdef __cplusplus
 }
ggml-backend.c
ADDED
@@ -0,0 +1,385 @@
+#include "ggml-backend.h"
+#include "ggml-alloc.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define UNUSED GGML_UNUSED
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// backend buffer
+
+ggml_backend_buffer_t ggml_backend_buffer_init(
+        struct ggml_backend * backend,
+        struct ggml_backend_buffer_i iface,
+        ggml_backend_buffer_context_t context,
+        size_t size) {
+    ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
+
+    GGML_ASSERT(iface.get_base != NULL);
+
+    (*buffer) = (struct ggml_backend_buffer) {
+        /* .interface = */ iface,
+        /* .backend   = */ backend,
+        /* .context   = */ context,
+        /* .size      = */ size,
+    };
+
+    return buffer;
+}
+
+void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
+    if (buffer->iface.free_buffer != NULL) {
+        buffer->iface.free_buffer(buffer);
+    }
+    free(buffer);
+}
+
+size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
+    return ggml_backend_get_alignment(buffer->backend);
+}
+
+void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_base(buffer);
+}
+
+size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
+    return buffer->size;
+}
+
+size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    if (buffer->iface.get_alloc_size) {
+        return buffer->iface.get_alloc_size(buffer, tensor);
+    }
+    return ggml_nbytes(tensor);
+}
+
+void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    if (buffer->iface.init_tensor) {
+        buffer->iface.init_tensor(buffer, tensor);
+    }
+}
+
+void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    if (buffer->iface.free_tensor) {
+        buffer->iface.free_tensor(buffer, tensor);
+    }
+}
+
+// backend
+
+ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
+    return tensor->buffer->backend;
+}
+
+const char * ggml_backend_name(ggml_backend_t backend) {
+    return backend->iface.get_name(backend);
+}
+
+void ggml_backend_free(ggml_backend_t backend) {
+    backend->iface.free(backend);
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
+    return backend->iface.alloc_buffer(backend, size);
+}
+
+size_t ggml_backend_get_alignment(ggml_backend_t backend) {
+    return backend->iface.get_alignment(backend);
+}
+
+void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+}
+
+void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+}
+
+void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
+}
+
+void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+    ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor));
+}
+
+void ggml_backend_synchronize(ggml_backend_t backend) {
+    backend->iface.synchronize(backend);
+}
+
+ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    return backend->iface.graph_plan_create(backend, cgraph);
+}
+
+void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    backend->iface.graph_plan_free(backend, plan);
+}
+
+void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    backend->iface.graph_plan_compute(backend, plan);
+}
+
+void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    backend->iface.graph_compute(backend, cgraph);
+}
+
+bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    return backend->iface.supports_op(backend, op);
+}
+
+// backend copy
+
+static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+    if (a->type != b->type) {
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (a->ne[i] != b->ne[i]) {
+            return false;
+        }
+        if (a->nb[i] != b->nb[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
+    //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
+    //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
+    GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
+
+    // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
+
+    if (src == dst) {
+        return;
+    }
+
+    // TODO: allow backends to support copy to/from same backend
+
+    if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) {
+        ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst);
+    } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) {
+        ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst);
+    } else {
+        // shouldn't be hit when copying from/to CPU
+        #ifndef NDEBUG
+        fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend));
+        #endif
+        size_t nbytes = ggml_nbytes(src);
+        void * data = malloc(nbytes);
+        ggml_backend_tensor_get(src, data, 0, nbytes);
+        ggml_backend_tensor_set(dst, data, 0, nbytes);
+        free(data);
+    }
+}
+
+// backend CPU
+
+struct ggml_backend_cpu_context {
+    int n_threads;
+    void * work_data;
+    size_t work_size;
+};
+
+static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+    return "CPU";
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_free(ggml_backend_t backend) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+    free(cpu_ctx->work_data);
+    free(cpu_ctx);
+    free(backend);
+}
+
+static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return (void *)buffer->context;
+}
+
+static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    free(buffer->context);
+    UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
+    /* .free_buffer    = */ ggml_backend_cpu_buffer_free_buffer,
+    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
+    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+    /* .init_tensor    = */ NULL, // no initialization required
+    /* .free_tensor    = */ NULL, // no cleanup required
+};
+
+// for buffers from ptr, free is not called
+static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
+    /* .free_buffer    = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+    /* .get_base       = */ ggml_backend_cpu_buffer_get_base,
+    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+    /* .init_tensor    = */ NULL,
+    /* .free_tensor    = */ NULL,
+};
+
+static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
+
+static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
+    size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
+    void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
+
+    return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
+}
+
+static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) {
+    return TENSOR_ALIGNMENT;
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy((char *)tensor->data + offset, data, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_synchronize(ggml_backend_t backend) {
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
+    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+struct ggml_backend_plan_cpu {
+    struct ggml_cplan cplan;
+    struct ggml_cgraph cgraph;
+};
+
+static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
+
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    cpu_plan->cgraph = *cgraph;
+
+    if (cpu_plan->cplan.work_size > 0) {
+        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+    }
+
+    return cpu_plan;
+}
+
+static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    free(cpu_plan->cplan.work_data);
+    free(cpu_plan);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+    ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+
+    if (cpu_ctx->work_size < cplan.work_size) {
+        // TODO: may be faster to free and use malloc to avoid the copy
+        cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
+        cpu_ctx->work_size = cplan.work_size;
+    }
+
+    cplan.work_data = cpu_ctx->work_data;
+
+    ggml_graph_compute(cgraph, &cplan);
+}
+
+static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    return true;
+    UNUSED(backend);
+    UNUSED(op);
+}
+
+static struct ggml_backend_i cpu_backend_i = {
+    /* .get_name            = */ ggml_backend_cpu_name,
+    /* .free                = */ ggml_backend_cpu_free,
+    /* .alloc_buffer        = */ ggml_backend_cpu_alloc_buffer,
+    /* .get_alignment       = */ ggml_backend_cpu_get_alignment,
+    /* .set_tensor_async    = */ ggml_backend_cpu_set_tensor_async,
+    /* .get_tensor_async    = */ ggml_backend_cpu_get_tensor_async,
+    /* .synchronize         = */ ggml_backend_cpu_synchronize,
+    /* .cpy_tensor_from     = */ ggml_backend_cpu_cpy_tensor_from,
+    /* .cpy_tensor_to       = */ ggml_backend_cpu_cpy_tensor_to,
+    /* .graph_plan_create   = */ ggml_backend_cpu_graph_plan_create,
+    /* .graph_plan_free     = */ ggml_backend_cpu_graph_plan_free,
+    /* .graph_plan_compute  = */ ggml_backend_cpu_graph_plan_compute,
+    /* .graph_compute       = */ ggml_backend_cpu_graph_compute,
+    /* .supports_op         = */ ggml_backend_cpu_supports_op,
+};
+
+ggml_backend_t ggml_backend_cpu_init(void) {
+    struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+
+    ctx->n_threads = GGML_DEFAULT_N_THREADS;
+    ctx->work_data = NULL;
+    ctx->work_size = 0;
+
+    ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+
+    *cpu_backend = (struct ggml_backend) {
+        /* .interface = */ cpu_backend_i,
+        /* .context   = */ ctx
+    };
+    return cpu_backend;
+}
+
+bool ggml_backend_is_cpu(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_cpu_name;
+}
+
+void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+    ctx->n_threads = n_threads;
+}
+
+ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
+    return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
+}
ggml-backend.h
ADDED
@@ -0,0 +1,143 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+    struct ggml_backend;
+    struct ggml_backend_buffer;
+
+    // type-erased backend-specific types / wrappers
+    typedef void * ggml_backend_context_t;
+    typedef void * ggml_backend_graph_plan_t;
+    typedef void * ggml_backend_buffer_context_t;
+
+    // avoid accessing internals of these types
+    typedef struct ggml_backend        * ggml_backend_t;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+
+    //
+    // backend buffer
+    //
+
+    struct ggml_backend_buffer_i {
+        void   (*free_buffer)   (ggml_backend_buffer_t buffer);
+        void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
+        size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
+        void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
+        void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
+    };
+
+    // TODO: hide behind API
+    struct ggml_backend_buffer {
+        struct ggml_backend_buffer_i iface;
+
+        ggml_backend_t                backend;
+        ggml_backend_buffer_context_t context;
+
+        size_t size;
+    };
+
+    // backend buffer functions
+    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
+            struct ggml_backend           * backend,
+            struct ggml_backend_buffer_i    iface,
+            ggml_backend_buffer_context_t   context,
+            size_t                          size);
+
+    GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+
+    //
+    // backend
+    //
+
+    struct ggml_backend_i {
+        const char * (*get_name)(ggml_backend_t backend);
+
+        void (*free)(ggml_backend_t backend);
+
+        // buffer allocation
+        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
+
+        // get buffer alignment
+        size_t (*get_alignment)(ggml_backend_t backend);
+
+        // tensor data access
+        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
+        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        void (*synchronize)     (ggml_backend_t backend);
+
+        // (optional) copy tensor between different backends, allow for single-copy tranfers
+        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+        // compute graph with a plan
+        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+        // compute graph without a plan
+        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+        // check if the backend supports an operation
+        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+    };
+
+    // TODO: hide behind API
+    struct ggml_backend {
+        struct ggml_backend_i iface;
+
+        ggml_backend_context_t context;
+    };
+
+    // backend helper functions
+    GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
+
+    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
+    GGML_API void         ggml_backend_free(ggml_backend_t backend);
+
+    GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+
+    GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
+
+    GGML_API void ggml_backend_tensor_set_async(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
+
+    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);
+
+    // tensor copy between different backends
+    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    //
+    // CPU backend
+    //
+
+    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+
+    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
+
+    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+
+    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
+
+#ifdef  __cplusplus
+}
+#endif
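The header above defines the intended call pattern for the new CPU backend. Below is a hedged usage sketch that uses only functions declared in this file; the buffer size and thread count are placeholders, and tensor/graph construction through the regular ggml context API is elided:

    #include "ggml.h"
    #include "ggml-backend.h"

    static void backend_demo(void) {
        ggml_backend_t backend = ggml_backend_cpu_init();
        ggml_backend_cpu_set_n_threads(backend, 4); // placeholder thread count

        // tensors allocated out of this buffer carry tensor->buffer, so the
        // synchronous helpers ggml_backend_tensor_set()/ggml_backend_tensor_get()
        // can read and write their data
        ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1024*1024);

        // ... allocate tensors from `buffer` (e.g. via ggml_allocr_new_from_buffer),
        // build a ggml_cgraph, and run it with ggml_backend_graph_compute(backend, gf);

        ggml_backend_buffer_free(buffer);
        ggml_backend_free(backend);
    }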
ggml-cuda.cu
CHANGED
@@ -62,6 +62,7 @@
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -414,11 +415,13 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
@@ -1574,6 +1577,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@@ -4555,6 +4586,24 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale
     dst[i] = scale * x[i];
 }
 
 static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
@@ -5436,6 +5485,11 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }
 
 template<typename T>
 static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
                       const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
@@ -5699,7 +5753,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
         GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
-
         int id;
         CUDA_CHECK(cudaGetDevice(&id));
         src_ptr = (char *) extra->data_device[id];
@@ -5735,6 +5789,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     }
 }
 
 inline void ggml_cuda_op_add(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@@ -6275,12 +6430,12 @@ inline void ggml_cuda_op_alibi(
     const int64_t ne02 = src0->ne[2];
     const int64_t nrows = ggml_nrows(src0);
 
-    const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
-    GGML_ASSERT(ne01 + n_past == ne00);
     GGML_ASSERT(n_head == ne02);
 
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -6339,7 +6494,14 @@ inline void ggml_cuda_op_scale(
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());
@@ -6349,6 +6511,24 @@ inline void ggml_cuda_op_scale(
     (void) src1_dd;
 }
 
 static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
     const int64_t nrows0 = ggml_nrows(src0);
@@ -6358,9 +6538,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(             dst->backend != GGML_BACKEND_GPU_SPLIT);
 
-
-
-
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
@@ -6501,9 +6681,9 @@ static void ggml_cuda_op_mul_mat(
     const size_t q8_1_ts = sizeof(block_q8_1);
     const size_t q8_1_bs = QK8_1;
 
-
-
-
 
     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
@@ -6581,7 +6761,7 @@ static void ggml_cuda_op_mul_mat(
         if (convert_src1_to_q8_1) {
             src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
 
-            if (
                 quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
@@ -6663,7 +6843,7 @@ static void ggml_cuda_op_mul_mat(
                 GGML_ASSERT(false);
             }
 
-            if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
                 quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
@@ -6754,6 +6934,14 @@ static void ggml_cuda_op_mul_mat(
         }
     }
 
 static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
@@ -6808,13 +6996,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-
     void * src0_ddq = src0_extra->data_device[g_main_device];
 
-
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 
-
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
@@ -6839,13 +7027,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-
     void * src0_ddq = src0_extra->data_device[g_main_device];
 
-
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
 
-
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
     const int64_t row_stride_x = nb01 / sizeof(half);
@@ -6866,11 +7054,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
-    if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    }else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
@@ -6902,6 +7090,10 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }
 
 static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
@@ -6931,8 +7123,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-    const
-    const
 
     char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
@@ -6987,8 +7179,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 
     const size_t nb1 = tensor->nb[1];
 
-
-
     memset(extra, 0, sizeof(*extra));
 
     for (int64_t id = 0; id < g_device_count; ++id) {
@@ -7042,7 +7234,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
         CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
     }
 
-
     CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
 
     extra->data_device[id] = buf;
@@ -7081,17 +7272,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
     delete extra;
 }
 
-static
 static size_t g_temp_tensor_extra_index = 0;
 
-static
     if (g_temp_tensor_extras == nullptr) {
         g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
     }
 
     size_t alloc_index = g_temp_tensor_extra_index;
     g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
-
     memset(extra, 0, sizeof(*extra));
 
     return extra;
@@ -7119,7 +7310,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
         return;
     }
 
-
 
     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||
@@ -7128,7 +7319,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
 
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t offset = 0;
|
7134 |
if (tensor->op == GGML_OP_VIEW) {
|
@@ -7137,7 +7328,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra
|
|
7137 |
extra = ggml_cuda_alloc_temp_tensor_extra();
|
7138 |
extra->data_device[g_main_device] = src0_ddc + offset;
|
7139 |
} else if (tensor->op == GGML_OP_CPY) {
|
7140 |
-
|
7141 |
void * src1_ddv = src1_extra->data_device[g_main_device];
|
7142 |
extra = ggml_cuda_alloc_temp_tensor_extra();
|
7143 |
extra->data_device[g_main_device] = src1_ddv;
|
@@ -7179,13 +7370,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset)
|
|
7179 |
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
|
7180 |
}
|
7181 |
|
7182 |
-
|
7183 |
|
7184 |
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
|
7185 |
tensor->op == GGML_OP_VIEW;
|
7186 |
|
7187 |
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
|
7188 |
-
|
7189 |
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
|
7190 |
size_t view_offset = 0;
|
7191 |
if (tensor->op == GGML_OP_VIEW) {
|
@@ -7203,7 +7394,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
|
|
7203 |
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
7204 |
GGML_ASSERT(ggml_is_contiguous(tensor));
|
7205 |
|
7206 |
-
|
7207 |
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
7208 |
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
|
7209 |
}
|
@@ -7260,58 +7451,47 @@ void ggml_cuda_free_scratch() {
|
|
7260 |
g_scratch_buffer = nullptr;
|
7261 |
}
|
7262 |
|
7263 |
-
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
|
7264 |
ggml_cuda_func_t func;
|
7265 |
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
7266 |
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
7267 |
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
|
7268 |
|
|
|
|
|
|
|
|
|
7269 |
switch (tensor->op) {
|
|
|
|
|
|
|
|
|
|
|
|
|
7270 |
case GGML_OP_DUP:
|
7271 |
-
if (!any_on_device) {
|
7272 |
-
return false;
|
7273 |
-
}
|
7274 |
func = ggml_cuda_dup;
|
7275 |
break;
|
7276 |
case GGML_OP_ADD:
|
7277 |
-
if (!any_on_device) {
|
7278 |
-
return false;
|
7279 |
-
}
|
7280 |
func = ggml_cuda_add;
|
7281 |
break;
|
7282 |
case GGML_OP_MUL:
|
7283 |
-
if (!any_on_device) {
|
7284 |
-
return false;
|
7285 |
-
}
|
7286 |
func = ggml_cuda_mul;
|
7287 |
break;
|
7288 |
case GGML_OP_UNARY:
|
7289 |
switch (ggml_get_unary_op(tensor)) {
|
7290 |
case GGML_UNARY_OP_GELU:
|
7291 |
-
if (!any_on_device) {
|
7292 |
-
return false;
|
7293 |
-
}
|
7294 |
func = ggml_cuda_gelu;
|
7295 |
break;
|
7296 |
case GGML_UNARY_OP_SILU:
|
7297 |
-
if (!any_on_device) {
|
7298 |
-
return false;
|
7299 |
-
}
|
7300 |
func = ggml_cuda_silu;
|
7301 |
break;
|
7302 |
default:
|
7303 |
return false;
|
7304 |
} break;
|
7305 |
case GGML_OP_NORM:
|
7306 |
-
if (!any_on_device) {
|
7307 |
-
return false;
|
7308 |
-
}
|
7309 |
func = ggml_cuda_norm;
|
7310 |
break;
|
7311 |
case GGML_OP_RMS_NORM:
|
7312 |
-
if (!any_on_device) {
|
7313 |
-
return false;
|
7314 |
-
}
|
7315 |
func = ggml_cuda_rms_norm;
|
7316 |
break;
|
7317 |
case GGML_OP_MUL_MAT:
|
@@ -7321,54 +7501,36 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|
7321 |
func = ggml_cuda_mul_mat;
|
7322 |
break;
|
7323 |
case GGML_OP_SCALE:
|
7324 |
-
if (!any_on_device) {
|
7325 |
-
return false;
|
7326 |
-
}
|
7327 |
func = ggml_cuda_scale;
|
7328 |
break;
|
7329 |
-
case
|
7330 |
if (!any_on_device) {
|
7331 |
return false;
|
7332 |
}
|
|
|
|
|
|
|
7333 |
func = ggml_cuda_cpy;
|
7334 |
break;
|
7335 |
case GGML_OP_CONT:
|
7336 |
-
if (!any_on_device) {
|
7337 |
-
return false;
|
7338 |
-
}
|
7339 |
func = ggml_cuda_dup;
|
7340 |
break;
|
7341 |
case GGML_OP_RESHAPE:
|
7342 |
case GGML_OP_VIEW:
|
7343 |
case GGML_OP_PERMUTE:
|
7344 |
case GGML_OP_TRANSPOSE:
|
7345 |
-
if (!any_on_device) {
|
7346 |
-
return false;
|
7347 |
-
}
|
7348 |
func = ggml_cuda_nop;
|
7349 |
break;
|
7350 |
case GGML_OP_DIAG_MASK_INF:
|
7351 |
-
if (!any_on_device) {
|
7352 |
-
return false;
|
7353 |
-
}
|
7354 |
func = ggml_cuda_diag_mask_inf;
|
7355 |
break;
|
7356 |
case GGML_OP_SOFT_MAX:
|
7357 |
-
if (!any_on_device) {
|
7358 |
-
return false;
|
7359 |
-
}
|
7360 |
func = ggml_cuda_soft_max;
|
7361 |
break;
|
7362 |
case GGML_OP_ROPE:
|
7363 |
-
if (!any_on_device) {
|
7364 |
-
return false;
|
7365 |
-
}
|
7366 |
func = ggml_cuda_rope;
|
7367 |
break;
|
7368 |
case GGML_OP_ALIBI:
|
7369 |
-
if (!any_on_device) {
|
7370 |
-
return false;
|
7371 |
-
}
|
7372 |
func = ggml_cuda_alibi;
|
7373 |
break;
|
7374 |
default:
|
@@ -7396,3 +7558,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
|
|
7396 |
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
|
7397 |
snprintf(description, description_size, "%s", prop.name);
|
7398 |
}
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags

 #define CUDA_SILU_BLOCK_SIZE 256
 #define CUDA_CPY_BLOCK_SIZE 32
 #define CUDA_SCALE_BLOCK_SIZE 256
+#define CUDA_CLAMP_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_ALIBI_BLOCK_SIZE 32
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
 #define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+#define CUDA_GET_ROWS_BLOCK_SIZE 256

 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X

     reinterpret_cast<half&>(y[ib].ds.y) = sum;
 }

+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
+    const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+    const int row = blockDim.y*blockIdx.y + threadIdx.y;
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int r = y[row];
+
+    // copy x[r*ncols + col] to dst[row*ncols + col]
+    const int xi = r*ncols + col;
+    const int di = row*ncols + col;
+
+    const int ib = xi/qk; // block index
+    const int iqs = (xi%qk)/qr; // quant index
+    const int iybs = di - di%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(x, ib, iqs, v);
+
+    dst[iybs + iqs + 0] = v.x;
+    dst[iybs + iqs + y_offset] = v.y;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;

     dst[i] = scale * x[i];
 }

+static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+}
+
+template<int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
+    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+    const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(block_num_x, nrows, 1);
+    k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
+}
+
 static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
     const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
     add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);

     scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, k);
 }

+static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
+}
+
 template<typename T>
 static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale,
     const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
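The launchers added above all size their grids with the same ceiling-division idiom, e.g. `(k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE`; `get_rows_cuda` divides by twice the block size because each `k_get_rows` thread writes two elements. A minimal C++ sketch of that arithmetic follows; it is illustration only, and the helper name `ceil_div` is hypothetical, not something defined in this diff.

#include <cstdio>

// Hypothetical helper: same arithmetic as the CUDA launchers above.
static int ceil_div(int n, int block) {
    return (n + block - 1) / block;
}

int main() {
    printf("%d\n", ceil_div(1000, 256)); // 4 blocks cover 1000 elements at 256 threads each
    printf("%d\n", ceil_div(1024, 256)); // exactly 4 blocks, no remainder thread wasted
    printf("%d\n", ceil_div(1000, 2*256)); // get_rows-style sizing: 2 elements per thread -> 2 blocks
    return 0;
}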
     } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
         GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = cudaMemcpyDeviceToDevice;
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
         CUDA_CHECK(cudaGetDevice(&id));
         src_ptr = (char *) extra->data_device[id];

     }
 }

+static void ggml_cuda_op_repeat(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
+
+    const int nr0 = (int)(ne0/ne00);
+    const int nr1 = (int)(ne1/ne01);
+    const int nr2 = (int)(ne2/ne02);
+    const int nr3 = (int)(ne3/ne03);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+
+    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
+    for (int i3 = 0; i3 < nr3; i3++) {
+        for (int k3 = 0; k3 < ne03; k3++) {
+            for (int i2 = 0; i2 < nr2; i2++) {
+                for (int k2 = 0; k2 < ne02; k2++) {
+                    for (int i1 = 0; i1 < nr1; i1++) {
+                        for (int k1 = 0; k1 < ne01; k1++) {
+                            for (int i0 = 0; i0 < nr0; i0++) {
+                                CUDA_CHECK(cudaMemcpyAsync(
+                                    (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
+                                    (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
+                                    ne00*nb0, cudaMemcpyDeviceToDevice, stream));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    (void) src1;
+    (void) src1_d;
+}
+
+static void ggml_cuda_op_get_rows(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    const int ncols = src0->ne[0];
+    const int nrows = ggml_nelements(src1);
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+            break;
+        default:
+            // TODO: k-quants
+            GGML_ASSERT(false);
+            break;
+    }
+}
+
 inline void ggml_cuda_op_add(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
     const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
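For reference only (not part of the commit), the operation implemented by ggml_cuda_op_get_rows above is a row gather: each int32 entry of src1 selects one row of src0, which is dequantized into a float row of dst. A small CPU sketch in C++, assuming the rows are already plain floats and laid out row-major with ncols elements per row:

#include <cstdint>
#include <vector>

// Reference gather: dst row r is a copy of src row idx[r].
// Quantized source types would additionally dequantize each element.
static void get_rows_reference(const std::vector<float> & src, const std::vector<int32_t> & idx,
                               std::vector<float> & dst, int ncols) {
    dst.resize(idx.size() * static_cast<size_t>(ncols));
    for (size_t r = 0; r < idx.size(); ++r) {
        for (int c = 0; c < ncols; ++c) {
            dst[r*ncols + c] = src[static_cast<size_t>(idx[r])*ncols + c];
        }
    }
}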
     const int64_t ne02 = src0->ne[2];
     const int64_t nrows = ggml_nrows(src0);

+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

+    //GGML_ASSERT(ne01 + n_past == ne00);
     GGML_ASSERT(n_head == ne02);

     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));

     GGML_ASSERT(src1->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

+    float scale;
+    // HACK: support for ggml backend interface
+    if (src1->backend == GGML_BACKEND_CPU) {
+        scale = ((float *) src1->data)[0];
+    } else {
+        // TODO: pass pointer to kernel instead of copying to host
+        CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
+    }

     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     CUDA_CHECK(cudaGetLastError());

     (void) src1_dd;
 }

+inline void ggml_cuda_op_clamp(
+    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+    const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const float min = ((float *) dst->op_params)[0];
+    const float max = ((float *) dst->op_params)[1];
+
+    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+    CUDA_CHECK(cudaGetLastError());
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
 static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) {
     const int64_t nrows0 = ggml_nrows(src0);

     GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);

+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;

     const size_t q8_1_ts = sizeof(block_q8_1);
     const size_t q8_1_bs = QK8_1;

+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

     const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);

     if (convert_src1_to_q8_1) {
         src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);

+        if (src1_on_device && src1_is_contiguous) {
             quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
             CUDA_CHECK(cudaGetLastError());
         }

         GGML_ASSERT(false);
     }

+    if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
         quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
         CUDA_CHECK(cudaGetLastError());
     }

     }
 }

+static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
+}
+
+static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
+}
+
 static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];

+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);

     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device];

+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     float * src1_ddf = (float *) src1_extra->data_device[g_main_device];

+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];

     const int64_t row_stride_x = nb01 / sizeof(half);

     }
 }

+    if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
     } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
+    } else if (src0->type == GGML_TYPE_F32) {
         ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {

     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
 }

+static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
+}
+
 static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));

     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

+    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;

     char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device];

     const size_t nb1 = tensor->nb[1];

+    ggml_backend_type backend = tensor->backend;
+    ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
     memset(extra, 0, sizeof(*extra));

     for (int64_t id = 0; id < g_device_count; ++id) {

         CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
     }

     CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));

     extra->data_device[id] = buf;

     delete extra;
 }

+static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
 static size_t g_temp_tensor_extra_index = 0;

+static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
         g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
     }

     size_t alloc_index = g_temp_tensor_extra_index;
     g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));

     return extra;

         return;
     }

+    ggml_tensor_extra_gpu * extra;

     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW ||

     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {

         extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
+        ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device];
         extra = ggml_cuda_alloc_temp_tensor_extra();
         extra->data_device[g_main_device] = src1_ddv;

         CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
     }

+    ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();

     const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
         tensor->op == GGML_OP_VIEW;

     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
         size_t view_offset = 0;
         if (tensor->op == GGML_OP_VIEW) {

     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
     GGML_ASSERT(ggml_is_contiguous(tensor));

+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
     CUDA_CHECK(ggml_cuda_set_device(g_main_device));
     CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
 }
     g_scratch_buffer = nullptr;
 }

+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+        return false;
+    }
+
     switch (tensor->op) {
+        case GGML_OP_REPEAT:
+            func = ggml_cuda_repeat;
+            break;
+        case GGML_OP_GET_ROWS:
+            func = ggml_cuda_get_rows;
+            break;
         case GGML_OP_DUP:
             func = ggml_cuda_dup;
             break;
         case GGML_OP_ADD:
             func = ggml_cuda_add;
             break;
         case GGML_OP_MUL:
             func = ggml_cuda_mul;
             break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
                     func = ggml_cuda_gelu;
                     break;
                 case GGML_UNARY_OP_SILU:
                     func = ggml_cuda_silu;
                     break;
                 default:
                     return false;
             } break;
         case GGML_OP_NORM:
             func = ggml_cuda_norm;
             break;
         case GGML_OP_RMS_NORM:
             func = ggml_cuda_rms_norm;
             break;
         case GGML_OP_MUL_MAT:

             func = ggml_cuda_mul_mat;
             break;
         case GGML_OP_SCALE:
             func = ggml_cuda_scale;
             break;
+        case GGML_OP_CLAMP:
             if (!any_on_device) {
                 return false;
             }
+            func = ggml_cuda_clamp;
+            break;
+        case GGML_OP_CPY:
             func = ggml_cuda_cpy;
             break;
         case GGML_OP_CONT:
             func = ggml_cuda_dup;
             break;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
             func = ggml_cuda_nop;
             break;
         case GGML_OP_DIAG_MASK_INF:
             func = ggml_cuda_diag_mask_inf;
             break;
         case GGML_OP_SOFT_MAX:
             func = ggml_cuda_soft_max;
             break;
         case GGML_OP_ROPE:
             func = ggml_cuda_rope;
             break;
         case GGML_OP_ALIBI:
             func = ggml_cuda_alibi;
             break;
         default:

     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
     snprintf(description, description_size, "%s", prop.name);
 }
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+#define UNUSED GGML_UNUSED
+
+struct ggml_backend_context_cuda {
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+    delete cuda_ctx;
+    delete backend;
+}
+
+struct ggml_backend_buffer_context_cuda {
+    void * device;
+
+    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
+    size_t temp_tensor_extra_index = 0;
+
+    ~ggml_backend_buffer_context_cuda() {
+        delete[] temp_tensor_extras;
+    }
+
+    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+        if (temp_tensor_extras == nullptr) {
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+        }
+
+        size_t alloc_index = temp_tensor_extra_index;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
+        memset(extra, 0, sizeof(*extra));
+
+        return extra;
+    }
+};
+
+static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(cudaFree(ctx->device));
+    delete ctx;
+}
+
+static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    return ctx->device;
+}
+
+static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+                * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+        }
+    }
+
+    return size;
+
+    UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        assert(tensor->view_src->buffer->backend == buffer->backend);
+        tensor->backend = tensor->view_src->backend;
+        tensor->extra = tensor->view_src->extra;
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
+
+    extra->data_device[g_main_device] = tensor->data;
+
+    tensor->backend = GGML_BACKEND_GPU;
+    tensor->extra = extra;
+
+    if (ggml_is_quantized(tensor->type)) {
+        // initialize padding to 0 to avoid possible NaN values
+        int64_t row_low = 0;
+        int64_t row_high = ggml_nrows(tensor);
+        int64_t nrows_split = row_high - row_low;
+
+        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
+
+        if (padded_size > original_size && tensor->view_src == nullptr) {
+            CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
+        }
+    }
+
+    UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+    /* .free_buffer    = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base       = */ ggml_backend_cuda_buffer_get_base,
+    /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
+    /* .init_tensor    = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .free_tensor    = */ NULL,
+};
+
+static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+    CUDA_CHECK(cudaMalloc(&ctx->device, size));
+    return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
+    return 128;
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+    CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+    UNUSED(backend);
+}
+
+static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    GGML_ASSERT(!"not implemented");
+
+    return nullptr;
+
+    UNUSED(backend);
+    UNUSED(cgraph);
+}
+
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_cuda_set_device(g_main_device);
+
+    ggml_compute_params params = {};
+    params.type = GGML_TASK_COMPUTE;
+    params.ith = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        assert(node->backend == GGML_BACKEND_GPU);
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] != nullptr) {
+                assert(node->src[j]->backend == GGML_BACKEND_GPU);
+            }
+        }
+
+        bool ok = ggml_cuda_compute_forward(&params, node);
+        if (!ok) {
+            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+
+#if 0
+        if (node->type == GGML_TYPE_F32) {
+            cudaDeviceSynchronize();
+            std::vector<float> tmp(ggml_nelements(node), 0.0f);
+            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
+                ggml_type_name(node->src[0]->type),
+                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
+                node->src[0]->name,
+                node->src[1] ? node->src[1]->name : "none");
+            double sum = 0.0;
+            double sq_sum = 0.0;
+            for (int i = 0; i < ggml_nelements(node); i++) {
+                printf("%f ", tmp[i]);
+                sum += tmp[i];
+                sq_sum += tmp[i]*tmp[i];
+            }
+            printf("\n");
+            printf("sum: %f, ", sum);
+            printf("sq_sum: %f\n", sq_sum);
+        }
+#endif
+    }
+
+    UNUSED(backend);
+}
+
+static ggml_backend_i cuda_backend_i = {
+    /* .get_name            = */ ggml_backend_cuda_name,
+    /* .free                = */ ggml_backend_cuda_free,
+    /* .alloc_buffer        = */ ggml_backend_cuda_alloc_buffer,
+    /* .get_alignment       = */ ggml_backend_cuda_get_alignment,
+    /* .set_tensor_async    = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async    = */ ggml_backend_cuda_get_tensor_async,
+    /* .synchronize         = */ ggml_backend_cuda_synchronize,
+    /* .cpy_tensor_from     = */ nullptr,
+    /* .cpy_tensor_to       = */ nullptr,
+    /* .graph_plan_create   = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free     = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute  = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute       = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op         = */ nullptr,
+};
+
+ggml_backend_t ggml_backend_cuda_init() {
+    ggml_init_cublas(); // TODO: remove from ggml.c
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
+
+    ggml_backend_t cuda_backend = new ggml_backend {
+        /* .interface = */ cuda_backend_i,
+        /* .context   = */ ctx
+    };
+
+    return cuda_backend;
+}
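A rough illustration (not text from the commit) of how the backend object built above could be driven. Only names visible in this diff are relied on: the interface table members, ggml_backend_cuda_init, and the iface field that the Metal code later dereferences; the wrapper name run_graph_on_cuda and the availability of the struct ggml_backend definition through ggml-backend.h are assumptions.

#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-cuda.h"

// Hedged sketch: assumes `gf` was already built and its tensors placed on the GPU.
void run_graph_on_cuda(struct ggml_cgraph * gf) {
    ggml_backend_t backend = ggml_backend_cuda_init();   // declared in the ggml-cuda.h hunk below
    backend->iface.graph_compute(backend, gf);           // dispatches ggml_cuda_compute_forward per node
    backend->iface.synchronize(backend);                 // waits on the main CUDA stream
    backend->iface.free(backend);                        // deletes the context and the backend object
}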
ggml-cuda.h
CHANGED
@@ -1,6 +1,7 @@
 #pragma once

 #include "ggml.h"
+#include "ggml-backend.h"

 #ifdef GGML_USE_HIPBLAS
 #define GGML_CUDA_NAME "ROCm"
@@ -42,6 +43,9 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s
 GGML_API int ggml_cuda_get_device_count(void);
 GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);

+// backend API
+GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
+
 #ifdef __cplusplus
 }
 #endif
ggml-metal.h
CHANGED
@@ -20,6 +20,7 @@
 #pragma once

 #include "ggml.h"
+#include "ggml-backend.h"

 #include <stddef.h>
 #include <stdbool.h>
@@ -35,10 +36,15 @@ struct ggml_cgraph;
 extern "C" {
 #endif

-
+//
+// internal API
+// temporary exposed to user-code
+//

 struct ggml_metal_context;

+void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
 // number of command buffers to use
 struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
@@ -83,6 +89,17 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
 // creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

+//
+// backend API
+// user-code should use only these functions
+//
+
+GGML_API ggml_backend_t ggml_backend_metal_init(void);
+
+GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
+
+GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
 #ifdef __cplusplus
 }
 #endif
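A hedged usage sketch (not part of the diff) of the three user-facing functions declared above. The command-buffer count of 4 is an arbitrary example value, and graph execution afterwards would go through the same iface table pattern shown for the CUDA backend.

#include "ggml-metal.h"

void configure_metal_backend(void) {
    ggml_backend_t backend = ggml_backend_metal_init();
    if (ggml_backend_is_metal(backend)) {
        ggml_backend_metal_set_n_cb(backend, 4); // example value; forwards to ggml_metal_set_n_cb
    }
    backend->iface.free(backend); // ggml_backend_metal_free: releases the Metal context and the backend
}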
ggml-metal.m
CHANGED
@@ -779,8 +779,8 @@ void ggml_metal_graph_compute(
-                        int64_t nb = ne00;
@@ -812,6 +812,7 @@ void ggml_metal_graph_compute(
@@ -909,9 +910,10 @@ void ggml_metal_graph_compute(
-                        const int64_t n = ggml_nelements(dst)
-                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
@@ -921,9 +923,10 @@ void ggml_metal_graph_compute(
-                        const int64_t n = ggml_nelements(dst)
-                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
@@ -941,9 +944,10 @@ void ggml_metal_graph_compute(
-                        const int64_t n = ggml_nelements(dst)
-                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
@@ -1040,7 +1044,7 @@ void ggml_metal_graph_compute(
-                    ne00 % 32 == 0 &&
@@ -1251,6 +1255,8 @@ void ggml_metal_graph_compute(
@@ -1293,7 +1299,7 @@ void ggml_metal_graph_compute(
-                    const int n_past = ((int32_t *) dst->op_params)[0];
@@ -1471,3 +1477,140 @@ preferably one under the recommended max working set size, or else fall back to
     } break;
     case GGML_OP_CONCAT:
     {
+        const int64_t nb = ne00;

         [encoder setComputePipelineState:ctx->pipeline_concat];
         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
         [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];

         [encoder setBytes:&nb length:sizeof(nb) atIndex:27];

         const int nth = MIN(1024, ne0);
+
         [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
     } break;
     case GGML_OP_ADD:

         [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
         [encoder setBytes:&scale length:sizeof(scale) atIndex:2];

+        const int64_t n = ggml_nelements(dst);
+        GGML_ASSERT(n % 4 == 0);

+        [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
     } break;
     case GGML_OP_UNARY:
     switch (ggml_get_unary_op(gf->nodes[i])) {

         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
         [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

+        const int64_t n = ggml_nelements(dst);
+        GGML_ASSERT(n % 4 == 0);

+        [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
     } break;
     case GGML_UNARY_OP_RELU:
     {

         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
         [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

+        const int64_t n = ggml_nelements(dst);
+        GGML_ASSERT(n % 4 == 0);

+        [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
     } break;
     default:
     {

     !ggml_is_transposed(src0) &&
     !ggml_is_transposed(src1) &&
     src1t == GGML_TYPE_F32 &&
+    ne00 % 32 == 0 && ne00 >= 64 &&
     ne11 > ne11_mm_min) {
     //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
     switch (src0->type) {

     } break;
     case GGML_OP_RMS_NORM:
     {
+        GGML_ASSERT(ne00 % 4 == 0);
+
         float eps;
         memcpy(&eps, dst->op_params, sizeof(float));

     const int nth = MIN(1024, ne00);

+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

     }
 }
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+static const char * ggml_backend_metal_name(ggml_backend_t backend) {
+    return "Metal";
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_free(ggml_backend_t backend) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+    ggml_metal_free(ctx);
+    free(backend);
+}
+
+static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return (void *)buffer->context;
+}
+
+static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    free(buffer->context);
+    UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i metal_backend_buffer_i = {
+    /* .free_buffer    = */ ggml_backend_metal_buffer_free_buffer,
+    /* .get_base       = */ ggml_backend_metal_buffer_get_base,
+    /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+    /* .init_tensor    = */ NULL, // no initialization required
+    /* .free_tensor    = */ NULL, // no cleanup required
+};
+
+static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+
+    void * data = ggml_metal_host_malloc(size);
+
+    // TODO: set proper name of the buffers
+    ggml_metal_add_buffer(ctx, "backend", data, size, 0);
+
+    return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
+}
+
+static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
+    return 32;
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy((char *)tensor->data + offset, data, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+    memcpy(data, (const char *)tensor->data + offset, size);
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+    ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
+
+    ggml_metal_graph_compute(metal_ctx, cgraph);
+}
+
+static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    return true;
+    UNUSED(backend);
+    UNUSED(op);
+}
+
+static struct ggml_backend_i metal_backend_i = {
+    /* .get_name            = */ ggml_backend_metal_name,
+    /* .free                = */ ggml_backend_metal_free,
+    /* .alloc_buffer        = */ ggml_backend_metal_alloc_buffer,
+    /* .get_alignment       = */ ggml_backend_metal_get_alignment,
+    /* .set_tensor_async    = */ ggml_backend_metal_set_tensor_async,
+    /* .get_tensor_async    = */ ggml_backend_metal_get_tensor_async,
+    /* .synchronize         = */ ggml_backend_metal_synchronize,
+    /* .cpy_tensor_from     = */ ggml_backend_metal_cpy_tensor_from,
+    /* .cpy_tensor_to       = */ ggml_backend_metal_cpy_tensor_to,
+    /* .graph_plan_create   = */ NULL, // the metal implementation does not require creating graph plans atm
+    /* .graph_plan_free     = */ NULL,
+    /* .graph_plan_compute  = */ NULL,
+    /* .graph_compute       = */ ggml_backend_metal_graph_compute,
+    /* .supports_op         = */ ggml_backend_metal_supports_op,
+};
+
+ggml_backend_t ggml_backend_metal_init(void) {
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+
+    ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
+
+    ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
+
+    *metal_backend = (struct ggml_backend) {
+        /* .interface = */ metal_backend_i,
+        /* .context   = */ ctx,
+    };
+
+    return metal_backend;
+}
+
+bool ggml_backend_is_metal(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_metal_name;
+}
+
+void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+
+    ggml_metal_set_n_cb(ctx, n_cb);
+}
ggml-metal.metal
CHANGED
@@ -345,10 +345,11 @@ kernel void kernel_rms_norm(
         uint sgitg[[simdgroup_index_in_threadgroup]],
         uint tiisg[[thread_index_in_simdgroup]],
         uint   ntg[[threads_per_threadgroup]]) {
+    device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
+    device const float * x_scalar = (device const float *) x;
+
+    float4 sumf = 0;
+    float all_sum = 0;
 
     // parallel sum
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
@@ -361,6 +362,7 @@ kernel void kernel_rms_norm(
     }
 
     threadgroup_barrier(mem_flags::mem_threadgroup);
+
     // broadcast, simd group number is ntg / 32
     for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
         if (tpitg < i) {
@@ -368,7 +370,9 @@ kernel void kernel_rms_norm(
         }
     }
     if (tpitg == 0) {
+        for (int i = 4 * (ne00 / 4); i < ne00; i++) {
+            sum[0] += x_scalar[i];
+        }
         sum[0] /= ne00;
     }
 
@@ -383,7 +387,9 @@ kernel void kernel_rms_norm(
         y[i00] = x[i00] * scale;
     }
     if (tpitg == 0) {
+        for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
+            y_scalar[i00] = x_scalar[i00] * scale;
+        }
     }
 }
ggml.c
CHANGED
@@ -162,40 +162,16 @@ typedef void * thread_ret_t;
 
 #define GGML_PRINT(...) printf(__VA_ARGS__)
 
+//
+// end of logging block
+//
+
 #ifdef GGML_USE_ACCELERATE
 // uncomment to use vDSP for soft max computation
 // note: not sure if it is actually faster
 //#define GGML_SOFT_MAX_ACCELERATE
 #endif
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
@@ -4952,6 +4928,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     *result = (struct ggml_tensor) {
         /*.type         =*/ type,
         /*.backend      =*/ GGML_BACKEND_CPU,
+        /*.buffer       =*/ NULL,
         /*.n_dims       =*/ n_dims,
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },
@@ -11257,7 +11234,7 @@ static void ggml_compute_forward_silu_f32(
 
 #ifndef NDEBUG
     for (int k = 0; k < nc; k++) {
+        const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
        UNUSED(x);
        assert(!isnan(x));
        assert(!isinf(x));
@@ -13083,24 +13060,22 @@ static void ggml_compute_forward_alibi_f32(
         return;
     }
 
+    //const int n_past = ((int32_t *) dst->op_params)[0];
     const int n_head = ((int32_t *) dst->op_params)[1];
     float max_bias;
     memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
 
+    const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
+    const int64_t ne1 = src0->ne[1]; // seq_len_without_past
+    const int64_t ne2 = src0->ne[2]; // n_head -> this is k
+    //const int64_t ne3 = src0->ne[3]; // 1 -> bsz
 
+    const int64_t n       = ggml_nrows(src0);
+    const int64_t ne2_ne3 = n/ne1; // ne2*ne3
 
+    const size_t nb0 = src0->nb[0];
+    const size_t nb1 = src0->nb[1];
+    const size_t nb2 = src0->nb[2];
     //const int nb3 = src0->nb[3];
 
     GGML_ASSERT(nb0 == sizeof(float));
@@ -13112,9 +13087,9 @@ static void ggml_compute_forward_alibi_f32(
     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
+    for (int64_t i = 0; i < ne0; i++) {
+        for (int64_t j = 0; j < ne1; j++) {
+            for (int64_t k = 0; k < ne2_ne3; k++) {
                 float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
                 float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
 
@@ -13129,7 +13104,6 @@ static void ggml_compute_forward_alibi_f32(
                 }
 
                 pdst[0] = i * m_k + src[0];
             }
         }
     }
@@ -20200,6 +20174,10 @@ static enum ggml_opt_result ggml_opt_lbfgs(
         ggml_vec_cpy_f32(nx, xp, x);
         ggml_vec_cpy_f32(nx, gp, g);
 
+        // TODO: instead of passing &cancel here, use the return code of the linesearch
+        //       to determine if the optimization should be cancelled
+        //       this is a simple change, but not doing this atm, since I don't have a nice
+        //       way to test and don't want to break something with so many changes lined up
         ls = linesearch_backtracking(&params, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data);
         if (cancel) {
             return GGML_OPT_CANCEL;
ggml.h
CHANGED
@@ -326,7 +326,7 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    enum ggml_backend_type {
        GGML_BACKEND_CPU = 0,
        GGML_BACKEND_GPU = 10,
        GGML_BACKEND_GPU_SPLIT = 20,
@@ -479,8 +479,10 @@ extern "C" {
 
    // n-dimensional tensor
    struct ggml_tensor {
+        enum ggml_type         type;
+        enum ggml_backend_type backend;
+
+        struct ggml_backend_buffer * buffer;
 
        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -514,7 +516,7 @@ extern "C" {
 
        void * extra; // extra things e.g. for ggml-cuda.cu
 
+        char padding[12];
    };
 
    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -1358,7 +1360,7 @@ extern "C" {
 
    // alibi position embedding
    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_alibi(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
@@ -1367,7 +1369,7 @@ extern "C" {
 
    // clamp
    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_clamp(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            float                 min,
@@ -2102,7 +2104,7 @@ extern "C" {
        enum ggml_type    vec_dot_type;
    } ggml_type_traits_t;
 
+    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 
 #ifdef __cplusplus
 }
gguf-py/gguf/gguf.py
CHANGED
@@ -88,29 +88,31 @@ class MODEL_ARCH(IntEnum):
     PERSIMMON : int = auto()
     REFACT    : int = auto()
     BERT      : int = auto()
+    BLOOM     : int = auto()
 
 
 class MODEL_TENSOR(IntEnum):
+    TOKEN_EMBD      : int = auto()
+    TOKEN_EMBD_NORM : int = auto()
+    TOKEN_TYPES     : int = auto()
+    POS_EMBD        : int = auto()
+    OUTPUT          : int = auto()
+    OUTPUT_NORM     : int = auto()
+    ROPE_FREQS      : int = auto()
+    ATTN_Q          : int = auto()
+    ATTN_K          : int = auto()
+    ATTN_V          : int = auto()
+    ATTN_QKV        : int = auto()
+    ATTN_OUT        : int = auto()
+    ATTN_NORM       : int = auto()
+    ATTN_NORM_2     : int = auto()
+    ATTN_ROT_EMBD   : int = auto()
+    FFN_GATE        : int = auto()
+    FFN_DOWN        : int = auto()
+    FFN_UP          : int = auto()
+    FFN_NORM        : int = auto()
+    ATTN_Q_NORM     : int = auto()
+    ATTN_K_NORM     : int = auto()
 
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -125,29 +127,31 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.PERSIMMON: "persimmon",
     MODEL_ARCH.REFACT:    "refact",
     MODEL_ARCH.BERT:      "bert",
+    MODEL_ARCH.BLOOM:     "bloom",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
+    MODEL_TENSOR.TOKEN_EMBD:      "token_embd",
+    MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
+    MODEL_TENSOR.TOKEN_TYPES:     "token_types",
+    MODEL_TENSOR.POS_EMBD:        "position_embd",
+    MODEL_TENSOR.OUTPUT_NORM:     "output_norm",
+    MODEL_TENSOR.OUTPUT:          "output",
+    MODEL_TENSOR.ROPE_FREQS:      "rope_freqs",
+    MODEL_TENSOR.ATTN_NORM:       "blk.{bid}.attn_norm",
+    MODEL_TENSOR.ATTN_NORM_2:     "blk.{bid}.attn_norm_2",
+    MODEL_TENSOR.ATTN_QKV:        "blk.{bid}.attn_qkv",
+    MODEL_TENSOR.ATTN_Q:          "blk.{bid}.attn_q",
+    MODEL_TENSOR.ATTN_K:          "blk.{bid}.attn_k",
+    MODEL_TENSOR.ATTN_V:          "blk.{bid}.attn_v",
+    MODEL_TENSOR.ATTN_OUT:        "blk.{bid}.attn_output",
+    MODEL_TENSOR.ATTN_ROT_EMBD:   "blk.{bid}.attn_rot_embd",
+    MODEL_TENSOR.ATTN_Q_NORM:     "blk.{bid}.attn_q_norm",
+    MODEL_TENSOR.ATTN_K_NORM:     "blk.{bid}.attn_k_norm",
+    MODEL_TENSOR.FFN_NORM:        "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_GATE:        "blk.{bid}.ffn_gate",
+    MODEL_TENSOR.FFN_DOWN:        "blk.{bid}.ffn_down",
+    MODEL_TENSOR.FFN_UP:          "blk.{bid}.ffn_up",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -282,6 +286,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.BLOOM: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_EMBD_NORM,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.GPT2: [
         # TODO
     ],
@@ -311,6 +327,7 @@ class TensorNameMap:
            "gpt_neox.embed_in",           # gptneox
            "transformer.wte",             # gpt2 gpt-j mpt refact
            "transformer.word_embeddings", # falcon
+            "word_embeddings",             # bloom
            "model.embed_tokens",          # llama-hf
            "tok_embeddings",              # llama-pth
            "embeddings.word_embeddings",  # bert
@@ -322,6 +339,11 @@ class TensorNameMap:
            "embeddings.token_type_embeddings", # bert
        ),
 
+        # Normalization of token embeddings
+        MODEL_TENSOR.TOKEN_EMBD_NORM: (
+            "word_embeddings_layernorm", # bloom
+        ),
+
        # Position embeddings
        MODEL_TENSOR.POS_EMBD: (
            "transformer.wpe", # gpt2
@@ -332,7 +354,7 @@ class TensorNameMap:
        MODEL_TENSOR.OUTPUT: (
            "embed_out",                # gptneox
            "lm_head",                  # gpt2 mpt falcon llama-hf baichuan
+            "output",                   # llama-pth bloom
            "word_embeddings_for_head", # persimmon
        ),
 
@@ -344,7 +366,7 @@ class TensorNameMap:
            "norm",                                   # llama-pth
            "embeddings.LayerNorm",                   # bert
            "transformer.norm_f",                     # mpt
+            "ln_f",                                   # refact bloom
            "language_model.encoder.final_layernorm", # persimmon
        ),
 
@@ -361,6 +383,7 @@ class TensorNameMap:
            "transformer.h.{bid}.ln_1",            # gpt2 gpt-j refact
            "transformer.blocks.{bid}.norm_1",     # mpt
            "transformer.h.{bid}.input_layernorm", # falcon7b
+            "h.{bid}.input_layernorm",             # bloom
            "transformer.h.{bid}.ln_mlp",          # falcon40b
            "model.layers.{bid}.input_layernorm",  # llama-hf
            "layers.{bid}.attention_norm",         # llama-pth
@@ -379,6 +402,7 @@ class TensorNameMap:
            "transformer.h.{bid}.attn.c_attn",                     # gpt2
            "transformer.blocks.{bid}.attn.Wqkv",                  # mpt
            "transformer.h.{bid}.self_attention.query_key_value",  # falcon
+            "h.{bid}.self_attention.query_key_value",              # bloom
            "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
        ),
 
@@ -412,6 +436,7 @@ class TensorNameMap:
            "transformer.h.{bid}.attn.c_proj",            # gpt2 refact
            "transformer.blocks.{bid}.attn.out_proj",     # mpt
            "transformer.h.{bid}.self_attention.dense",   # falcon
+            "h.{bid}.self_attention.dense",               # bloom
            "model.layers.{bid}.self_attn.o_proj",        # llama-hf
            "layers.{bid}.attention.wo",                  # llama-pth
            "encoder.layer.{bid}.attention.output.dense", # bert
@@ -429,6 +454,7 @@ class TensorNameMap:
        MODEL_TENSOR.FFN_NORM: (
            "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
            "transformer.h.{bid}.ln_2",                       # gpt2 refact
+            "h.{bid}.post_attention_layernorm",               # bloom
            "transformer.blocks.{bid}.norm_2",                # mpt
            "model.layers.{bid}.post_attention_layernorm",    # llama-hf
            "layers.{bid}.ffn_norm",                          # llama-pth
@@ -442,6 +468,7 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.c_fc",           # gpt2
            "transformer.blocks.{bid}.ffn.up_proj",   # mpt
            "transformer.h.{bid}.mlp.dense_h_to_4h",  # falcon
+            "h.{bid}.mlp.dense_h_to_4h",              # bloom
            "model.layers.{bid}.mlp.up_proj",         # llama-hf refact
            "layers.{bid}.feed_forward.w3",           # llama-pth
            "encoder.layer.{bid}.intermediate.dense", # bert
@@ -461,6 +488,7 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.c_proj",         # gpt2 refact
            "transformer.blocks.{bid}.ffn.down_proj", # mpt
            "transformer.h.{bid}.mlp.dense_4h_to_h",  # falcon
+            "h.{bid}.mlp.dense_4h_to_h",              # bloom
            "model.layers.{bid}.mlp.down_proj",       # llama-hf
            "layers.{bid}.feed_forward.w2",           # llama-pth
            "encoder.layer.{bid}.output.dense",       # bert
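
The gguf.py hunks above register BLOOM as a new architecture and spell out the MODEL_TENSOR enum and TENSOR_NAMES table explicitly. A minimal sketch of how the new mappings resolve to on-disk GGUF tensor names, assuming the gguf-py package from this repo is importable as `gguf` and re-exports these module-level tables (the block-0 formatting below is only illustrative):

# Sketch: list the GGUF tensor names the new BLOOM entries map to.
import gguf

arch = gguf.MODEL_ARCH.BLOOM

# Every tensor kind registered for BLOOM, formatted for block 0 when
# the template contains a {bid} placeholder.
for tensor in gguf.MODEL_TENSORS[arch]:
    template = gguf.TENSOR_NAMES[tensor]
    name = template.format(bid=0) if "{bid}" in template else template
    print(f"{tensor.name:16s} -> {name}")
# e.g. TOKEN_EMBD_NORM -> token_embd_norm, ATTN_QKV -> blk.0.attn_qkv
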
gpttype_adapter.cpp
CHANGED
@@ -1768,7 +1768,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     int realnpredict = params.n_predict-stopper_unused_tokens;
     float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
     float tokens_per_second = (realnpredict == 0 ? 0 : realnpredict / (time1 + time2));
+    printf("\nContextLimit: %d/%d, Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs (%.1fT/s)",current_context_tokens.size(),nctx, time1, pt1, time2, pt2, (time1 + time2), tokens_per_second);
     fflush(stdout);
     output.status = 1;
     generation_finished = true;
koboldcpp.py
CHANGED
@@ -184,6 +184,10 @@ def init_library():
|
|
184 |
os.add_dll_directory(dir_path)
|
185 |
os.add_dll_directory(abs_path)
|
186 |
os.add_dll_directory(os.getcwd())
|
|
|
|
|
|
|
|
|
187 |
handle = ctypes.CDLL(os.path.join(dir_path, libname))
|
188 |
|
189 |
handle.load_model.argtypes = [load_model_inputs]
|
@@ -361,7 +365,7 @@ maxhordelen = 256
|
|
361 |
modelbusy = threading.Lock()
|
362 |
requestsinqueue = 0
|
363 |
defaultport = 5001
|
364 |
-
KcppVersion = "1.
|
365 |
showdebug = True
|
366 |
showsamplerwarning = True
|
367 |
showmaxctxwarning = True
|
@@ -369,6 +373,8 @@ session_kudos_earned = 0
|
|
369 |
session_jobs = 0
|
370 |
session_starttime = None
|
371 |
exitcounter = 0
|
|
|
|
|
372 |
totalgens = 0
|
373 |
currentusergenkey = "" #store a special key so polled streaming works even in multiuser
|
374 |
args = None #global args
|
@@ -412,16 +418,34 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
412 |
elif api_format==4:
|
413 |
# translate openai chat completion messages format into one big string.
|
414 |
messages_array = genparams.get('messages', [])
|
|
|
415 |
messages_string = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
416 |
for message in messages_array:
|
417 |
if message['role'] == "system":
|
418 |
-
messages_string+=
|
419 |
elif message['role'] == "user":
|
420 |
-
messages_string+=
|
421 |
elif message['role'] == "assistant":
|
422 |
-
messages_string+=
|
423 |
-
|
424 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
425 |
genparams["prompt"] = messages_string
|
426 |
frqp = genparams.get('frequency_penalty', 0.1)
|
427 |
scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
|
@@ -497,9 +521,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
497 |
async def handle_sse_stream(self, api_format):
|
498 |
global friendlymodelname
|
499 |
self.send_response(200)
|
500 |
-
self.send_header("
|
501 |
-
self.send_header("
|
502 |
-
self.end_headers(
|
503 |
|
504 |
current_token = 0
|
505 |
incomplete_token_buffer = bytearray()
|
@@ -566,10 +590,10 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
566 |
global maxctx, maxhordelen, friendlymodelname, KcppVersion, totalgens
|
567 |
self.path = self.path.rstrip('/')
|
568 |
response_body = None
|
569 |
-
|
570 |
|
571 |
if self.path in ["", "/?"] or self.path.startswith(('/?','?')): #it's possible for the root url to have ?params without /
|
572 |
-
|
573 |
if self.embedded_kailite is None:
|
574 |
response_body = (f"Embedded Kobold Lite is not found.<br>You will have to connect via the main KoboldAI client, or <a href='https://lite.koboldai.net?local=1&port={self.port}'>use this URL</a> to connect.").encode()
|
575 |
else:
|
@@ -615,9 +639,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
615 |
|
616 |
elif self.path.endswith('/v1/models'):
|
617 |
response_body = (json.dumps({"object":"list","data":[{"id":friendlymodelname,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
|
618 |
-
force_json = True
|
619 |
|
620 |
elif self.path=="/api":
|
|
|
621 |
if self.embedded_kcpp_docs is None:
|
622 |
response_body = (f"KoboldCpp partial API reference can be found at the wiki: https://github.com/LostRuins/koboldcpp/wiki").encode()
|
623 |
else:
|
@@ -625,41 +649,40 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
625 |
elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
|
626 |
self.path = "/api"
|
627 |
self.send_response(302)
|
628 |
-
self.send_header("
|
629 |
-
self.end_headers()
|
630 |
return None
|
631 |
|
632 |
if response_body is None:
|
633 |
self.send_response(404)
|
634 |
-
self.end_headers()
|
635 |
rp = 'Error: HTTP Server is running, but this endpoint does not exist. Please check the URL.'
|
636 |
self.wfile.write(rp.encode())
|
637 |
else:
|
638 |
self.send_response(200)
|
639 |
-
self.send_header('
|
640 |
-
self.end_headers(
|
641 |
self.wfile.write(response_body)
|
642 |
return
|
643 |
|
644 |
def do_POST(self):
|
645 |
global modelbusy, requestsinqueue, currentusergenkey, totalgens
|
646 |
-
content_length = int(self.headers['
|
647 |
body = self.rfile.read(content_length)
|
648 |
self.path = self.path.rstrip('/')
|
649 |
-
force_json = False
|
650 |
if self.path.endswith(('/api/extra/tokencount')):
|
651 |
try:
|
652 |
genparams = json.loads(body)
|
653 |
countprompt = genparams.get('prompt', "")
|
654 |
count = handle.token_count(countprompt.encode("UTF-8"))
|
655 |
self.send_response(200)
|
656 |
-
self.end_headers()
|
657 |
self.wfile.write(json.dumps({"value": count}).encode())
|
658 |
|
659 |
except ValueError as e:
|
660 |
utfprint("Count Tokens - Body Error: " + str(e))
|
661 |
self.send_response(400)
|
662 |
-
self.end_headers()
|
663 |
self.wfile.write(json.dumps({"value": -1}).encode())
|
664 |
return
|
665 |
|
@@ -672,11 +695,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
672 |
multiuserkey = ""
|
673 |
pass
|
674 |
|
675 |
-
if (multiuserkey
|
676 |
ag = handle.abort_generate()
|
677 |
time.sleep(0.3) #short delay before replying
|
678 |
self.send_response(200)
|
679 |
-
self.end_headers()
|
680 |
self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
|
681 |
print("\nGeneration Aborted")
|
682 |
else:
|
@@ -694,11 +717,11 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
694 |
pass
|
695 |
|
696 |
if totalgens>0:
|
697 |
-
if (multiuserkey
|
698 |
pendtxt = handle.get_pending_output()
|
699 |
pendtxtStr = ctypes.string_at(pendtxt).decode("UTF-8","ignore")
|
700 |
self.send_response(200)
|
701 |
-
self.end_headers()
|
702 |
self.wfile.write(json.dumps({"results": [{"text": pendtxtStr}]}).encode())
|
703 |
return
|
704 |
|
@@ -708,7 +731,7 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
708 |
requestsinqueue += 1
|
709 |
if not modelbusy.acquire(blocking=reqblocking):
|
710 |
self.send_response(503)
|
711 |
-
self.end_headers()
|
712 |
self.wfile.write(json.dumps({"detail": {
|
713 |
"msg": "Server is busy; please try again later.",
|
714 |
"type": "service_unavailable",
|
@@ -734,11 +757,9 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
734 |
|
735 |
if self.path.endswith('/v1/completions'):
|
736 |
api_format = 3
|
737 |
-
force_json = True
|
738 |
|
739 |
if self.path.endswith('/v1/chat/completions'):
|
740 |
api_format = 4
|
741 |
-
force_json = True
|
742 |
|
743 |
if api_format > 0:
|
744 |
genparams = None
|
@@ -764,8 +785,8 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
764 |
# Headers are already sent when streaming
|
765 |
if not sse_stream_flag:
|
766 |
self.send_response(200)
|
767 |
-
self.end_headers(
|
768 |
-
|
769 |
except:
|
770 |
print("Generate: The response could not be sent, maybe connection was terminated?")
|
771 |
return
|
@@ -773,27 +794,23 @@ class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
|
773 |
modelbusy.release()
|
774 |
|
775 |
self.send_response(404)
|
776 |
-
self.end_headers()
|
777 |
|
778 |
|
779 |
def do_OPTIONS(self):
|
780 |
self.send_response(200)
|
781 |
-
self.end_headers()
|
782 |
|
783 |
def do_HEAD(self):
|
784 |
self.send_response(200)
|
785 |
-
self.end_headers()
|
786 |
-
|
787 |
-
def end_headers(self,
|
788 |
-
self.send_header('
|
789 |
-
self.send_header('
|
790 |
-
self.send_header('
|
791 |
-
if
|
792 |
-
|
793 |
-
self.send_header('Content-type', 'text/event-stream')
|
794 |
-
self.send_header('Content-type', 'application/json')
|
795 |
-
else:
|
796 |
-
self.send_header('Content-type', 'text/html')
|
797 |
return super(ServerRequestHandler, self).end_headers()
|
798 |
|
799 |
|
@@ -1017,7 +1034,8 @@ def show_new_gui():
|
|
1017 |
mmq_var = ctk.IntVar(value=1)
|
1018 |
blas_threads_var = ctk.StringVar()
|
1019 |
blas_size_var = ctk.IntVar()
|
1020 |
-
version_var =ctk.StringVar(value="0")
|
|
|
1021 |
|
1022 |
smartcontext = ctk.IntVar()
|
1023 |
context_var = ctk.IntVar()
|
@@ -1069,11 +1087,15 @@ def show_new_gui():
|
|
1069 |
quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
|
1070 |
mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
|
1071 |
quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
|
|
|
|
|
1072 |
else:
|
1073 |
lowvram_box.grid_forget()
|
1074 |
quick_lowvram_box.grid_forget()
|
1075 |
mmq_box.grid_forget()
|
1076 |
quick_mmq_box.grid_forget()
|
|
|
|
|
1077 |
|
1078 |
if index == "Use CLBlast" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
|
1079 |
gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw")
|
@@ -1086,6 +1108,7 @@ def show_new_gui():
|
|
1086 |
quick_gpu_layers_label.grid_forget()
|
1087 |
quick_gpu_layers_entry.grid_forget()
|
1088 |
|
|
|
1089 |
# presets selector
|
1090 |
makelabel(quick_tab, "Presets:", 1)
|
1091 |
|
@@ -1118,7 +1141,7 @@ def show_new_gui():
|
|
1118 |
makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=2)
|
1119 |
|
1120 |
# load model
|
1121 |
-
makefileentry(quick_tab, "Model:", "Select GGML Model File", model_var, 40, 170
|
1122 |
|
1123 |
# Hardware Tab
|
1124 |
hardware_tab = tabcontent["Hardware"]
|
@@ -1137,6 +1160,7 @@ def show_new_gui():
|
|
1137 |
gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4"], width=60, variable=gpu_choice_var, state="readonly")
|
1138 |
CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4", "All"], width=60, variable=gpu_choice_var, state="readonly")
|
1139 |
gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 5, 50)
|
|
|
1140 |
lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0)
|
1141 |
mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1)
|
1142 |
|
@@ -1185,7 +1209,7 @@ def show_new_gui():
|
|
1185 |
# Model Tab
|
1186 |
model_tab = tabcontent["Model"]
|
1187 |
|
1188 |
-
makefileentry(model_tab, "Model:", "Select GGML Model File", model_var, 1
|
1189 |
makefileentry(model_tab, "Lora:", "Select Lora File",lora_var, 3)
|
1190 |
makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5)
|
1191 |
|
@@ -1265,6 +1289,12 @@ def show_new_gui():
|
|
1265 |
args.noavx2 = True
|
1266 |
args.noblas = True
|
1267 |
args.nommap = True
|
|
|
|
|
|
|
|
|
|
|
|
|
1268 |
|
1269 |
args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
|
1270 |
|
@@ -1329,6 +1359,9 @@ def show_new_gui():
|
|
1329 |
runopts_var.set(openblas_option)
|
1330 |
if "gpulayers" in dict and dict["gpulayers"]:
|
1331 |
gpulayers_var.set(dict["gpulayers"])
|
|
|
|
|
|
|
1332 |
if "blasthreads" in dict and dict["blasthreads"]:
|
1333 |
blas_threads_var.set(str(dict["blasthreads"]))
|
1334 |
else:
|
@@ -1447,7 +1480,7 @@ def show_gui_msgbox(title,message):
|
|
1447 |
def run_horde_worker(args, api_key, worker_name):
|
1448 |
import urllib.request
|
1449 |
from datetime import datetime
|
1450 |
-
global friendlymodelname, maxhordectx, maxhordelen, exitcounter, modelbusy, session_starttime
|
1451 |
epurl = f"http://localhost:{args.port}"
|
1452 |
if args.host!="":
|
1453 |
epurl = f"http://{args.host}:{args.port}"
|
@@ -1456,10 +1489,11 @@ def run_horde_worker(args, api_key, worker_name):
|
|
1456 |
print(f"{datetime.now().strftime('[%H:%M:%S]')} " + txt)
|
1457 |
|
1458 |
def submit_completed_generation(url, jobid, sessionstart, submit_dict):
|
1459 |
-
global exitcounter, session_kudos_earned, session_jobs
|
1460 |
reply = make_url_request(url, submit_dict)
|
1461 |
if not reply:
|
1462 |
exitcounter += 1
|
|
|
1463 |
print_with_time(f"Error, Job submit failed.")
|
1464 |
else:
|
1465 |
reward = reply["reward"]
|
@@ -1473,6 +1507,11 @@ def run_horde_worker(args, api_key, worker_name):
|
|
1473 |
elapsedtimestr = f"{hrs:03d}h:{mins:02d}m:{secs:02d}s"
|
1474 |
earnrate = session_kudos_earned/(elapsedtime.seconds/3600)
|
1475 |
print_with_time(f'Submitted {jobid} and earned {reward:.0f} kudos\n[Total:{session_kudos_earned:.0f} kudos, Time:{elapsedtimestr}, Jobs:{session_jobs}, EarnRate:{earnrate:.0f} kudos/hr]')
|
|
|
|
|
|
|
|
|
|
|
1476 |
|
1477 |
def make_url_request(url, data, method='POST'):
|
1478 |
try:
|
@@ -1481,7 +1520,7 @@ def run_horde_worker(args, api_key, worker_name):
|
|
1481 |
if method=='POST':
|
1482 |
json_payload = json.dumps(data).encode('utf-8')
|
1483 |
request = urllib.request.Request(url, data=json_payload, headers=headers, method=method)
|
1484 |
-
request.add_header('
|
1485 |
else:
|
1486 |
request = urllib.request.Request(url, headers=headers, method=method)
|
1487 |
response_data = ""
|
@@ -1508,17 +1547,23 @@ def run_horde_worker(args, api_key, worker_name):
|
|
1508 |
print(f"===\nEmbedded Horde Worker '{worker_name}' Starting...\n(To use your own KAI Bridge/Scribe worker instead, don't set your API key)")
|
1509 |
BRIDGE_AGENT = f"KoboldCppEmbedWorker:2:https://github.com/LostRuins/koboldcpp"
|
1510 |
cluster = "https://horde.koboldai.net"
|
1511 |
-
while exitcounter <
|
1512 |
time.sleep(3)
|
1513 |
readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
|
1514 |
if readygo:
|
1515 |
print_with_time(f"Embedded Horde Worker '{worker_name}' is started.")
|
1516 |
break
|
1517 |
|
1518 |
-
while exitcounter <
|
1519 |
currentjob_attempts = 0
|
1520 |
current_generation = None
|
1521 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1522 |
#first, make sure we are not generating
|
1523 |
if modelbusy.locked():
|
1524 |
time.sleep(0.2)
|
@@ -1537,6 +1582,7 @@ def run_horde_worker(args, api_key, worker_name):
|
|
1537 |
pop = make_url_request(f'{cluster}/api/v2/generate/text/pop',gen_dict)
|
1538 |
if not pop:
|
1539 |
exitcounter += 1
|
|
|
1540 |
print_with_time(f"Failed to fetch job from {cluster}. Waiting 5 seconds...")
|
1541 |
time.sleep(5)
|
1542 |
continue
|
@@ -1555,7 +1601,7 @@ def run_horde_worker(args, api_key, worker_name):
|
|
1555 |
print_with_time(f"Job received from {cluster} for {current_payload.get('max_length',80)} tokens and {current_payload.get('max_context_length',1024)} max context. Starting generation...")
|
1556 |
|
1557 |
#do gen
|
1558 |
-
while exitcounter <
|
1559 |
if not modelbusy.locked():
|
1560 |
current_generation = make_url_request(f'{epurl}/api/v1/generate', current_payload)
|
1561 |
if current_generation:
|
@@ -1880,4 +1926,10 @@ if __name__ == '__main__':
|
|
1880 |
parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", action='store_true')
|
1881 |
parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')
|
1882 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1883 |
main(parser.parse_args(),start_server=True)
|
|
|
184 |
os.add_dll_directory(dir_path)
|
185 |
os.add_dll_directory(abs_path)
|
186 |
os.add_dll_directory(os.getcwd())
|
187 |
+
if libname == lib_hipblas and "HIP_PATH" in os.environ:
|
188 |
+
os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin"))
|
189 |
+
if args.debugmode == 1:
|
190 |
+
print(f"HIP/ROCm SDK at {os.environ['HIP_PATH']} included in .DLL load path")
|
191 |
handle = ctypes.CDLL(os.path.join(dir_path, libname))
|
192 |
|
193 |
handle.load_model.argtypes = [load_model_inputs]
|
|
|
365 |
modelbusy = threading.Lock()
|
366 |
requestsinqueue = 0
|
367 |
defaultport = 5001
|
368 |
+
KcppVersion = "1.47"
|
369 |
showdebug = True
|
370 |
showsamplerwarning = True
|
371 |
showmaxctxwarning = True
|
|
|
373 |
session_jobs = 0
|
374 |
session_starttime = None
|
375 |
exitcounter = 0
|
376 |
+
punishcounter = 0 #causes a timeout if too many errors
|
377 |
+
rewardcounter = 0 #reduces error counts for successful jobs
|
378 |
totalgens = 0
|
379 |
currentusergenkey = "" #store a special key so polled streaming works even in multiuser
|
380 |
args = None #global args
|
|
|
418 |
elif api_format==4:
|
419 |
# translate openai chat completion messages format into one big string.
|
420 |
messages_array = genparams.get('messages', [])
|
421 |
+
adapter_obj = genparams.get('adapter', {})
|
422 |
messages_string = ""
|
423 |
+
system_message_start = adapter_obj.get("system_start", "\n### Instruction:\n")
|
424 |
+
system_message_end = adapter_obj.get("system_end", "")
|
425 |
+
user_message_start = adapter_obj.get("user_start", "\n### Instruction:\n")
|
426 |
+
user_message_end = adapter_obj.get("user_end", "")
|
427 |
+
assistant_message_start = adapter_obj.get("assistant_start", "\n### Response:\n")
|
428 |
+
assistant_message_end = adapter_obj.get("assistant_end", "")
|
429 |
+
|
430 |
for message in messages_array:
|
431 |
if message['role'] == "system":
|
432 |
+
messages_string += system_message_start
|
433 |
elif message['role'] == "user":
|
434 |
+
messages_string += user_message_start
|
435 |
elif message['role'] == "assistant":
|
436 |
+
messages_string += assistant_message_start
|
437 |
+
|
438 |
+
messages_string += message['content']
|
439 |
+
|
440 |
+
if message['role'] == "system":
|
441 |
+
messages_string += system_message_end
|
442 |
+
elif message['role'] == "user":
|
443 |
+
messages_string += user_message_end
|
444 |
+
elif message['role'] == "assistant":
|
445 |
+
messages_string += assistant_message_end
|
446 |
+
|
447 |
+
messages_string += assistant_message_start
|
448 |
+
|
449 |
genparams["prompt"] = messages_string
|
450 |
frqp = genparams.get('frequency_penalty', 0.1)
|
451 |
scaled_rep_pen = genparams.get('presence_penalty', frqp) + 1
|
|
|
521 |
async def handle_sse_stream(self, api_format):
|
522 |
global friendlymodelname
|
523 |
self.send_response(200)
|
524 |
+
self.send_header("cache-control", "no-cache")
|
525 |
+
self.send_header("connection", "keep-alive")
|
526 |
+
self.end_headers(content_type='text/event-stream')
|
527 |
|
528 |
current_token = 0
|
529 |
incomplete_token_buffer = bytearray()
|
|
|
590 |
global maxctx, maxhordelen, friendlymodelname, KcppVersion, totalgens
|
591 |
self.path = self.path.rstrip('/')
|
592 |
response_body = None
|
593 |
+
content_type = 'application/json'
|
594 |
|
595 |
if self.path in ["", "/?"] or self.path.startswith(('/?','?')): #it's possible for the root url to have ?params without /
|
596 |
+
content_type = 'text/html'
|
597 |
if self.embedded_kailite is None:
|
598 |
response_body = (f"Embedded Kobold Lite is not found.<br>You will have to connect via the main KoboldAI client, or <a href='https://lite.koboldai.net?local=1&port={self.port}'>use this URL</a> to connect.").encode()
|
599 |
else:
|
|
|
639 |
|
640 |
elif self.path.endswith('/v1/models'):
|
641 |
response_body = (json.dumps({"object":"list","data":[{"id":friendlymodelname,"object":"model","created":1,"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
|
|
|
642 |
|
643 |
elif self.path=="/api":
|
644 |
+
content_type = 'text/html'
|
645 |
if self.embedded_kcpp_docs is None:
|
646 |
response_body = (f"KoboldCpp partial API reference can be found at the wiki: https://github.com/LostRuins/koboldcpp/wiki").encode()
|
647 |
else:
|
|
|
649 |
elif self.path.endswith(('/api')) or self.path.endswith(('/api/v1')):
|
650 |
self.path = "/api"
|
651 |
self.send_response(302)
|
652 |
+
self.send_header("location", self.path)
|
653 |
+
self.end_headers(content_type='text/html')
|
654 |
return None
|
655 |
|
656 |
if response_body is None:
|
657 |
self.send_response(404)
|
658 |
+
self.end_headers(content_type='text/html')
|
659 |
rp = 'Error: HTTP Server is running, but this endpoint does not exist. Please check the URL.'
|
660 |
self.wfile.write(rp.encode())
|
661 |
else:
|
662 |
self.send_response(200)
|
663 |
+
self.send_header('content-length', str(len(response_body)))
|
664 |
+
self.end_headers(content_type=content_type)
|
665 |
self.wfile.write(response_body)
|
666 |
return
|
667 |
|
668 |
def do_POST(self):
|
669 |
global modelbusy, requestsinqueue, currentusergenkey, totalgens
|
670 |
+
content_length = int(self.headers['content-length'])
|
671 |
body = self.rfile.read(content_length)
|
672 |
self.path = self.path.rstrip('/')
|
|
|
673 |
if self.path.endswith(('/api/extra/tokencount')):
|
674 |
try:
|
675 |
genparams = json.loads(body)
|
676 |
countprompt = genparams.get('prompt', "")
|
677 |
count = handle.token_count(countprompt.encode("UTF-8"))
|
678 |
self.send_response(200)
|
679 |
+
self.end_headers(content_type='application/json')
|
680 |
self.wfile.write(json.dumps({"value": count}).encode())
|
681 |
|
682 |
except ValueError as e:
|
683 |
utfprint("Count Tokens - Body Error: " + str(e))
|
684 |
self.send_response(400)
|
685 |
+
self.end_headers(content_type='application/json')
|
686 |
self.wfile.write(json.dumps({"value": -1}).encode())
|
687 |
return
|
688 |
|
|
|
695 |
multiuserkey = ""
|
696 |
pass
|
697 |
|
698 |
+
if (multiuserkey=="" and requestsinqueue==0) or (multiuserkey!="" and multiuserkey==currentusergenkey):
|
699 |
ag = handle.abort_generate()
|
700 |
time.sleep(0.3) #short delay before replying
|
701 |
self.send_response(200)
|
702 |
+
self.end_headers(content_type='application/json')
|
703 |
self.wfile.write(json.dumps({"success": ("true" if ag else "false")}).encode())
|
704 |
print("\nGeneration Aborted")
|
705 |
else:
|
|
|
717 |
pass
|
718 |
|
719 |
if totalgens>0:
|
720 |
+
if (multiuserkey=="" and requestsinqueue==0) or (multiuserkey!="" and multiuserkey==currentusergenkey):
|
721 |
pendtxt = handle.get_pending_output()
|
722 |
pendtxtStr = ctypes.string_at(pendtxt).decode("UTF-8","ignore")
|
723 |
self.send_response(200)
|
724 |
+
self.end_headers(content_type='application/json')
|
725 |
self.wfile.write(json.dumps({"results": [{"text": pendtxtStr}]}).encode())
|
726 |
return
|
727 |
|
|
|
731 |
requestsinqueue += 1
|
732 |
if not modelbusy.acquire(blocking=reqblocking):
|
733 |
self.send_response(503)
|
734 |
+
self.end_headers(content_type='application/json')
|
735 |
self.wfile.write(json.dumps({"detail": {
|
736 |
"msg": "Server is busy; please try again later.",
|
737 |
"type": "service_unavailable",
|
|
|
757 |
|
758 |
if self.path.endswith('/v1/completions'):
|
759 |
api_format = 3
|
|
|
760 |
|
761 |
if self.path.endswith('/v1/chat/completions'):
|
762 |
api_format = 4
|
|
|
763 |
|
764 |
if api_format > 0:
|
765 |
genparams = None
|
|
|
785 |
# Headers are already sent when streaming
|
786 |
if not sse_stream_flag:
|
787 |
self.send_response(200)
|
788 |
+
self.end_headers(content_type='application/json')
|
789 |
+
self.wfile.write(json.dumps(gen).encode())
|
790 |
except:
|
791 |
print("Generate: The response could not be sent, maybe connection was terminated?")
|
792 |
return
|
|
|
794 |
modelbusy.release()
|
795 |
|
796 |
self.send_response(404)
|
797 |
+
self.end_headers(content_type='text/html')
|
798 |
|
799 |
|
800 |
def do_OPTIONS(self):
|
801 |
self.send_response(200)
|
802 |
+
self.end_headers(content_type='text/html')
|
803 |
|
804 |
def do_HEAD(self):
|
805 |
self.send_response(200)
|
806 |
+
self.end_headers(content_type='text/html')
|
807 |
+
|
808 |
+
def end_headers(self, content_type=None):
|
809 |
+
self.send_header('access-control-allow-origin', '*')
|
810 |
+
self.send_header('access-control-allow-methods', '*')
|
811 |
+
self.send_header('access-control-allow-headers', '*, Accept, Content-Type, Content-Length, Accept-Encoding, X-CSRF-Token, Client-Agent, X-Fields, Content-Type, Authorization, X-Requested-With, X-HTTP-Method-Override, apikey, genkey')
|
812 |
+
if content_type is not None:
|
813 |
+
self.send_header('content-type', content_type)
|
|
|
|
|
|
|
|
|
814 |
return super(ServerRequestHandler, self).end_headers()
|
815 |
|
816 |
|
|
|
1034 |
mmq_var = ctk.IntVar(value=1)
|
1035 |
blas_threads_var = ctk.StringVar()
|
1036 |
blas_size_var = ctk.IntVar()
|
1037 |
+
version_var = ctk.StringVar(value="0")
|
1038 |
+
tensor_split_str_vars = ctk.StringVar(value="")
|
1039 |
|
1040 |
smartcontext = ctk.IntVar()
|
1041 |
context_var = ctk.IntVar()
|
|
|
1087 |
quick_lowvram_box.grid(row=4, column=0, padx=8, pady=1, stick="nw")
|
1088 |
mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
|
1089 |
quick_mmq_box.grid(row=4, column=1, padx=8, pady=1, stick="nw")
|
1090 |
+
tensor_split_label.grid(row=6, column=0, padx = 8, pady=1, stick="nw")
|
1091 |
+
tensor_split_entry.grid(row=6, column=1, padx=8, pady=1, stick="nw")
|
1092 |
else:
|
1093 |
lowvram_box.grid_forget()
|
1094 |
quick_lowvram_box.grid_forget()
|
1095 |
mmq_box.grid_forget()
|
1096 |
quick_mmq_box.grid_forget()
|
1097 |
+
tensor_split_label.grid_forget()
|
1098 |
+
tensor_split_entry.grid_forget()
|
1099 |
|
1100 |
if index == "Use CLBlast" or index == "Use CuBLAS" or index == "Use hipBLAS (ROCm)":
|
1101 |
gpu_layers_label.grid(row=5, column=0, padx = 8, pady=1, stick="nw")
|
|
|
1108 |
quick_gpu_layers_label.grid_forget()
|
1109 |
quick_gpu_layers_entry.grid_forget()
|
1110 |
|
1111 |
+
|
1112 |
# presets selector
|
1113 |
makelabel(quick_tab, "Presets:", 1)
|
1114 |
|
|
|
1141 |
makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=2)
|
1142 |
|
1143 |
# load model
|
1144 |
+
makefileentry(quick_tab, "Model:", "Select GGML Model File", model_var, 40, 170)
|
1145 |
|
1146 |
# Hardware Tab
|
1147 |
hardware_tab = tabcontent["Hardware"]
|
|
|
1160 |
gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4"], width=60, variable=gpu_choice_var, state="readonly")
|
1161 |
CUDA_gpu_selector_box = ctk.CTkComboBox(hardware_tab, values=["1","2","3","4", "All"], width=60, variable=gpu_choice_var, state="readonly")
|
1162 |
gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 5, 50)
|
1163 |
+
tensor_split_entry,tensor_split_label = makelabelentry(hardware_tab, "Tensor Split:", tensor_split_str_vars, 6, 80)
|
1164 |
lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0)
|
1165 |
mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1)
|
1166 |
|
|
|
1209 |
# Model Tab
|
1210 |
model_tab = tabcontent["Model"]
|
1211 |
|
1212 |
+
makefileentry(model_tab, "Model:", "Select GGML Model File", model_var, 1)
|
1213 |
makefileentry(model_tab, "Lora:", "Select Lora File",lora_var, 3)
|
1214 |
makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5)
|
1215 |
|
|
|
1289 |
args.noavx2 = True
|
1290 |
args.noblas = True
|
1291 |
args.nommap = True
|
1292 |
+
if tensor_split_str_vars.get()!="":
|
1293 |
+
tssv = tensor_split_str_vars.get()
|
1294 |
+
if "," in tssv:
|
1295 |
+
args.tensor_split = [float(x) for x in tssv.split(",")]
|
1296 |
+
else:
|
1297 |
+
args.tensor_split = [float(x) for x in tssv.split(" ")]
|
1298 |
|
1299 |
args.blasthreads = None if blas_threads_var.get()=="" else int(blas_threads_var.get())
|
1300 |
|
|
|
1359 |
runopts_var.set(openblas_option)
|
1360 |
if "gpulayers" in dict and dict["gpulayers"]:
|
1361 |
gpulayers_var.set(dict["gpulayers"])
|
1362 |
+
if "tensor_split" in dict and dict["tensor_split"]:
|
1363 |
+
tssep = ','.join(map(str, dict["tensor_split"]))
|
1364 +         tensor_split_str_vars.set(tssep)
1365       if "blasthreads" in dict and dict["blasthreads"]:
1366           blas_threads_var.set(str(dict["blasthreads"]))
1367       else:

1480   def run_horde_worker(args, api_key, worker_name):
1481       import urllib.request
1482       from datetime import datetime
1483 +     global friendlymodelname, maxhordectx, maxhordelen, exitcounter, punishcounter, modelbusy, session_starttime
1484       epurl = f"http://localhost:{args.port}"
1485       if args.host!="":
1486           epurl = f"http://{args.host}:{args.port}"

1489       print(f"{datetime.now().strftime('[%H:%M:%S]')} " + txt)
1490
1491   def submit_completed_generation(url, jobid, sessionstart, submit_dict):
1492 +     global exitcounter, punishcounter, session_kudos_earned, session_jobs, rewardcounter
1493       reply = make_url_request(url, submit_dict)
1494       if not reply:
1495           exitcounter += 1
1496 +         punishcounter += 1
1497           print_with_time(f"Error, Job submit failed.")
1498       else:
1499           reward = reply["reward"]

1507       elapsedtimestr = f"{hrs:03d}h:{mins:02d}m:{secs:02d}s"
1508       earnrate = session_kudos_earned/(elapsedtime.seconds/3600)
1509       print_with_time(f'Submitted {jobid} and earned {reward:.0f} kudos\n[Total:{session_kudos_earned:.0f} kudos, Time:{elapsedtimestr}, Jobs:{session_jobs}, EarnRate:{earnrate:.0f} kudos/hr]')
1510 +     rewardcounter += 1
1511 +     if rewardcounter > 50:
1512 +         rewardcounter = 0
1513 +         if exitcounter > 5:
1514 +             exitcounter -= 1
1515
1516   def make_url_request(url, data, method='POST'):
1517       try:

1520           if method=='POST':
1521               json_payload = json.dumps(data).encode('utf-8')
1522               request = urllib.request.Request(url, data=json_payload, headers=headers, method=method)
1523 +             request.add_header('content-type', 'application/json')
1524           else:
1525               request = urllib.request.Request(url, headers=headers, method=method)
1526           response_data = ""

1547       print(f"===\nEmbedded Horde Worker '{worker_name}' Starting...\n(To use your own KAI Bridge/Scribe worker instead, don't set your API key)")
1548       BRIDGE_AGENT = f"KoboldCppEmbedWorker:2:https://github.com/LostRuins/koboldcpp"
1549       cluster = "https://horde.koboldai.net"
1550 +     while exitcounter < 35:
1551           time.sleep(3)
1552           readygo = make_url_request(f'{epurl}/api/v1/info/version', None,'GET')
1553           if readygo:
1554               print_with_time(f"Embedded Horde Worker '{worker_name}' is started.")
1555               break
1556
1557 +     while exitcounter < 35:
1558           currentjob_attempts = 0
1559           current_generation = None
1560
1561 +         if punishcounter >= 10:
1562 +             punishcounter = 0
1563 +             print_with_time(f"Horde Worker Paused for 10 min - Too many errors. It will resume automatically.")
1564 +             print_with_time(f"Caution: Too many failed jobs may lead to entering maintenance mode.")
1565 +             time.sleep(600)
1566 +
1567           #first, make sure we are not generating
1568           if modelbusy.locked():
1569               time.sleep(0.2)

1582           pop = make_url_request(f'{cluster}/api/v2/generate/text/pop',gen_dict)
1583           if not pop:
1584               exitcounter += 1
1585 +             punishcounter += 1
1586               print_with_time(f"Failed to fetch job from {cluster}. Waiting 5 seconds...")
1587               time.sleep(5)
1588               continue

1601           print_with_time(f"Job received from {cluster} for {current_payload.get('max_length',80)} tokens and {current_payload.get('max_context_length',1024)} max context. Starting generation...")
1602
1603           #do gen
1604 +         while exitcounter < 35:
1605               if not modelbusy.locked():
1606                   current_generation = make_url_request(f'{epurl}/api/v1/generate', current_payload)
1607                   if current_generation:

1926   parser.add_argument("--multiuser", help="Runs in multiuser mode, which queues incoming requests instead of blocking them.", action='store_true')
1927   parser.add_argument("--foreground", help="Windows only. Sends the terminal to the foreground every time a new prompt is generated. This helps avoid some idle slowdown issues.", action='store_true')
1928
1929 + # #deprecated hidden args. they do nothing. do not use
1930 + # parser.add_argument("--psutil_set_threads", action='store_true', help=argparse.SUPPRESS)
1931 + # parser.add_argument("--stream", action='store_true', help=argparse.SUPPRESS)
1932 + # parser.add_argument("--unbantokens", action='store_true', help=argparse.SUPPRESS)
1933 + # parser.add_argument("--usemirostat", action='store_true', help=argparse.SUPPRESS)
1934 +
1935   main(parser.parse_args(),start_server=True)
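The koboldcpp.py hunks above wire an error budget into the embedded Horde worker: every failed fetch or submit bumps both exitcounter and punishcounter, ten accumulated failures trigger a ten-minute pause, 35 accumulated failures end the worker, and successful submits slowly pay the budget back down. Below is a minimal standalone sketch of that pattern only; it is simplified (the real worker forgives at most one failure per 50 successful submits), and fetch_job / run_job are placeholder callables, not functions from koboldcpp.py.

```python
import time

EXIT_LIMIT = 35       # worker gives up after this many accumulated failures
PUNISH_LIMIT = 10     # pause once this many failures pile up
PAUSE_SECONDS = 600   # ten-minute cool-down, as in the worker loop above

def worker_loop(fetch_job, run_job):
    exitcounter = 0
    punishcounter = 0
    while exitcounter < EXIT_LIMIT:
        if punishcounter >= PUNISH_LIMIT:
            punishcounter = 0
            time.sleep(PAUSE_SECONDS)   # back off instead of hammering the cluster
        job = fetch_job()
        if not job:
            exitcounter += 1
            punishcounter += 1
            time.sleep(5)               # brief wait before retrying the fetch
            continue
        if run_job(job) and exitcounter > 5:
            exitcounter -= 1            # successful work slowly forgives old failures
```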
llama.cpp
CHANGED
@@ -189,6 +189,7 @@ enum llm_arch {
@@ -202,7 +203,8 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
-    { LLM_ARCH_REFACT, "refact"
@@ -305,6 +307,7 @@ struct LLM_KV {
@@ -425,6 +428,14 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
@@ -459,6 +470,21 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
@@ -1016,6 +1042,9 @@ struct llama_hparams {
@@ -1201,6 +1230,8 @@ struct llama_model {
@@ -1330,7 +1361,11 @@ static bool llama_kv_cache_init(
@@ -1736,7 +1771,7 @@ struct llama_model_loader {
-    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta,
@@ -1754,7 +1789,7 @@ struct llama_model_loader {
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
@@ -2047,13 +2082,13 @@ static void llm_load_hparams(
@@ -2062,6 +2097,33 @@ static void llm_load_hparams(
@@ -2206,6 +2268,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
@@ -2305,8 +2369,8 @@ static void llm_load_tensors(
@@ -2341,8 +2405,8 @@ static void llm_load_tensors(
@@ -2371,8 +2435,8 @@ static void llm_load_tensors(
@@ -2407,8 +2471,8 @@ static void llm_load_tensors(
@@ -2441,8 +2505,8 @@ static void llm_load_tensors(
@@ -2479,8 +2543,8 @@ static void llm_load_tensors(
@@ -2518,8 +2582,8 @@ static void llm_load_tensors(
@@ -2556,8 +2620,8 @@ static void llm_load_tensors(
@@ -2595,8 +2659,8 @@ static void llm_load_tensors(
@@ -2630,8 +2694,8 @@ static void llm_load_tensors(
@@ -2651,6 +2715,155 @@ static void llm_load_tensors(
@@ -4507,7 +4720,6 @@ static struct ggml_cgraph * llm_build_starcoder(
@@ -4905,37 +5117,604 @@ static struct ggml_cgraph * llm_build_persimmon(
-    static struct ggml_cgraph *
-        const auto & model
@@ -5067,7 +5846,8 @@ static int llama_decode_internal(
-        model.arch == LLM_ARCH_REFACT
@@ -5568,7 +6348,6 @@ private:
-    // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
@@ -5594,9 +6373,9 @@ private:
-    (utf_char_next == "r"
-    (utf_char_next == "v"
-    (utf_char_next == "l"
@@ -5647,7 +6426,7 @@ private:
-    else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next)
@@ -7166,7 +7945,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
-    if (name.find("attn_v.weight") != std::string::npos) {
189 |
LLM_ARCH_STARCODER,
|
190 |
LLM_ARCH_PERSIMMON,
|
191 |
LLM_ARCH_REFACT,
|
192 |
+
LLM_ARCH_BLOOM,
|
193 |
LLM_ARCH_UNKNOWN,
|
194 |
};
|
195 |
|
|
|
203 |
{ LLM_ARCH_BAICHUAN, "baichuan" },
|
204 |
{ LLM_ARCH_STARCODER, "starcoder" },
|
205 |
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
206 |
+
{ LLM_ARCH_REFACT, "refact" },
|
207 |
+
{ LLM_ARCH_BLOOM, "bloom" },
|
208 |
};
|
209 |
|
210 |
enum llm_kv {
|
|
|
307 |
|
308 |
enum llm_tensor {
|
309 |
LLM_TENSOR_TOKEN_EMBD,
|
310 |
+
LLM_TENSOR_TOKEN_EMBD_NORM,
|
311 |
LLM_TENSOR_POS_EMBD,
|
312 |
LLM_TENSOR_OUTPUT,
|
313 |
LLM_TENSOR_OUTPUT_NORM,
|
|
|
428 |
LLM_ARCH_MPT,
|
429 |
{
|
430 |
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
431 |
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
432 |
+
{ LLM_TENSOR_OUTPUT, "output" },
|
433 |
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
434 |
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
435 |
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
436 |
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
437 |
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
438 |
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
439 |
},
|
440 |
},
|
441 |
{
|
|
|
470 |
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
471 |
},
|
472 |
},
|
473 |
+
{
|
474 |
+
LLM_ARCH_BLOOM,
|
475 |
+
{
|
476 |
+
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
477 |
+
{ LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
|
478 |
+
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
479 |
+
{ LLM_TENSOR_OUTPUT, "output" },
|
480 |
+
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
481 |
+
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
482 |
+
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
483 |
+
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
484 |
+
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
485 |
+
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
486 |
+
},
|
487 |
+
},
|
488 |
{
|
489 |
LLM_ARCH_UNKNOWN,
|
490 |
{
|
|
|
1042 |
float rope_freq_base_train;
|
1043 |
float rope_freq_scale_train;
|
1044 |
|
1045 |
+
float f_clamp_kqv;
|
1046 |
+
float f_max_alibi_bias;
|
1047 |
+
|
1048 |
bool operator!=(const llama_hparams & other) const {
|
1049 |
if (this->vocab_only != other.vocab_only) return true;
|
1050 |
if (this->n_vocab != other.n_vocab) return true;
|
|
|
1230 |
|
1231 |
struct ggml_tensor * tok_embeddings;
|
1232 |
struct ggml_tensor * pos_embeddings;
|
1233 |
+
struct ggml_tensor * tok_norm;
|
1234 |
+
struct ggml_tensor * tok_norm_b;
|
1235 |
|
1236 |
struct ggml_tensor * output_norm;
|
1237 |
struct ggml_tensor * output_norm_b;
|
|
|
1361 |
cache.cells.clear();
|
1362 |
cache.cells.resize(n_ctx);
|
1363 |
|
1364 |
+
// TODO: this should be:
|
1365 |
+
// cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
|
1366 |
+
// change it and test that it works
|
1367 |
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
1368 |
+
memset(cache.buf.data, 0, cache.buf.size);
|
1369 |
|
1370 |
struct ggml_init_params params;
|
1371 |
params.mem_size = cache.buf.size;
|
|
|
1771 |
}
|
1772 |
}
|
1773 |
|
1774 |
+
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
|
1775 |
if (backend != GGML_BACKEND_CPU) {
|
1776 |
ggml_set_no_alloc(ctx, true);
|
1777 |
}
|
|
|
1789 |
return tensor;
|
1790 |
}
|
1791 |
|
1792 |
+
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
|
1793 |
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
|
1794 |
|
1795 |
if (cur == NULL) {
|
|
|
2082 |
}
|
2083 |
} break;
|
2084 |
case LLM_ARCH_PERSIMMON:
|
2085 |
+
{
|
2086 |
+
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
|
2087 |
+
switch (hparams.n_layer) {
|
2088 |
+
case 36: model.type = e_model::MODEL_8B; break;
|
2089 |
+
default: model.type = e_model::MODEL_UNKNOWN;
|
2090 |
+
}
|
2091 |
+
} break;
|
2092 |
case LLM_ARCH_REFACT:
|
2093 |
{
|
2094 |
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
|
|
|
2097 |
default: model.type = e_model::MODEL_UNKNOWN;
|
2098 |
}
|
2099 |
} break;
|
2100 |
+
case LLM_ARCH_BLOOM:
|
2101 |
+
{
|
2102 |
+
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
|
2103 |
+
|
2104 |
+
switch (hparams.n_layer) {
|
2105 |
+
case 24: model.type = e_model::MODEL_1B; break;
|
2106 |
+
case 30:
|
2107 |
+
switch (hparams.n_embd) {
|
2108 |
+
case 2560: model.type = e_model::MODEL_3B; break;
|
2109 |
+
case 4096: model.type = e_model::MODEL_7B; break;
|
2110 |
+
} break;
|
2111 |
+
}
|
2112 |
+
} break;
|
2113 |
+
case LLM_ARCH_MPT:
|
2114 |
+
{
|
2115 |
+
hparams.f_clamp_kqv = 0.0f;
|
2116 |
+
|
2117 |
+
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
|
2118 |
+
GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
|
2119 |
+
GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
|
2120 |
+
|
2121 |
+
switch (hparams.n_layer) {
|
2122 |
+
case 32: model.type = e_model::MODEL_7B; break;
|
2123 |
+
case 48: model.type = e_model::MODEL_30B; break;
|
2124 |
+
default: model.type = e_model::MODEL_UNKNOWN;
|
2125 |
+
}
|
2126 |
+
} break;
|
2127 |
default: (void)0;
|
2128 |
}
|
2129 |
|
|
|
2268 |
LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
|
2269 |
LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps);
|
2270 |
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
|
2271 |
+
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
|
2272 |
+
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
|
2273 |
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
2274 |
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
2275 |
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
|
|
2369 |
|
2370 |
// output
|
2371 |
{
|
2372 |
+
ggml_backend_type backend_norm;
|
2373 |
+
ggml_backend_type backend_output;
|
2374 |
|
2375 |
if (n_gpu_layers > int(n_layer)) {
|
2376 |
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
|
|
2405 |
model.layers.resize(n_layer);
|
2406 |
|
2407 |
for (uint32_t i = 0; i < n_layer; ++i) {
|
2408 |
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
2409 |
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
2410 |
|
2411 |
auto & layer = model.layers[i];
|
2412 |
|
|
|
2435 |
{
|
2436 |
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
2437 |
{
|
2438 |
+
ggml_backend_type backend_norm;
|
2439 |
+
ggml_backend_type backend_output;
|
2440 |
|
2441 |
if (n_gpu_layers > int(n_layer)) {
|
2442 |
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
|
|
2471 |
model.layers.resize(n_layer);
|
2472 |
|
2473 |
for (uint32_t i = 0; i < n_layer; ++i) {
|
2474 |
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
2475 |
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
2476 |
|
2477 |
auto & layer = model.layers[i];
|
2478 |
|
|
|
2505 |
|
2506 |
// output
|
2507 |
{
|
2508 |
+
ggml_backend_type backend_norm;
|
2509 |
+
ggml_backend_type backend_output;
|
2510 |
|
2511 |
if (n_gpu_layers > int(n_layer)) {
|
2512 |
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
|
|
2543 |
model.layers.resize(n_layer);
|
2544 |
|
2545 |
for (uint32_t i = 0; i < n_layer; ++i) {
|
2546 |
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
2547 |
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
2548 |
|
2549 |
auto & layer = model.layers[i];
|
2550 |
|
|
|
2582 |
|
2583 |
// output
|
2584 |
{
|
2585 |
+
ggml_backend_type backend_norm;
|
2586 |
+
ggml_backend_type backend_output;
|
2587 |
|
2588 |
if (n_gpu_layers > int(n_layer)) {
|
2589 |
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
|
|
2620 |
model.layers.resize(n_layer);
|
2621 |
|
2622 |
for (uint32_t i = 0; i < n_layer; ++i) {
|
2623 |
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
2624 |
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
2625 |
|
2626 |
auto & layer = model.layers[i];
|
2627 |
|
|
|
2659 |
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
2660 |
|
2661 |
{
|
2662 |
+
ggml_backend_type backend_norm;
|
2663 |
+
ggml_backend_type backend_output;
|
2664 |
|
2665 |
if (n_gpu_layers > int(n_layer)) {
|
2666 |
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
|
|
2694 |
const int i_gpu_start = n_layer - n_gpu_layers;
|
2695 |
model.layers.resize(n_layer);
|
2696 |
for (uint32_t i = 0; i < n_layer; ++i) {
|
2697 |
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
2698 |
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
|
2699 |
auto & layer = model.layers[i];
|
2700 |
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
2701 |
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
|
|
2715 |
layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
|
2716 |
}
|
2717 |
} break;
|
2718 |
+
case LLM_ARCH_BLOOM:
|
2719 |
+
{
|
2720 |
+
// TODO: CPU-only for now
|
2721 |
+
|
2722 |
+
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
2723 |
+
model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
|
2724 |
+
model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
|
2725 |
+
|
2726 |
+
// output
|
2727 |
+
{
|
2728 |
+
ggml_backend_type backend_norm;
|
2729 |
+
ggml_backend_type backend_output;
|
2730 |
+
|
2731 |
+
if (n_gpu_layers > int(n_layer)) {
|
2732 |
+
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
2733 |
+
// on Windows however this is detrimental unless everything is on the GPU
|
2734 |
+
#ifndef _WIN32
|
2735 |
+
backend_norm = LLAMA_BACKEND_OFFLOAD;
|
2736 |
+
#else
|
2737 |
+
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
2738 |
+
#endif // _WIN32
|
2739 |
+
|
2740 |
+
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
2741 |
+
} else {
|
2742 |
+
backend_norm = GGML_BACKEND_CPU;
|
2743 |
+
backend_output = GGML_BACKEND_CPU;
|
2744 |
+
}
|
2745 |
+
|
2746 |
+
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
2747 |
+
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
2748 |
+
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
2749 |
+
|
2750 |
+
if (backend_norm == GGML_BACKEND_GPU) {
|
2751 |
+
vram_weights += ggml_nbytes(model.output_norm);
|
2752 |
+
vram_weights += ggml_nbytes(model.output_norm_b);
|
2753 |
+
}
|
2754 |
+
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
2755 |
+
vram_weights += ggml_nbytes(model.output);
|
2756 |
+
}
|
2757 |
+
}
|
2758 |
+
|
2759 |
+
const uint32_t n_ff = hparams.n_ff;
|
2760 |
+
|
2761 |
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
2762 |
+
|
2763 |
+
model.layers.resize(n_layer);
|
2764 |
+
|
2765 |
+
for (uint32_t i = 0; i < n_layer; ++i) {
|
2766 |
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
2767 |
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
2768 |
+
|
2769 |
+
auto & layer = model.layers[i];
|
2770 |
+
|
2771 |
+
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
2772 |
+
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
2773 |
+
|
2774 |
+
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
2775 |
+
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
|
2776 |
+
|
2777 |
+
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
2778 |
+
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
|
2779 |
+
|
2780 |
+
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
2781 |
+
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
2782 |
+
|
2783 |
+
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
2784 |
+
layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
|
2785 |
+
|
2786 |
+
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
2787 |
+
layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
|
2788 |
+
|
2789 |
+
if (backend == GGML_BACKEND_GPU) {
|
2790 |
+
vram_weights +=
|
2791 |
+
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
|
2792 |
+
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
|
2793 |
+
ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) +
|
2794 |
+
ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) +
|
2795 |
+
ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) +
|
2796 |
+
ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2);
|
2797 |
+
}
|
2798 |
+
}
|
2799 |
+
} break;
|
2800 |
+
case LLM_ARCH_MPT:
|
2801 |
+
{
|
2802 |
+
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
2803 |
+
|
2804 |
+
// output
|
2805 |
+
{
|
2806 |
+
ggml_backend_type backend_norm;
|
2807 |
+
ggml_backend_type backend_output;
|
2808 |
+
|
2809 |
+
if (n_gpu_layers > int(n_layer)) {
|
2810 |
+
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
2811 |
+
// on Windows however this is detrimental unless everything is on the GPU
|
2812 |
+
#ifndef _WIN32
|
2813 |
+
backend_norm = LLAMA_BACKEND_OFFLOAD;
|
2814 |
+
#else
|
2815 |
+
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
2816 |
+
#endif // _WIN32
|
2817 |
+
|
2818 |
+
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
2819 |
+
} else {
|
2820 |
+
backend_norm = GGML_BACKEND_CPU;
|
2821 |
+
backend_output = GGML_BACKEND_CPU;
|
2822 |
+
}
|
2823 |
+
|
2824 |
+
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
2825 |
+
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
2826 |
+
|
2827 |
+
if (backend_norm == GGML_BACKEND_GPU) {
|
2828 |
+
vram_weights += ggml_nbytes(model.output_norm);
|
2829 |
+
}
|
2830 |
+
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
2831 |
+
vram_weights += ggml_nbytes(model.output);
|
2832 |
+
}
|
2833 |
+
}
|
2834 |
+
|
2835 |
+
const uint32_t n_ff = hparams.n_ff;
|
2836 |
+
|
2837 |
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
2838 |
+
|
2839 |
+
model.layers.resize(n_layer);
|
2840 |
+
|
2841 |
+
for (uint32_t i = 0; i < n_layer; ++i) {
|
2842 |
+
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
2843 |
+
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
2844 |
+
|
2845 |
+
auto & layer = model.layers[i];
|
2846 |
+
|
2847 |
+
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
2848 |
+
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split);
|
2849 |
+
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
2850 |
+
|
2851 |
+
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
2852 |
+
|
2853 |
+
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
2854 |
+
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
2855 |
+
|
2856 |
+
if (backend == GGML_BACKEND_GPU) {
|
2857 |
+
vram_weights +=
|
2858 |
+
ggml_nbytes(layer.attn_norm) +
|
2859 |
+
ggml_nbytes(layer.wqkv) +
|
2860 |
+
ggml_nbytes(layer.wo) +
|
2861 |
+
ggml_nbytes(layer.ffn_norm) +
|
2862 |
+
ggml_nbytes(layer.w2) +
|
2863 |
+
ggml_nbytes(layer.w3);
|
2864 |
+
}
|
2865 |
+
}
|
2866 |
+
} break;
|
2867 |
default:
|
2868 |
throw std::runtime_error("unknown architecture");
|
2869 |
}
|
|
|
4720 |
return gf;
|
4721 |
}
|
4722 |
|
|
|
4723 |
static struct ggml_cgraph * llm_build_persimmon(
|
4724 |
llama_context & lctx,
|
4725 |
const llama_batch & batch) {
|
|
|
5117 |
return gf;
|
5118 |
}
|
5119 |
|
5120 |
+
static struct ggml_cgraph * llm_build_bloom(
|
5121 |
llama_context & lctx,
|
5122 |
const llama_batch & batch) {
|
5123 |
+
const auto & model = lctx.model;
|
5124 |
+
const auto & hparams = model.hparams;
|
5125 |
+
const auto & cparams = lctx.cparams;
|
5126 |
|
5127 |
+
const auto & kv_self = lctx.kv_self;
|
5128 |
|
5129 |
+
GGML_ASSERT(!!kv_self.ctx);
|
5130 |
+
|
5131 |
+
const int64_t n_embd = hparams.n_embd;
|
5132 |
+
const int64_t n_layer = hparams.n_layer;
|
5133 |
+
const int64_t n_ctx = cparams.n_ctx;
|
5134 |
+
const int64_t n_head = hparams.n_head;
|
5135 |
+
const int64_t n_head_kv = hparams.n_head_kv;
|
5136 |
+
const int64_t n_embd_head = hparams.n_embd_head();
|
5137 |
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
5138 |
+
|
5139 |
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
5140 |
+
|
5141 |
+
const float norm_eps = hparams.f_norm_eps;
|
5142 |
+
|
5143 |
+
const int32_t n_tokens = batch.n_tokens;
|
5144 |
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
5145 |
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
5146 |
+
|
5147 |
+
auto & buf_compute = lctx.buf_compute;
|
5148 |
+
|
5149 |
+
struct ggml_init_params params = {
|
5150 |
+
/*.mem_size =*/ buf_compute.size,
|
5151 |
+
/*.mem_buffer =*/ buf_compute.data,
|
5152 |
+
/*.no_alloc =*/ false,
|
5153 |
+
};
|
5154 |
+
|
5155 |
+
params.no_alloc = true;
|
5156 |
+
|
5157 |
+
struct ggml_context * ctx0 = ggml_init(params);
|
5158 |
+
|
5159 |
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
5160 |
+
|
5161 |
+
struct ggml_tensor * cur;
|
5162 |
+
struct ggml_tensor * token;
|
5163 |
+
struct ggml_tensor * inpL;
|
5164 |
+
|
5165 |
+
if (batch.token) {
|
5166 |
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5167 |
+
|
5168 |
+
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
5169 |
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5170 |
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
5171 |
+
}
|
5172 |
+
ggml_set_name(inp_tokens, "inp_tokens");
|
5173 |
+
|
5174 |
+
token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
5175 |
+
} else {
|
5176 |
+
#ifdef GGML_USE_MPI
|
5177 |
+
GGML_ASSERT(false && "not implemented");
|
5178 |
+
#endif
|
5179 |
+
|
5180 |
+
token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
5181 |
+
|
5182 |
+
ggml_allocr_alloc(lctx.alloc, token);
|
5183 |
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5184 |
+
memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token));
|
5185 |
+
}
|
5186 |
+
}
|
5187 |
+
|
5188 |
+
// KQ_scale
|
5189 |
+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
5190 |
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
5191 |
+
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
5192 |
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5193 |
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
5194 |
+
}
|
5195 |
+
|
5196 |
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5197 |
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5198 |
+
ggml_set_name(KQ_mask, "KQ_mask");
|
5199 |
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
5200 |
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5201 |
+
float * data = (float *) KQ_mask->data;
|
5202 |
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
5203 |
+
|
5204 |
+
for (int h = 0; h < 1; ++h) {
|
5205 |
+
for (int j = 0; j < n_tokens; ++j) {
|
5206 |
+
const llama_pos pos = batch.pos[j];
|
5207 |
+
const llama_seq_id seq_id = batch.seq_id[j];
|
5208 |
+
|
5209 |
+
for (int i = 0; i < n_kv; ++i) {
|
5210 |
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
5211 |
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
5212 |
+
}
|
5213 |
+
}
|
5214 |
+
}
|
5215 |
+
}
|
5216 |
+
}
|
5217 |
+
|
5218 |
+
// norm
|
5219 |
+
{
|
5220 |
+
inpL = ggml_norm(ctx0, token, norm_eps);
|
5221 |
+
inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b);
|
5222 |
+
}
|
5223 |
+
|
5224 |
+
ggml_set_name(inpL, "inpL");
|
5225 |
+
|
5226 |
+
for (int il = 0; il < n_layer; ++il) {
|
5227 |
+
{
|
5228 |
+
// Norm
|
5229 |
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
5230 |
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
|
5231 |
+
}
|
5232 |
+
|
5233 |
+
{
|
5234 |
+
// Self Attention
|
5235 |
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
|
5236 |
+
|
5237 |
+
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
|
5238 |
+
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
|
5239 |
+
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
|
5240 |
+
|
5241 |
+
struct ggml_tensor * Qcur = tmpq;
|
5242 |
+
struct ggml_tensor * Kcur = tmpk;
|
5243 |
+
|
5244 |
+
// store key and value to memory
|
5245 |
+
{
|
5246 |
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
5247 |
+
ggml_set_name(Vcur, "Vcur");
|
5248 |
+
|
5249 |
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
5250 |
+
ggml_set_name(k, "k");
|
5251 |
+
|
5252 |
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
5253 |
+
( n_ctx)*ggml_element_size(kv_self.v),
|
5254 |
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
5255 |
+
|
5256 |
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
5257 |
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
5258 |
+
}
|
5259 |
+
|
5260 |
+
struct ggml_tensor * Q =
|
5261 |
+
ggml_permute(ctx0,
|
5262 |
+
ggml_cpy(ctx0,
|
5263 |
+
Qcur,
|
5264 |
+
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
|
5265 |
+
0, 2, 1, 3);
|
5266 |
+
ggml_set_name(Q, "Q");
|
5267 |
+
|
5268 |
+
struct ggml_tensor * K =
|
5269 |
+
ggml_view_3d(ctx0, kv_self.k,
|
5270 |
+
n_embd_head, n_kv, n_head_kv,
|
5271 |
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
5272 |
+
ggml_element_size(kv_self.k)*n_embd_head,
|
5273 |
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
5274 |
+
ggml_set_name(K, "K");
|
5275 |
+
|
5276 |
+
// K * Q
|
5277 |
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
5278 |
+
ggml_set_name(KQ, "KQ");
|
5279 |
+
|
5280 |
+
// KQ_scaled = KQ / sqrt(n_embd_head)
|
5281 |
+
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
|
5282 |
+
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
5283 |
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
5284 |
+
|
5285 |
+
struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8);
|
5286 |
+
ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
|
5287 |
+
|
5288 |
+
// KQ_masked = mask_past(KQ_scaled)
|
5289 |
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
|
5290 |
+
ggml_set_name(KQ_masked, "KQ_masked");
|
5291 |
+
|
5292 |
+
// KQ = soft_max(KQ_masked)
|
5293 |
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
5294 |
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
5295 |
+
|
5296 |
+
// split cached V into n_head heads
|
5297 |
+
struct ggml_tensor * V =
|
5298 |
+
ggml_view_3d(ctx0, kv_self.v,
|
5299 |
+
n_kv, n_embd_head, n_head_kv,
|
5300 |
+
ggml_element_size(kv_self.v)*n_ctx,
|
5301 |
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
5302 |
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
5303 |
+
ggml_set_name(V, "V");
|
5304 |
+
|
5305 |
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
5306 |
+
ggml_set_name(KQV, "KQV");
|
5307 |
+
|
5308 |
+
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
5309 |
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
5310 |
+
ggml_set_name(KQV_merged, "KQV_merged");
|
5311 |
+
|
5312 |
+
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
|
5313 |
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
5314 |
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
5315 |
+
}
|
5316 |
+
|
5317 |
+
// Projection
|
5318 |
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
|
5319 |
+
|
5320 |
+
// Add the input
|
5321 |
+
cur = ggml_add(ctx0, cur, inpL);
|
5322 |
+
|
5323 |
+
struct ggml_tensor * inpFF = cur;
|
5324 |
+
|
5325 |
+
// FF
|
5326 |
+
{
|
5327 |
+
// Norm
|
5328 |
+
{
|
5329 |
+
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
5330 |
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
|
5331 |
+
}
|
5332 |
+
|
5333 |
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
|
5334 |
+
|
5335 |
+
// GELU activation
|
5336 |
+
cur = ggml_gelu(ctx0, cur);
|
5337 |
+
|
5338 |
+
// Projection
|
5339 |
+
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
|
5340 |
+
}
|
5341 |
+
|
5342 |
+
inpL = ggml_add(ctx0, cur, inpFF);
|
5343 |
+
}
|
5344 |
+
|
5345 |
+
// Output Norm
|
5346 |
+
{
|
5347 |
+
cur = ggml_norm(ctx0, inpL, norm_eps);
|
5348 |
+
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
|
5349 |
+
}
|
5350 |
+
ggml_set_name(cur, "result_norm");
|
5351 |
+
|
5352 |
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5353 |
+
ggml_set_name(cur, "result_output");
|
5354 |
+
|
5355 |
+
ggml_build_forward_expand(gf, cur);
|
5356 |
+
|
5357 |
+
ggml_free(ctx0);
|
5358 |
+
|
5359 |
+
return gf;
|
5360 |
+
}
|
5361 |
+
|
5362 |
+
static struct ggml_cgraph * llm_build_mpt(
|
5363 |
+
llama_context & lctx,
|
5364 |
+
const llama_batch & batch) {
|
5365 |
+
const auto & model = lctx.model;
|
5366 |
+
const auto & hparams = model.hparams;
|
5367 |
+
const auto & cparams = lctx.cparams;
|
5368 |
+
|
5369 |
+
const auto & kv_self = lctx.kv_self;
|
5370 |
+
|
5371 |
+
GGML_ASSERT(!!kv_self.ctx);
|
5372 |
+
|
5373 |
+
const int64_t n_embd = hparams.n_embd;
|
5374 |
+
const int64_t n_layer = hparams.n_layer;
|
5375 |
+
const int64_t n_ctx = cparams.n_ctx;
|
5376 |
+
const int64_t n_head = hparams.n_head;
|
5377 |
+
const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA
|
5378 |
+
const int64_t n_embd_head = hparams.n_embd_head();
|
5379 |
+
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
5380 |
+
|
5381 |
+
const float norm_eps = hparams.f_norm_eps;
|
5382 |
+
const float clamp_kqv = hparams.f_clamp_kqv;
|
5383 |
+
const float max_alibi_bias = hparams.f_max_alibi_bias;
|
5384 |
+
|
5385 |
+
const int n_gpu_layers = model.n_gpu_layers;
|
5386 |
+
|
5387 |
+
const int32_t n_tokens = batch.n_tokens;
|
5388 |
+
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
5389 |
+
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
5390 |
+
|
5391 |
+
auto & buf_compute = lctx.buf_compute;
|
5392 |
+
|
5393 |
+
struct ggml_init_params params = {
|
5394 |
+
/*.mem_size =*/ buf_compute.size,
|
5395 |
+
/*.mem_buffer =*/ buf_compute.data,
|
5396 |
+
/*.no_alloc =*/ false,
|
5397 |
+
};
|
5398 |
+
|
5399 |
+
params.no_alloc = true;
|
5400 |
+
|
5401 |
+
struct ggml_context * ctx0 = ggml_init(params);
|
5402 |
+
|
5403 |
+
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
5404 |
+
|
5405 |
+
struct ggml_tensor * cur;
|
5406 |
+
struct ggml_tensor * inpL;
|
5407 |
+
|
5408 |
+
//int warmup = 0;
|
5409 |
+
if (batch.token) {
|
5410 |
+
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
5411 |
+
|
5412 |
+
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
5413 |
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5414 |
+
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
5415 |
+
//warmup = ((uint32_t*) inp_tokens->data)[0] == 0;
|
5416 |
+
}
|
5417 |
+
|
5418 |
+
ggml_set_name(inp_tokens, "inp_tokens");
|
5419 |
+
|
5420 |
+
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
5421 |
+
} else {
|
5422 |
+
#ifdef GGML_USE_MPI
|
5423 |
+
GGML_ASSERT(false && "not implemented");
|
5424 |
+
#endif
|
5425 |
+
|
5426 |
+
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
5427 |
+
|
5428 |
+
ggml_allocr_alloc(lctx.alloc, inpL);
|
5429 |
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5430 |
+
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
5431 |
+
}
|
5432 |
+
}
|
5433 |
+
|
5434 |
+
const int i_gpu_start = n_layer - n_gpu_layers;
|
5435 |
+
(void) i_gpu_start;
|
5436 |
+
|
5437 |
+
// offload functions set the tensor output backend to GPU
|
5438 |
+
// tensors are GPU-accelerated if any input or the output has been offloaded
|
5439 |
+
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
5440 |
+
offload_func_t offload_func_kq = llama_nop;
|
5441 |
+
offload_func_t offload_func_v = llama_nop;
|
5442 |
+
|
5443 |
+
#ifdef GGML_USE_CUBLAS
|
5444 |
+
if (n_gpu_layers > n_layer) {
|
5445 |
+
offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
|
5446 |
+
}
|
5447 |
+
if (n_gpu_layers > n_layer + 1) {
|
5448 |
+
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
|
5449 |
+
}
|
5450 |
+
if (n_gpu_layers > n_layer + 2) {
|
5451 |
+
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
|
5452 |
+
}
|
5453 |
+
#endif // GGML_USE_CUBLAS
|
5454 |
+
|
5455 |
+
// KQ_scale
|
5456 |
+
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
5457 |
+
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
5458 |
+
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
5459 |
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5460 |
+
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
5461 |
+
}
|
5462 |
+
|
5463 |
+
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
5464 |
+
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
5465 |
+
offload_func_kq(KQ_mask);
|
5466 |
+
ggml_set_name(KQ_mask, "KQ_mask");
|
5467 |
+
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
5468 |
+
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
5469 |
+
float * data = (float *) KQ_mask->data;
|
5470 |
+
memset(data, 0, ggml_nbytes(KQ_mask));
|
5471 |
+
|
5472 |
+
for (int h = 0; h < 1; ++h) {
|
5473 |
+
for (int j = 0; j < n_tokens; ++j) {
|
5474 |
+
const llama_pos pos = batch.pos[j];
|
5475 |
+
const llama_seq_id seq_id = batch.seq_id[j];
|
5476 |
+
|
5477 |
+
for (int i = 0; i < n_kv; ++i) {
|
5478 |
+
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
5479 |
+
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
5480 |
+
}
|
5481 |
+
}
|
5482 |
+
}
|
5483 |
+
}
|
5484 |
+
}
|
5485 |
+
|
5486 |
+
for (int il = 0; il < n_layer; ++il) {
|
5487 |
+
struct ggml_tensor * attn_norm;
|
5488 |
+
|
5489 |
+
offload_func_t offload_func = llama_nop;
|
5490 |
+
|
5491 |
+
#ifdef GGML_USE_CUBLAS
|
5492 |
+
if (il >= i_gpu_start) {
|
5493 |
+
offload_func = ggml_cuda_assign_buffers_no_alloc;
|
5494 |
+
}
|
5495 |
+
#endif // GGML_USE_CUBLAS
|
5496 |
+
|
5497 |
+
// self-attention
|
5498 |
+
// TODO: refactor into common function (shared with LLaMA)
|
5499 |
+
{
|
5500 |
+
attn_norm = ggml_norm(ctx0, inpL, norm_eps);
|
5501 |
+
offload_func(attn_norm);
|
5502 |
+
|
5503 |
+
attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm);
|
5504 |
+
offload_func(attn_norm);
|
5505 |
+
|
5506 |
+
if (1) {
|
5507 |
+
cur = attn_norm;
|
5508 |
+
}
|
5509 |
+
|
5510 |
+
// compute QKV
|
5511 |
+
|
5512 |
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
5513 |
+
offload_func_kq(cur);
|
5514 |
+
|
5515 |
+
if (clamp_kqv > 0.0f) {
|
5516 |
+
cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv);
|
5517 |
+
offload_func_kq(cur);
|
5518 |
+
}
|
5519 |
+
|
5520 |
+
const size_t wsize = ggml_type_size(cur->type);
|
5521 |
+
|
5522 |
+
struct ggml_tensor * Qcur = ggml_view_3d(
|
5523 |
+
ctx0, cur, n_embd_head, n_head, n_tokens,
|
5524 |
+
wsize * n_embd_head,
|
5525 |
+
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
5526 |
+
0);
|
5527 |
+
offload_func_kq(Qcur);
|
5528 |
+
|
5529 |
+
struct ggml_tensor * Kcur = ggml_view_3d(
|
5530 |
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
5531 |
+
wsize * n_embd_head,
|
5532 |
+
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
5533 |
+
wsize * n_embd_head * n_head);
|
5534 |
+
offload_func_kq(Kcur);
|
5535 |
+
|
5536 |
+
struct ggml_tensor * tmpv = ggml_view_3d(
|
5537 |
+
ctx0, cur, n_embd_head, n_head_kv, n_tokens,
|
5538 |
+
wsize * n_embd_head,
|
5539 |
+
wsize * n_embd_head * (n_head + 2 * n_head_kv),
|
5540 |
+
wsize * n_embd_head * (n_head + n_head_kv));
|
5541 |
+
offload_func_kq(Kcur);
|
5542 |
+
|
5543 |
+
ggml_set_name(Qcur, "Qcur");
|
5544 |
+
ggml_set_name(Kcur, "Kcur");
|
5545 |
+
|
5546 |
+
{
|
5547 |
+
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
|
5548 |
+
offload_func_v(Vcur);
|
5549 |
+
offload_func_v(Vcur->src[0]->src[0]);
|
5550 |
+
ggml_set_name(Vcur, "Vcur");
|
5551 |
+
|
5552 |
+
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
|
5553 |
+
offload_func_kq(k);
|
5554 |
+
ggml_set_name(k, "k");
|
5555 |
+
|
5556 |
+
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
5557 |
+
( n_ctx)*ggml_element_size(kv_self.v),
|
5558 |
+
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
5559 |
+
offload_func_v(v);
|
5560 |
+
|
5561 |
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
5562 |
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
5563 |
+
}
|
5564 |
+
|
5565 |
+
struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
5566 |
+
offload_func_kq(Q);
|
5567 |
+
ggml_set_name(Q, "Q");
|
5568 |
+
|
5569 |
+
struct ggml_tensor * K =
|
5570 |
+
ggml_view_3d(ctx0, kv_self.k,
|
5571 |
+
n_embd_head, n_kv, n_head_kv,
|
5572 |
+
ggml_element_size(kv_self.k)*n_embd_gqa,
|
5573 |
+
ggml_element_size(kv_self.k)*n_embd_head,
|
5574 |
+
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
5575 |
+
offload_func_kq(K);
|
5576 |
+
ggml_set_name(K, "K");
|
5577 |
+
|
5578 |
+
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
5579 |
+
offload_func_kq(KQ);
|
5580 |
+
ggml_set_name(KQ, "KQ");
|
5581 |
+
|
5582 |
+
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
5583 |
+
offload_func_kq(KQ_scaled);
|
5584 |
+
ggml_set_name(KQ_scaled, "KQ_scaled");
|
5585 |
+
|
5586 |
+
// TODO: replace with ggml_add()
|
5587 |
+
struct ggml_tensor * KQ_scaled_alibi =
|
5588 |
+
ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias);
|
5589 |
+
offload_func_kq(KQ_scaled_alibi);
|
5590 |
+
ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
|
5591 |
+
|
5592 |
+
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask);
|
5593 |
+
offload_func_kq(KQ_masked);
|
5594 |
+
ggml_set_name(KQ_masked, "KQ_masked");
|
5595 |
+
|
5596 |
+
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
5597 |
+
offload_func_v(KQ_soft_max);
|
5598 |
+
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
5599 |
+
|
5600 |
+
struct ggml_tensor * V =
|
5601 |
+
ggml_view_3d(ctx0, kv_self.v,
|
5602 |
+
n_kv, n_embd_head, n_head_kv,
|
5603 |
+
ggml_element_size(kv_self.v)*n_ctx,
|
5604 |
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
5605 |
+
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
5606 |
+
offload_func_v(V);
|
5607 |
+
ggml_set_name(V, "V");
|
5608 |
+
|
5609 |
+
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
5610 |
+
offload_func_v(KQV);
|
5611 |
+
ggml_set_name(KQV, "KQV");
|
5612 |
+
|
5613 |
+
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
5614 |
+
offload_func_v(KQV_merged);
|
5615 |
+
ggml_set_name(KQV_merged, "KQV_merged");
|
5616 |
+
|
5617 |
+
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
5618 |
+
offload_func_v(cur);
|
5619 |
+
ggml_set_name(cur, "KQV_merged_contiguous");
|
5620 |
+
|
5621 |
+
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
5622 |
+
offload_func(cur);
|
5623 |
+
ggml_set_name(cur, "result_wo");
|
5624 |
+
}
|
5625 |
+
|
5626 |
+
// Add the input
|
5627 |
+
cur = ggml_add(ctx0, cur, inpL);
|
5628 |
+
offload_func(cur);
|
5629 |
+
|
5630 |
+
struct ggml_tensor * attn_out = cur;
|
5631 |
+
|
5632 |
+
// feed forward
|
5633 |
+
{
|
5634 |
+
// Norm
|
5635 |
+
{
|
5636 |
+
cur = ggml_norm(ctx0, attn_out, norm_eps);
|
5637 |
+
offload_func(cur);
|
5638 |
+
|
5639 |
+
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
|
5640 |
+
offload_func(cur);
|
5641 |
+
}
|
5642 |
+
|
5643 |
+
cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
|
5644 |
+
offload_func(cur);
|
5645 |
+
|
5646 |
+
cur = ggml_gelu(ctx0, cur);
|
5647 |
+
offload_func(cur);
|
5648 |
+
cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
|
5649 |
+
offload_func(cur);
|
5650 |
+
}
|
5651 |
+
|
5652 |
+
cur = ggml_add(ctx0, cur, attn_out);
|
5653 |
+
offload_func(cur);
|
5654 |
+
// input for next layer
|
5655 |
+
inpL = cur;
|
5656 |
+
}
|
5657 |
+
|
5658 |
+
cur = inpL;
|
5659 |
+
|
5660 |
+
// norm
|
5661 |
+
{
|
5662 |
+
cur = ggml_norm(ctx0, cur, norm_eps);
|
5663 |
+
offload_func_nr(cur);
|
5664 |
+
|
5665 |
+
cur = ggml_mul(ctx0, cur, model.output_norm);
|
5666 |
+
ggml_set_name(cur, "result_norm");
|
5667 |
+
}
|
5668 |
+
|
5669 |
+
cur = ggml_mul_mat(ctx0, model.output, cur);
|
5670 |
+
ggml_set_name(cur, "result_output");
|
5671 |
+
|
5672 |
+
ggml_build_forward_expand(gf, cur);
|
5673 |
+
|
5674 |
+
ggml_free(ctx0);
|
5675 |
+
|
5676 |
+
return gf;
|
5677 |
+
}
|
5678 |
+
|
5679 |
+
static struct ggml_cgraph * llama_build_graph(
|
5680 |
+
llama_context & lctx,
|
5681 |
+
const llama_batch & batch) {
|
5682 |
+
const auto & model = lctx.model;
|
5683 |
+
|
5684 |
+
struct ggml_cgraph * result = NULL;
|
5685 |
+
|
5686 |
+
switch (model.arch) {
|
5687 |
+
case LLM_ARCH_LLAMA:
|
5688 |
+
{
|
5689 |
+
result = llm_build_llama(lctx, batch);
|
5690 |
+
} break;
|
5691 |
+
case LLM_ARCH_BAICHUAN:
|
5692 |
+
{
|
5693 |
+
result = llm_build_baichaun(lctx, batch);
|
5694 |
+
} break;
|
5695 |
+
case LLM_ARCH_FALCON:
|
5696 |
+
{
|
5697 |
+
result = llm_build_falcon(lctx, batch);
|
5698 |
+
} break;
|
5699 |
+
case LLM_ARCH_STARCODER:
|
5700 |
+
{
|
5701 |
+
result = llm_build_starcoder(lctx, batch);
|
5702 |
+
} break;
|
5703 |
+
case LLM_ARCH_PERSIMMON:
|
5704 |
+
{
|
5705 |
+
result = llm_build_persimmon(lctx, batch);
|
5706 |
+
} break;
|
5707 |
+
case LLM_ARCH_REFACT:
|
5708 |
+
{
|
5709 |
+
result = llm_build_refact(lctx, batch);
|
5710 |
+
} break;
|
5711 |
+
case LLM_ARCH_BLOOM:
|
5712 |
+
{
|
5713 |
+
result = llm_build_bloom(lctx, batch);
|
5714 |
+
} break;
|
5715 |
+
case LLM_ARCH_MPT:
|
5716 |
+
{
|
5717 |
+
result = llm_build_mpt(lctx, batch);
|
5718 |
} break;
|
5719 |
default:
|
5720 |
GGML_ASSERT(false);
|
|
|
5846 |
const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
|
5847 |
model.arch == LLM_ARCH_BAICHUAN ||
|
5848 |
model.arch == LLM_ARCH_FALCON ||
|
5849 |
+
model.arch == LLM_ARCH_REFACT ||
|
5850 |
+
model.arch == LLM_ARCH_MPT;
|
5851 |
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
|
5852 |
if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
|
5853 |
n_threads = 1;
|
|
|
     for (int i = 0; i < (int)text_utf.size(); i++) {
         const std::string & utf_char = text_utf[i];
         bool split_condition = false;

         int bytes_remain = text_utf.size() - i;
         // forward backward lookups
         const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";

         if (!split_condition && bytes_remain >= 3) {
             // 're|'ve|'ll
             if (utf_char == "\'" && (
+                (utf_char_next == "r" && utf_char_next_next == "e") ||
+                (utf_char_next == "v" && utf_char_next_next == "e") ||
+                (utf_char_next == "l" && utf_char_next_next == "l"))
                 ) {
                 split_condition = true;
             }

             else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
                 split_condition = true;
             }
+            else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                 split_condition = true;
             }
         }
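The added lines extend the GPT-2-style pre-tokenizer so that an apostrophe is checked two characters ahead for the 're, 've and 'll contractions. A simplified, standalone sketch of just that lookahead (the helper is_contraction_start below is hypothetical; the real code keeps this logic inline with utf_char_next_next and many more rules):

    // Simplified, self-contained version of the contraction lookahead added above.
    #include <cstdio>
    #include <string>

    // Returns true when the current character starts one of the 're / 've / 'll contractions.
    static bool is_contraction_start(const std::string & cur,
                                     const std::string & next,
                                     const std::string & next_next) {
        return cur == "'" &&
               ((next == "r" && next_next == "e") ||
                (next == "v" && next_next == "e") ||
                (next == "l" && next_next == "l"));
    }

    int main() {
        std::printf("%d\n", is_contraction_start("'", "l", "l") ? 1 : 0); // 1 -> split before "'ll"
        std::printf("%d\n", is_contraction_start("'", "x", "y") ? 1 : 0); // 0 -> no split
        return 0;
    }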
         const std::string name = ggml_get_name(meta);

         // TODO: avoid hardcoded tensor names - use the TN_* constants
+        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++n_attention_wv;
         }
         else if (name.find("ffn_down.weight") != std::string::npos) {
otherarch/llama_v3.cpp
CHANGED
@@ -63,9 +63,8 @@ static void llama_v3_log_callback_default(llama_v3_log_level level, const char *
 #define LLAMA_V3_LOG_WARN(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_V3_LOG_ERROR(...) llama_v3_log_internal(LLAMA_V3_LOG_LEVEL_ERROR, __VA_ARGS__)
 
-
-#if !defined(GGML_USE_CUBLAS)
 #include "ggml-alloc.h"
+#if !defined(GGML_USE_CUBLAS)
 #define LLAMA_V3_USE_ALLOCATOR
 #else
 #define LLAMA_V3_USE_SCRATCH
@@ -725,7 +724,7 @@ struct llama_v3_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend_type backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw std::runtime_error(std::runtime_error(format_old("llama.cpp: tensor '%s' is missing from model", name.c_str())));
@@ -739,7 +738,7 @@ struct llama_v3_model_loader {
         return get_tensor_for(lt, backend);
     }
 
-    struct ggml_tensor * get_tensor_for(llama_v3_load_tensor & lt, ggml_backend backend) {
+    struct ggml_tensor * get_tensor_for(llama_v3_load_tensor & lt, ggml_backend_type backend) {
         struct ggml_tensor * tensor;
         if (backend != GGML_BACKEND_CPU) {
             ggml_set_no_alloc(ggml_ctx, true);
@@ -1230,8 +1229,8 @@ static void llama_v3_model_load_internal(
 
         // "output" tensor
         {
-            ggml_backend backend_norm;
-            ggml_backend backend_output;
+            ggml_backend_type backend_norm;
+            ggml_backend_type backend_output;
             if (n_gpu_layers > int(n_layer)) { // NOLINT
                 // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
                 // on Windows however this is detrimental unless everything is on the GPU
@@ -1261,8 +1260,8 @@ static void llama_v3_model_load_internal(
 
         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; // NOLINT
-            const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD_SPLIT; // NOLINT
+            const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD; // NOLINT
+            const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_V3_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
             auto & layer = model.layers[i];
 
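The llama_v3 changes above amount to renaming the backend enum to ggml_backend_type and keeping the per-layer CPU-versus-offload choice driven by i_gpu_start. A self-contained sketch of that per-layer selection (the enum and constants below are placeholders, not the real ggml definitions):

    // Sketch of choosing a backend per layer based on i_gpu_start, mirroring the loop above.
    #include <cstdio>

    enum fake_backend_type { BACKEND_CPU, BACKEND_GPU };

    int main() {
        const int n_layer     = 8; // hypothetical layer count
        const int i_gpu_start = 5; // layers at or above this index get offloaded

        for (int i = 0; i < n_layer; ++i) {
            const fake_backend_type backend = i < i_gpu_start ? BACKEND_CPU : BACKEND_GPU;
            std::printf("layer %d -> %s\n", i, backend == BACKEND_CPU ? "CPU" : "GPU");
        }
        return 0;
    }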
prompts/mnemonics.txt
ADDED
@@ -0,0 +1,93 @@
+For each kanji character, write a Markdown‐formatted mnemonic that uses its keyword and the keyword of all its components.
+
+Kanji: 欠 (lack of)
+Components: 𠂊 (hook claw), 人 (person)
+Mnemonic: This **person** is a pirate. He lost his hand to a crocodile many years ago. Nowadays, the ***lack of*** a hand does not bother him too much. In fact, the **hook claw** that replaces it is the mark of a true pirate, so he is quite proud of it!
+
+Kanji: 類 (kind (of something))
+Components: 米 (rice), 大 (large), 頁 (page)
+Mnemonic: The waiter at a Chinese restaurant hands you a **large** menu. Each **page** has all ***kinds*** of **rice** on offer!
+
+Kanji: 燃 (burn)
+Components: 火 (fire), 然 (sort of thing)
+Mnemonic: ***Burning*** things up with **fire** is just my **sort of thing**. (Spoken like a true pyromaniac.)
+
+Kanji: 頂 (top of)
+Components: 丁 (street), 頁 (page)
+Mnemonic: To be at the ***top of*** your game, you need both practical knowledge (**street** smarts) and theoretical knowledge (having read many **pages**).
+
+Kanji: 険 (risky and steep)
+Components: 阝 (small village), 㑒 (consensus)
+Mnemonic: Everyone agrees (there is **consensus**) that the path to the **small village** is ***risky and steep***.
+
+Kanji: 困 (distressed)
+Components: 囗 (closed box), 木 (tree)
+Mnemonic: You would feel ***distressed*** too if you were a **tree** trapped in a **closed box**! I have no place to grow!
+
+Kanji: 頭 (head)
+Components: 豆 (bean), 頁 (page)
+Mnemonic: What do you have in that ***head*** of yours? A **bean** for a brain? Go read more **pages** and become more knowledgeable about the world!
+
+Kanji: 確 (certain)
+Components: 石 (stone), 冖 (roof without a chimney), 隹 (old bird)
+Mnemonic: An **old bird** has made a nest on your **roof**. What do you do? You call Misaka from a <cite>A ***Certain*** Scientific Railgun</cite> to get rid of it, of course! But she doesn’t really want to vaporize the poor thing, so she just throws a **stone** to scare it away. (What was the point of calling her, then‽)
+
+Kanji: 魚 (fish)
+Components: 𠂊 (hook claw), 田 (rice field), 灬 (fire sparks)
+Mnemonic: Catch ***fish*** with a **hook**, collect rice from the **rice field**, cook them with **fire**… And my meal is ready!
+
+Kanji: 警 (to police (something))
+Components: 敬 (respect), 言 (say)
+Mnemonic: ***To police something*** is to make people **respect** what the law **says**.
+
+Kanji: 筆 (writing brush)
+Components: 竹 (bamboo), 聿 (brush)
+Mnemonic: A traditional ***writing brush*** is a **brush** made of **bamboo**.
+
+Kanji: 獄 (prison)
+Components: 犭 (animal), 言 (say), 犬 (dog)
+Mnemonic: In ***prison***, like in the **animal** kingdom, only the toughest survive. You have to watch what you **say**. It’s a **dog**‐eat‐dog world.
+
+Kanji: 新 (new)
+Components: 立 (standing up), 木 (tree), 斤 (axe)
+Mnemonic: In order for a ***new*** construction to be made, an empty lot is needed. If there are any **trees** **standing up**, they must be cut down with an **axe**.
+
+Kanji: 怪 (suspicious)
+Components: 忄 (weak heart), 圣 (sacred)
+Mnemonic: That painting of the **Sacred** **Heart** of Jesus looks ***suspicious***. I think it might be a forgery.
+
+Kanji: 温 (warm (to the touch))
+Components: 氵 (water drops), 日 (sun), 皿 (dish)
+Mnemonic: If you leave **water** on a **dish** in the **sun**, it will get ***warm***.
+
+Kanji: 階 (floor (of a building))
+Components: 阝 (small village), 皆 (all)
+Mnemonic: It might be a **small village**, but, despite that, **all** of its buildings have many ***floors***. It’s a village of skyscrapers!
+
+Kanji: 多 (many)
+Components: 夕 (evening (before sunset)), 夕 (evening (before sunset))
+Mnemonic: Two **evenings** in a day would be one too ***many***.
+
+Kanji: 別 (separate)
+Components: 口 (mouth), 万 (ten thousand), 刂 (knife)
+Mnemonic: Tom Six is at it again. For his next flick, he wants to stitch together **ten thousand** people, **mouth**‐to‐anus. One of the most graphic and disturbing scenes will feature one of the victims using a **knife** to ***separate*** perself.
+
+Kanji: 並 (line up)
+Components: 䒑 (antlers on a wall), 业 (runway)
+Mnemonic: In order to land a plane you have to ***line up*** properly with the **runway**. The things that look like **antlers** at the end of the runway are the control towers; you should follow their instructions.
+
+Kanji: 姿 (figure)
+Components: 次 (next), 女 (woman)
+Mnemonic: The **next** **woman** that I date will have a perfect **figure**. Because I’m done with 3D women—it will *literally* be an anime figure!
+
+Kanji: 実 (real)
+Components: 宀 (roof with a chimney), 𡗗 (three people)
+Mnemonic: Living under a **roof with a chimney** with **three people** (a wife and two children)—a happy family life—is not something I could have ever imagined. It does not feel ***real***.
+
+Kanji: 謝 (apologize)
+Components: 言 (say), 射 (shoot)
+Mnemonic: **Shot** first, ***apologize*** (**say** you are sorry) later.
+
+Kanji: 提 (propose)
+Components: 扌 (left hand), 是 (go with)
+Mnemonic: