File size: 2,357 Bytes
3e5595b
 
46c2bfc
 
6ba25f7
9938c27
 
 
 
 
 
 
 
 
 
 
 
6ba25f7
 
 
 
 
 
 
3e5595b
 
 
 
 
 
 
81bf9b4
edc20ac
3e5595b
 
 
 
 
 
 
 
9938c27
3e5595b
b439a8f
3e5595b
 
6ba25f7
 
9938c27
6ba25f7
3e5595b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9938c27
 
f57d7c6
3e5595b
 
f57d7c6
46c2bfc
3e5595b
 
 
 
edc20ac
3e5595b
 
 
 
 
 
 
dc53b3a
 
6ba25f7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#pragma once

// Make the header self-contained: the extern declarations at the bottom of
// this file use std::string and std::vector, so include them here instead of
// relying on every includer to have pulled them in first.
#include <string>
#include <vector>

// Capacity limits for the fixed-size arrays passed across the API boundary
// below (stop sequences, banned tokens, per-GPU tensor split fractions).
// Callers must not supply more entries than these.
const int stop_token_max = 16;
const int ban_token_max = 16;
const int tensor_split_max = 16;
// match kobold's sampler list and order
// The numeric values are part of the external contract (sampler order is
// passed across the API as plain ints in generation_inputs::sampler_order),
// so do not renumber, reorder, or insert entries in the middle.
enum samplers
{
    KCPP_SAMPLER_TOP_K=0,
    KCPP_SAMPLER_TOP_A=1,
    KCPP_SAMPLER_TOP_P=2,
    KCPP_SAMPLER_TFS=3,
    KCPP_SAMPLER_TYP=4,
    KCPP_SAMPLER_TEMP=5,
    KCPP_SAMPLER_REP_PEN=6,
    KCPP_SAMPLER_MAX // count of real samplers; also the size of sampler_order[]
};
// Why the most recent generation stopped. Values are explicit because they
// are reported to callers as plain ints (see last_stop_reason below).
enum stop_reason
{
    INVALID=-1,        // no generation has completed yet / state unknown
    OUT_OF_TOKENS=0,   // hit the requested max_length budget
    EOS_TOKEN=1,       // model emitted its end-of-sequence token
    CUSTOM_STOPPER=2,  // matched one of the caller-supplied stop_sequence entries
};
// Parameters for loading a model.
// NOTE(review): the exact field order and types look like a foreign/FFI
// struct layout mirrored on the caller's side — confirm before reordering,
// resizing, or inserting fields.
struct load_model_inputs
{
    const int threads;              // worker threads for generation
    const int blasthreads;          // threads used during BLAS batch processing
    const int max_context_length;
    const int batch_size;
    const bool f16_kv;              // presumably 16-bit KV cache — confirm against loader
    const bool low_vram;
    const bool use_mmq;
    const char * executable_path;
    const char * model_filename;
    const char * lora_filename;
    const char * lora_base;         // base model used when applying the LoRA — confirm
    const bool use_mmap;
    const bool use_mlock;
    const bool use_smartcontext;
    const int clblast_info = 0;     // backend/device selector — semantics defined by loader
    const int cublas_info = 0;
    const int blasbatchsize = 512;
    const int debugmode = 0;
    const int forceversion = 0;     // 0 = auto-detect file format version
    const int gpulayers = 0;        // number of layers offloaded to GPU
    const float rope_freq_scale = 1.0f;
    const float rope_freq_base = 10000.0f;
    const char * banned_tokens[ban_token_max];      // fixed-size; unused slots expected null/empty
    const float tensor_split[tensor_split_max];     // per-device split fractions for multi-GPU
};
// Per-request generation parameters.
// NOTE(review): field order/types appear to mirror a foreign/FFI struct on
// the caller's side — confirm before changing the layout.
struct generation_inputs
{
    const int seed;                 // RNG seed for sampling
    const char *prompt;
    const int max_context_length;
    const int max_length;           // max tokens to generate this request
    const float temperature;
    const int top_k;
    const float top_a = 0.0f;       // 0 disables top-a sampling
    const float top_p;
    const float typical_p;
    const float tfs;                // tail-free sampling parameter
    const float rep_pen;            // repetition penalty multiplier
    const int rep_pen_range;        // how many recent tokens the penalty covers
    const int mirostat = 0;         // 0 = off; nonzero selects mirostat mode
    const float mirostat_eta;
    const float mirostat_tau;
    const samplers sampler_order[KCPP_SAMPLER_MAX]; // order samplers are applied in
    const int sampler_len;          // number of valid entries in sampler_order
    const bool unban_tokens_rt;
    const char * stop_sequence[stop_token_max];     // fixed-size; unused slots expected null/empty
    const bool stream_sse;          // stream tokens via server-sent events
    const char * grammar;           // optional grammar constraining output
    const bool grammar_retain_state;
};
// Result of a generation request.
struct generation_outputs
{
    int status = -1;      // -1 until set by the generator; see implementation for codes
    char text[24576]; //24kb should be enough for any response
};

// Shared global state, defined in the implementation file.
extern std::string executable_path;
extern std::string lora_filename;
extern std::string lora_base;
extern std::vector<std::string> generated_tokens;   // tokens produced by the current/last run
extern bool generation_finished;                    // set when the last request completed
extern float last_eval_time;                        // timing stats from the last request
extern float last_process_time;
extern int last_token_count;
extern stop_reason last_stop_reason;                // why the last generation stopped