JingzeShi committed on
Commit
b1332cb
·
verified ·
1 Parent(s): 06f187c

Upload tokenizer

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
chat_template.jinja CHANGED
@@ -1,109 +1,108 @@
1
- {{- bos_token }}
2
- {%- if custom_tools is defined %}
3
- {%- set tools = custom_tools %}
4
- {%- endif %}
5
- {%- if not tools_in_user_message is defined %}
6
- {%- set tools_in_user_message = true %}
7
- {%- endif %}
8
- {%- if not date_string is defined %}
9
- {%- set date_string = "December 2024" %}
10
- {%- endif %}
11
- {%- if not tools is defined %}
12
- {%- set tools = none %}
13
- {%- endif %}
14
-
15
- {#- This block extracts the system message, so we can slot it into the right place. #}
16
- {%- if messages[0]['role'] == 'system' %}
17
- {%- set system_message = messages[0]['content']|trim %}
18
- {%- set messages = messages[1:] %}
19
- {%- else %}
20
- {%- set system_message = "" %}
21
- {%- endif %}
22
-
23
- {#- System message + builtin tools #}
24
- {{- "<|start_header_id|>system<|end_header_id|>\n" }}
25
- {%- if builtin_tools is defined or tools is not none %}
26
- {{- "Environment: ipython\n" }}
27
- {%- endif %}
28
- {%- if builtin_tools is defined %}
29
- {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
30
- {%- endif %}
31
- {{- "Cutting Knowledge Date: December 2024\n" }}
32
- {{- "Today Date: " + date_string + "\n" }}
33
- {%- if tools is not none and not tools_in_user_message %}
34
- {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
35
- {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
36
- {{- "Do not use variables.\n\n" }}
37
- {%- for t in tools %}
38
- {{- t | tojson(indent=4) }}
39
- {{- "\n\n" }}
40
- {%- endfor %}
41
- {%- endif %}
42
- {{- system_message }}
43
- {{- "<|end_of_text|>\n\n" }}
44
-
45
- {#- Custom tools are passed in a user message with some extra guidance #}
46
- {%- if tools_in_user_message and not tools is none %}
47
- {#- Extract the first user message so we can plug it in here #}
48
- {%- if messages | length != 0 %}
49
- {%- set first_user_message = messages[0]['content']|trim %}
50
- {%- set messages = messages[1:] %}
51
- {%- else %}
52
- {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
53
- {%- endif %}
54
- {{- '<|start_header_id|>user<|end_header_id|>\n' -}}
55
- {{- "Given the following functions, please respond with a JSON for a function call " }}
56
- {{- "with its proper arguments that best answers the given prompt.\n\n" }}
57
- {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
58
- {{- "Do not use variables.\n\n" }}
59
- {%- for t in tools %}
60
- {{- t | tojson(indent=4) }}
61
- {{- "\n\n" }}
62
- {%- endfor %}
63
- {{- first_user_message + "<|end_of_text|>\n\n"}}
64
- {%- endif %}
65
-
66
- {%- for message in messages %}
67
- {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
68
- {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n'+ message['content'] | trim + '<|end_of_text|>\n\n' }}
69
- {%- elif 'tool_calls' in message %}
70
- {%- if not message.tool_calls|length == 1 %}
71
- {{- raise_exception("This model only supports single tool-calls at once!") }}
72
- {%- endif %}
73
- {%- set tool_call = message.tool_calls[0].function %}
74
- {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
75
- {{- '<|start_header_id|>assistant<|end_header_id|>\n' -}}
76
- {{- "<|python_tag|>" + tool_call.name + ".call(" }}
77
- {%- for arg_name, arg_val in tool_call.arguments | items %}
78
- {{- arg_name + '="' + arg_val + '"' }}
79
- {%- if not loop.last %}
80
- {{- ", " }}
81
- {%- endif %}
82
- {%- endfor %}
83
- {{- ")" }}
84
- {%- else %}
85
- {{- '<|start_header_id|>assistant<|end_header_id|>\n' -}}
86
- {{- '{"name": "' + tool_call.name + '", ' }}
87
- {{- '"parameters": ' }}
88
- {{- tool_call.arguments | tojson }}
89
- {{- "}" }}
90
- {%- endif %}
91
- {%- if builtin_tools is defined %}
92
- {#- This means we're in ipython mode #}
93
- {{- "<|end_of_text|>\n\n" }}
94
- {%- else %}
95
- {{- "<|end_of_text|>\n\n" }}
96
- {%- endif %}
97
- {%- elif message.role == "tool" or message.role == "ipython" %}
98
- {{- "<|start_header_id|>ipython<|end_header_id|>\n" }}
99
- {%- if message.content is mapping or message.content is iterable %}
100
- {{- message.content | tojson }}
101
- {%- else %}
102
- {{- message.content }}
103
- {%- endif %}
104
- {{- "<|end_of_text|>\n\n" }}
105
- {%- endif %}
106
- {%- endfor %}
107
- {%- if add_generation_prompt %}
108
- {{- '<|start_header_id|>assistant<|end_header_id|>\n' }}
109
- {%- endif %}
 
1
+ {%- if not documents is defined %}
2
+ {%- set documents = none %}
3
+ {%- endif %}
4
+ {%- if not tools is defined %}
5
+ {%- set tools = none %}
6
+ {%- endif %}
7
+
8
+
9
+ {#- extracts the system message, so we can slot it into the right place. #}
10
+ {%- if messages[0].role == 'system' %}
11
+ {%- set system_message = messages[0].content|trim %}
12
+ {%- else %}
13
+ {%- set system_message = "You are Doge, created by SmallDoge Team. You are a helpful assistant" %}
14
+ {%- endif %}
15
+
16
+
17
+ {#- system message + documents + tools #}
18
+ {{- '<|im_start|>system\n' }}
19
+ {{- system_message }}
20
+ {%- if documents is not none %}
21
+ {{- "\n\nYou have access to the following documents. Please use them to answer the user's question.\n\n" }}
22
+ {%- for doc in documents %}
23
+ {%- if doc.title is defined %}
24
+ {{- "Title: " + doc.title + "\n" }}
25
+ {%- endif %}
26
+ {{- "Content: " + doc.text + "\n\n" }}
27
+ {%- endfor %}
28
+ {{- "If the documents don't contain relevant information, rely on your general knowledge but acknowledge when you're doing so." }}
29
+ {%- endif %}
30
+ {%- if tools is not none %}
31
+ {{- "\n\nYou may call one or more functions to assist with the user query. You are provided with function signatures within <tools></tools> XML tags:\n\n<tools>" }}
32
+ {%- for tool in tools %}
33
+ {{- "\n" }}
34
+ {{- tool | tojson }}
35
+ {%- endfor %}
36
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" }}
37
+ {%- endif %}
38
+ {{- "<|im_end|>\n" }}
39
+
40
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
41
+ {%- for message in messages[::-1] %}
42
+ {%- set index = (messages|length - 1) - loop.index0 %}
43
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
44
+ {%- set ns.multi_step_tool = false %}
45
+ {%- set ns.last_query_index = index %}
46
+ {%- endif %}
47
+ {%- endfor %}
48
+ {%- for message in messages %}
49
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
50
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
51
+ {%- elif message.role == "assistant" %}
52
+ {%- set content = message.content %}
53
+ {%- set reasoning_content = '' %}
54
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
55
+ {%- set reasoning_content = message.reasoning_content %}
56
+ {%- else %}
57
+ {%- if '</think>' in message.content %}
58
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
59
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
60
+ {%- endif %}
61
+ {%- endif %}
62
+ {%- if loop.index0 > ns.last_query_index %}
63
+ {%- if loop.last or (not loop.last and reasoning_content) %}
64
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
65
+ {%- else %}
66
+ {{- '<|im_start|>' + message.role + '\n' + content }}
67
+ {%- endif %}
68
+ {%- else %}
69
+ {{- '<|im_start|>' + message.role + '\n' + content }}
70
+ {%- endif %}
71
+ {%- if message.tool_calls %}
72
+ {%- for tool_call in message.tool_calls %}
73
+ {%- if (loop.first and content) or (not loop.first) %}
74
+ {{- '\n' }}
75
+ {%- endif %}
76
+ {%- if tool_call.function %}
77
+ {%- set tool_call = tool_call.function %}
78
+ {%- endif %}
79
+ {{- '<tool_call>\n{"name": "' }}
80
+ {{- tool_call.name }}
81
+ {{- '", "arguments": ' }}
82
+ {%- if tool_call.arguments is string %}
83
+ {{- tool_call.arguments }}
84
+ {%- else %}
85
+ {{- tool_call.arguments | tojson }}
86
+ {%- endif %}
87
+ {{- '}\n</tool_call>' }}
88
+ {%- endfor %}
89
+ {%- endif %}
90
+ {{- '<|im_end|>\n' }}
91
+ {%- elif message.role == "tool" %}
92
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
93
+ {{- '<|im_start|>user' }}
94
+ {%- endif %}
95
+ {{- '\n<tool_response>\n' }}
96
+ {{- message.content }}
97
+ {{- '\n</tool_response>' }}
98
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
99
+ {{- '<|im_end|>\n' }}
100
+ {%- endif %}
101
+ {%- endif %}
102
+ {%- endfor %}
103
+ {%- if add_generation_prompt %}
104
+ {{- '<|im_start|>assistant\n' }}
105
+ {%- if enable_thinking is defined and enable_thinking is false %}
106
+ {{- '<think>\n\n</think>\n\n' }}
107
+ {%- endif %}
108
+ {%- endif %}
 
special_tokens_map.json CHANGED
@@ -1,23 +1,23 @@
1
- {
2
- "bos_token": {
3
- "content": "<|begin_of_text|>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "<|end_of_text|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": {
17
- "content": "<|finetune_right_pad_id|>",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- }
23
- }
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin_of_text|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|im_end|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end_of_text|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff