sid22669 committed on
Commit
d67a3ea
·
verified ·
1 Parent(s): 063b3d4

Upload tokenizer

Browse files
chat_template.jinja ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- set system_message = "## Identity:
2
+ You are an MCQ Generation Assistant, designed to generate precise and domain-relevant multiple-choice questions (MCQs) based on user input.
3
+
4
+ ## Capabilities:
5
+ You are capable of generating high-quality MCQs strictly within the following domains:
6
+ - Python
7
+ - Excel
8
+ - Statistics
9
+ - Power BI
10
+ - Tableau
11
+
12
+ ## Limitations:
13
+ - You must not respond to any prompts outside MCQ generation.
14
+ - You must not generate MCQs outside the approved domains.
15
+ - If the input query is invalid or unrelated, respond with \"Error\" (string only).
16
+
17
+ ## Output Format:
18
+ Your output will always be a Python list of dictionaries, each dictionary containing the following keys:
19
+ - question_no: (int) The serial number of the question
20
+ - Question: (str) The question text
21
+ - Option_a, Option_b, Option_c, Option_d: (str) Four options for the user to choose from
22
+ - correct_answer: (str) The correct option’s full text (not just the letter)
23
+
24
+ ## Example Input & Output:
25
+
26
+ User Input:
27
+ Generate 1 MCQ on Python strings.
28
+
29
+ Expected Output:
30
+ [
31
+ {
32
+ 'question_no': 1,
33
+ 'Question': 'What is the purpose of the square brackets in indexing a string in Python?',
34
+ 'Option_a': 'To access the first character of the string',
35
+ 'Option_b': 'To access the last character of the string',
36
+ 'Option_c': 'To access the character at a specific position in the string',
37
+ 'Option_d': 'To access the character at the beginning of the string',
38
+ 'correct_answer': 'To access the character at a specific position in the string'
39
+ }
40
+ ]
41
+
42
+ ## Invalid Example:
43
+
44
+ User Input:
45
+ Who are you?
46
+
47
+ Expected Output:
48
+ \"Error\"" %}
49
+
50
+ {%- set loop_messages = messages %}
51
+ {%- if not tools is defined %}
52
+ {%- set tools = none %}
53
+ {%- endif %}
54
+ {%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %}
55
+
56
+ {#- Validate turn order before rendering: ns.index counts only the messages that are neither tool results nor assistant tool calls. -#}{%- set ns = namespace() %}
57
+ {%- set ns.index = 0 %}
58
+ {%- for message in loop_messages %}
59
+ {#- Messages with role "tool"/"tool_results", or carrying tool_calls, are skipped and do not advance ns.index. -#}{%- if not (message.role == "tool" or message.role == "tool_results" or (message.tool_calls is defined and message.tool_calls is not none)) %}
60
+ {#- Counted messages at even positions must have role "user" and those at odd positions must not. NOTE(review): loop_messages is the unmodified messages list, so a "system" message at position 0 also raises here. -#}{%- if (message["role"] == "user") != (ns.index % 2 == 0) %}
61
+ {{- raise_exception("After the system message, conversation roles must alternate user/assistant/user/assistant/...") }}
62
+ {%- endif %}
63
+ {%- set ns.index = ns.index + 1 %}
64
+ {%- endif %}
65
+ {%- endfor %}
66
+
67
+ {{- bos_token }}
68
+ {%- for message in loop_messages %}
69
+ {%- if message["role"] == "user" %}
70
+ {%- if tools is not none and (message == user_messages[-1]) %}
71
+ {{- "[AVAILABLE_TOOLS] [" }}
72
+ {%- for tool in tools %}
73
+ {%- set tool = tool.function %}
74
+ {{- '{"type": "function", "function": {' }}
75
+ {%- for key, val in tool.items() if key != "return" %}
76
+ {%- if val is string %}
77
+ {{- '"' + key + '": "' + val + '"' }}
78
+ {%- else %}
79
+ {{- '"' + key + '": ' + val|tojson }}
80
+ {%- endif %}
81
+ {%- if not loop.last %}
82
+ {{- ", " }}
83
+ {%- endif %}
84
+ {%- endfor %}
85
+ {{- "}}" }}
86
+ {%- if not loop.last %}
87
+ {{- ", " }}
88
+ {%- else %}
89
+ {{- "]" }}
90
+ {%- endif %}
91
+ {%- endfor %}
92
+ {{- "[/AVAILABLE_TOOLS]" }}
93
+ {%- endif %}
94
+ {#- Prepend the built-in system prompt to the final user turn. The check looks for any later user message instead of using loop.last, so the prompt is still emitted when assistant or tool messages follow the last user turn. -#}{%- if loop_messages[loop.index:] | selectattr("role", "equalto", "user") | list | length == 0 %}
95
+ {{- "[INST] " + system_message + "
96
+
97
+ " + message["content"] + "[/INST]" }}
98
+ {%- else %}
99
+ {{- "[INST] " + message["content"] + "[/INST]" }}
100
+ {%- endif %}
101
+ {%- elif message.tool_calls is defined and message.tool_calls is not none %}
102
+ {{- "[TOOL_CALLS] [" }}
103
+ {%- for tool_call in message.tool_calls %}
104
+ {%- set out = tool_call.function|tojson %}
105
+ {{- out[:-1] }}
106
+ {%- if not tool_call.id is defined or tool_call.id|length != 9 %}
107
+ {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}
108
+ {%- endif %}
109
+ {{- ', "id": "' + tool_call.id + '"}' }}
110
+ {%- if not loop.last %}
111
+ {{- ", " }}
112
+ {%- else %}
113
+ {{- "]" + eos_token }}
114
+ {%- endif %}
115
+ {%- endfor %}
116
+ {%- elif message["role"] == "assistant" %}
117
+ {{- " " + message["content"]|trim + eos_token}}
118
+ {%- elif message["role"] == "tool_results" or message["role"] == "tool" %}
119
+ {%- if message.content is defined and message.content.content is defined %}
120
+ {%- set content = message.content.content %}
121
+ {%- else %}
122
+ {%- set content = message.content %}
123
+ {%- endif %}
124
+ {{- '[TOOL_RESULTS] {"content": ' + content|string + ", " }}
125
+ {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}
126
+ {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}
127
+ {%- endif %}
128
+ {{- '"call_id": "' + message.tool_call_id + '"}[/TOOL_RESULTS]' }}
129
+ {%- else %}
130
+ {{- raise_exception("Only user and assistant roles are supported!") }}
131
+ {%- endif %}
132
+ {%- endfor %}
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
3
+ size 587404
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff