Add Darija-adapted Gemma tokenizer from dataset mohamed-stifi/darija-combined-dataset
1293cfa
verified
{{ bos_token }}
{#- A leading system message is not rendered as a turn of its own: its text is
    stored in first_user_prefix and the message is dropped from the list that
    the turn loop iterates over (loop_messages). -#}
{%- if messages[0]['role'] == 'system' -%}
    {%- if messages[0]['content'] is string -%}
        {#- NOTE(review): a single line break follows the system text here; confirm
            against the upstream template whether a blank line was intended. -#}
        {%- set first_user_prefix = messages[0]['content'] + '
' -%}
    {%- else -%}
        {#- Structured content: only the 'text' field of the first part is used. -#}
        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
' -%}
    {%- endif -%}
    {%- set loop_messages = messages[1:] -%}
{%- else -%}
    {#- No system message: nothing to prepend, render every message. -#}
    {%- set first_user_prefix = "" -%}
    {%- set loop_messages = messages -%}
{%- endif -%}
{#- Render every message in loop_messages as one
    <start_of_turn>ROLE ... <end_of_turn> block. -#}
{%- for message in loop_messages -%}
    {#- Positions 0, 2, 4, ... must have role 'user'; odd positions must not. -#}
    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
    {%- endif -%}
    {#- 'assistant' is written as "model" in the turn header; any other role
        name is passed through unchanged. -#}
    {%- if (message['role'] == 'assistant') -%}
        {%- set role = "model" -%}
    {%- else -%}
        {%- set role = message['role'] -%}
    {%- endif -%}
    {#- Turn header; first_user_prefix is prepended to the first rendered turn only. -#}
    {{ '<start_of_turn>' + role + '
' + (first_user_prefix if loop.first else "") }}
    {%- if message['content'] is string -%}
        {{ message['content'] | trim }}
    {%- elif message['content'] is iterable -%}
        {#- List of typed parts: audio and image parts become placeholder tokens,
            text parts are trimmed, parts of any other type emit nothing. -#}
        {%- for item in message['content'] -%}
            {%- if item['type'] == 'audio' -%}
                {{ '<audio_soft_token>' }}
            {%- elif item['type'] == 'image' -%}
                {{ '<image_soft_token>' }}
            {%- elif item['type'] == 'text' -%}
                {{ item['text'] | trim }}
            {%- endif -%}
        {%- endfor -%}
    {%- else -%}
        {{ raise_exception("Invalid content type") }}
    {%- endif -%}
    {#- Close the turn; the line break is part of the emitted text. -#}
    {{ '<end_of_turn>
' }}
{%- endfor -%}
{#- When add_generation_prompt is set, finish with an opened "model" turn
    header that is left unterminated (no <end_of_turn>). -#}
{%- if add_generation_prompt -%}
    {{ '<start_of_turn>model
' }}
{%- endif -%}