|
import tiktoken |
|
|
|
# Module-level cache for the tiktoken encoder; stays None until
# tiktoken_counter() creates the encoder on its first call.
_enc_model = None
|
|
|
def normalize2uaua( message, if_replace_system = False ):
    """Merge consecutive same-role messages so that adjacent roles differ.

    Args:
        message: Iterable of chat messages. Each message is a mapping with
            at least a ``"role"`` and a ``"content"`` key; contents are
            joined with ``+``, so they are expected to be strings.
        if_replace_system: When true, every message whose role is
            ``"system"`` is treated as, and emitted as, a ``"user"`` message.

    Returns:
        A new list of message dicts in which no two neighbouring entries
        share a role. Contents of merged messages are joined with a newline.
        The input messages themselves are never modified.
    """
    new_message = []

    for msg in message:
        role = msg["role"]
        if if_replace_system and role == "system":
            role = "user"

        # Compare against the role actually stored in the output rather than
        # a sentinel, so an unusual first role (e.g. "") cannot index into an
        # empty list.
        if new_message and new_message[-1]["role"] == role:
            previous = new_message[-1]
            previous["content"] = previous["content"] + "\n" + msg["content"]
        else:
            # Work on a shallow copy: merging rewrites "content" and the
            # system->user replacement rewrites "role"; neither may leak
            # back into the caller's dicts.
            entry = dict(msg)
            entry["role"] = role
            new_message.append(entry)

    return new_message
|
|
|
def tiktoken_counter( text ):
    """Return the number of tokens in ``text`` under the cl100k_base encoding.

    The encoder is obtained from ``tiktoken.get_encoding`` on the first call
    and stored in the module-level ``_enc_model`` so later calls reuse it.
    """
    global _enc_model

    encoder = _enc_model
    if encoder is None:
        # First use: build the encoder once and remember it for later calls.
        encoder = tiktoken.get_encoding("cl100k_base")
        _enc_model = encoder

    token_ids = encoder.encode(text)
    return len(token_ids)
|
|
|
|
|
def string_to_base64(text):
    """Encode a string as Base64 text.

    Args:
        text: The string to encode. It is serialised as UTF-8 first.

    Returns:
        The Base64 representation of the UTF-8 bytes, as a ``str``.

    Raises:
        UnicodeEncodeError: If ``text`` contains code points that cannot be
            encoded as UTF-8 (e.g. lone surrogates).
    """
    import base64

    # Encode the whole string in one call instead of growing a bytes object
    # one character at a time, which copies the buffer on every step.
    raw = text.encode('utf-8')
    return base64.b64encode(raw).decode('utf-8')
|
|
|
def base64_to_string(base64_data):
    """Decode Base64 input and interpret the resulting bytes as UTF-8 text.

    Args:
        base64_data: Base64-encoded data, passed straight to
            ``base64.b64decode``.

    Returns:
        The decoded string.
    """
    from base64 import b64decode

    return b64decode(base64_data).decode('utf-8')
|
|
|
|
|
def float_array_to_base64(float_arr):
    """Serialise a sequence of floats as Base64 text.

    Each value is packed as a 4-byte IEEE 754 single-precision float in
    network (big-endian) byte order, so values are narrowed to float32
    precision. The concatenated bytes are then Base64-encoded.

    Args:
        float_arr: Iterable of numbers accepted by ``struct.pack('!f', ...)``.

    Returns:
        The Base64 representation of the packed floats, as a ``str``.
        An empty input yields an empty string.
    """
    import struct
    import base64

    # Materialise once so generators are supported and the count is known,
    # then pack everything in a single call rather than appending 4 bytes at
    # a time to an ever-growing bytes object.
    values = list(float_arr)
    packed = struct.pack('!%df' % len(values), *values)

    return base64.b64encode(packed).decode('utf-8')
|
|
|
def base64_to_float_array(base64_data):
    """Decode Base64 input into a list of floats.

    The decoded bytes are read in consecutive 4-byte chunks, each unpacked
    as a big-endian single-precision float (``'!f'``).

    Args:
        base64_data: Base64-encoded data, passed to ``base64.b64decode``.

    Returns:
        A list of Python floats, one per 4-byte chunk.
    """
    from struct import unpack
    from base64 import b64decode

    raw = b64decode(base64_data)
    # Step through the buffer four bytes at a time; unpack returns a 1-tuple.
    return [unpack('!f', raw[start:start + 4])[0] for start in range(0, len(raw), 4)]
|
|
|
def load_datas_from_jsonl( file_path ):
    """Load every record from a JSON Lines file.

    Args:
        file_path: Path of a UTF-8 text file holding one JSON document per
            line.

    Returns:
        A list with the parsed value of each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    import json

    datas = []
    with open(file_path, 'r', encoding = 'utf-8') as f:
        for line in f:
            # Blank lines (typically a trailing newline or a separator left
            # by an editor) are not JSON documents; skip them instead of
            # failing the whole load.
            if not line.strip():
                continue
            datas.append(json.loads(line))
    return datas
|
|
|
def save_datas_to_jsonl( file_path, datas ):
    """Write records to a JSON Lines file, one JSON document per line.

    The file is opened for writing (replacing any existing content) as
    UTF-8, and non-ASCII characters are written as-is rather than escaped.

    Args:
        file_path: Destination path.
        datas: Iterable of JSON-serialisable objects.
    """
    import json

    with open(file_path, 'w', encoding = 'utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in datas
        )