silk-road's picture
Upload 13 files
2edd118
raw
history blame
No virus
6.01 kB
from argparse import Namespace
import openai
from transformers import AutoModel, AutoTokenizer
import torch
import random
import tiktoken
import re
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
_luotuo_model = None
_luotuo_model_en = None
_luotuo_en_tokenizer = None
_enc_model = None
def tiktokenizer( text ):
global _enc_model
if _enc_model is None:
_enc_model = tiktoken.get_encoding("cl100k_base")
return len(_enc_model.encode(text))
def response_postprocess(text,dialogue_bra_token = '「',dialogue_ket_token = '」'):
lines = text.split('\n')
new_lines = ""
first_name = None
for line in lines:
line = line.strip(" ")
match = re.match(r'^(.*?)[::]' + dialogue_bra_token + r"(.*?)" + dialogue_ket_token + r"$", line)
if match:
curr_name = match.group(1)
# print(curr_name)
if first_name is None:
first_name = curr_name
new_lines += (match.group(2))
else:
if curr_name != first_name:
return first_name + ":" + dialogue_bra_token + new_lines + dialogue_ket_token
else:
new_lines += (match.group(2))
else:
if first_name == None:
return text
else:
return first_name + ":" + dialogue_bra_token + new_lines + dialogue_ket_token
return first_name + ":" + dialogue_bra_token + new_lines + dialogue_ket_token
def download_models():
print("正在下载Luotuo-Bert")
# Import our models. The package will take care of downloading the models automatically
model_args = Namespace(do_mlm=None, pooler_type="cls", temp=0.05, mlp_only_train=False,
init_embeddings_model=None)
model = AutoModel.from_pretrained("silk-road/luotuo-bert-medium", trust_remote_code=True, model_args=model_args).to(
device)
print("Luotuo-Bert下载完毕")
return model
def get_luotuo_model():
global _luotuo_model
if _luotuo_model is None:
_luotuo_model = download_models()
return _luotuo_model
def luotuo_embedding(model, texts):
# Tokenize the texts_source
tokenizer = AutoTokenizer.from_pretrained("silk-road/luotuo-bert-medium")
inputs = tokenizer(texts, padding=True, truncation=False, return_tensors="pt")
inputs = inputs.to(device)
# Extract the embeddings
# Get the embeddings
with torch.no_grad():
embeddings = model(**inputs, output_hidden_states=True, return_dict=True, sent_emb=True).pooler_output
return embeddings
def luotuo_en_embedding( texts ):
# this function implemented by Cheng
global _luotuo_model_en
global _luotuo_en_tokenizer
if _luotuo_model_en is None:
_luotuo_en_tokenizer = AutoTokenizer.from_pretrained("silk-road/luotuo-bert-en")
_luotuo_model_en = AutoModel.from_pretrained("silk-road/luotuo-bert-en").to(device)
if _luotuo_en_tokenizer is None:
_luotuo_en_tokenizer = AutoTokenizer.from_pretrained("silk-road/luotuo-bert-en")
inputs = _luotuo_en_tokenizer(texts, padding=True, truncation=False, return_tensors="pt")
inputs = inputs.to(device)
with torch.no_grad():
embeddings = _luotuo_model_en(**inputs, output_hidden_states=True, return_dict=True, sent_emb=True).pooler_output
return embeddings
def get_embedding_for_chinese(model, texts):
model = model.to(device)
# str or strList
texts = texts if isinstance(texts, list) else [texts]
# 截断
for i in range(len(texts)):
if len(texts[i]) > 510:
texts[i] = texts[i][:510]
if len(texts) >= 64:
embeddings = []
chunk_size = 64
for i in range(0, len(texts), chunk_size):
embeddings.append(luotuo_embedding(model, texts[i: i + chunk_size]))
return torch.cat(embeddings, dim=0)
else:
return luotuo_embedding(model, texts)
def is_chinese_or_english(text):
text = list(text)
is_chinese, is_english = 0, 0
for char in text:
# 判断字符的Unicode值是否在中文字符的Unicode范围内
if '\u4e00' <= char <= '\u9fa5':
is_chinese += 4
# 判断字符是否为英文字符(包括大小写字母和常见标点符号)
elif ('\u0041' <= char <= '\u005a') or ('\u0061' <= char <= '\u007a'):
is_english += 1
if is_chinese >= is_english:
return "chinese"
else:
return "english"
def get_embedding_for_english(text, model="text-embedding-ada-002"):
text = text.replace("\n", " ")
return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']
import os
def luotuo_openai_embedding(texts, is_chinese= None ):
"""
when input is chinese, use luotuo_embedding
when input is english, use openai_embedding
texts can be a list or a string
when texts is a list, return a list of embeddings, using batch inference
when texts is a string, return a single embedding
"""
openai_key = os.environ.get("OPENAI_API_KEY")
if isinstance(texts, list):
index = random.randint(0, len(texts) - 1)
if openai_key is None or is_chinese_or_english(texts[index]) == "chinese":
return [embed.cpu().tolist() for embed in get_embedding_for_chinese(get_luotuo_model(), texts)]
else:
return [get_embedding_for_english(text) for text in texts]
else:
if openai_key is None or is_chinese_or_english(texts) == "chinese":
return get_embedding_for_chinese(get_luotuo_model(), texts)[0].cpu().tolist()
else:
return get_embedding_for_english(texts)
# compute cosine similarity between two vector
def get_cosine_similarity( v1, v2):
v1 = torch.tensor(v1).to(device)
v2 = torch.tensor(v2).to(device)
return torch.cosine_similarity(v1, v2, dim=0).item()