Spaces:
Runtime error
Runtime error
File size: 6,009 Bytes
2edd118 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
from argparse import Namespace
import openai
from transformers import AutoModel, AutoTokenizer
import torch
import random
import tiktoken
import re
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Lazily-initialised module-level caches. Each starts as None and is filled in
# on first use by the function that needs it, so importing this module does not
# load any model.
# Set by get_luotuo_model().
_luotuo_model = None
# Set by luotuo_en_embedding().
_luotuo_model_en = None
# Set by luotuo_en_embedding().
_luotuo_en_tokenizer = None
# Set by tiktokenizer().
_enc_model = None
def tiktokenizer(text):
    """Return how many ``cl100k_base`` tokens *text* encodes to.

    The tiktoken encoding object is created on the first call and stored in
    the module-level ``_enc_model`` so that later calls reuse it.
    """
    global _enc_model
    encoder = _enc_model
    if encoder is None:
        encoder = tiktoken.get_encoding("cl100k_base")
        _enc_model = encoder
    token_ids = encoder.encode(text)
    return len(token_ids)
def response_postprocess(text, dialogue_bra_token='「', dialogue_ket_token='」'):
    """Collapse a multi-line response into a single utterance by the first speaker.

    Each line is expected to look like ``name:<bra>content<ket>``, where the
    separator may be an ASCII colon or a fullwidth colon. Lines are read from
    the top; the contents of consecutive lines spoken by the first speaker are
    concatenated. Reading stops at the first line that either does not match
    the format or belongs to a different speaker.

    Args:
        text: The raw response, possibly spanning several lines.
        dialogue_bra_token: Opening delimiter of the spoken content.
        dialogue_ket_token: Closing delimiter of the spoken content.

    Returns:
        ``first_name + ":" + bra + merged_content + ket`` when the first line
        matches the dialogue format; otherwise ``text`` unchanged.
    """
    # The delimiters are escaped so that tokens which are regex metacharacters
    # (for example "(" / ")" or "[" / "]") are matched literally.
    # "\uff1a" is the fullwidth colon commonly used in Chinese dialogue; it is
    # spelled as an escape so it cannot be lost to character normalisation.
    line_pattern = re.compile(
        r'^(.*?)[:\uff1a]'
        + re.escape(dialogue_bra_token)
        + r'(.*?)'
        + re.escape(dialogue_ket_token)
        + r'$'
    )

    first_name = None
    utterances = []
    for line in text.split('\n'):
        match = line_pattern.match(line.strip(" "))
        if match is None:
            # A non-dialogue line ends the first speaker's turn.
            break
        curr_name = match.group(1)
        if first_name is None:
            first_name = curr_name
        elif curr_name != first_name:
            # Another speaker started talking; drop everything from here on.
            break
        utterances.append(match.group(2))

    if first_name is None:
        # The very first line was not a dialogue line: leave the text alone.
        return text
    return first_name + ":" + dialogue_bra_token + "".join(utterances) + dialogue_ket_token
def download_models():
    """Load the ``silk-road/luotuo-bert-medium`` model and move it to ``device``.

    Progress messages are printed before and after loading.

    Returns:
        The model returned by ``AutoModel.from_pretrained``, placed on the
        module-level ``device``.
    """
    print("正在下载Luotuo-Bert")
    # Extra arguments forwarded to the model via ``model_args``.
    luotuo_args = Namespace(
        do_mlm=None,
        pooler_type="cls",
        temp=0.05,
        mlp_only_train=False,
        init_embeddings_model=None,
    )
    # NOTE(review): trust_remote_code=True allows code shipped with the model
    # repository to run locally; only use with a repository you trust.
    loaded = AutoModel.from_pretrained(
        "silk-road/luotuo-bert-medium",
        trust_remote_code=True,
        model_args=luotuo_args,
    )
    loaded = loaded.to(device)
    print("Luotuo-Bert下载完毕")
    return loaded
def get_luotuo_model():
    """Return the shared Luotuo-Bert model, loading it on the first call.

    The model is stored in the module-level ``_luotuo_model`` so that
    ``download_models()`` runs at most once per successful load.
    """
    global _luotuo_model
    if _luotuo_model is not None:
        return _luotuo_model
    _luotuo_model = download_models()
    return _luotuo_model
# Tokenizer for "silk-road/luotuo-bert-medium"; loaded on first use by
# luotuo_embedding() and reused afterwards.
_luotuo_tokenizer = None


def luotuo_embedding(model, texts):
    """Embed *texts* with a Luotuo-Bert *model*.

    Args:
        model: The Luotuo-Bert model to run; it is called with
            ``sent_emb=True`` and its ``pooler_output`` is returned.
        texts: The text(s) to tokenize, passed straight to the tokenizer.
            Inputs are padded but not truncated here.

    Returns:
        The model's ``pooler_output`` for the tokenized inputs.
    """
    global _luotuo_tokenizer
    # Load the tokenizer once instead of on every call: callers such as
    # get_embedding_for_chinese() invoke this function once per chunk.
    if _luotuo_tokenizer is None:
        _luotuo_tokenizer = AutoTokenizer.from_pretrained("silk-road/luotuo-bert-medium")
    inputs = _luotuo_tokenizer(texts, padding=True, truncation=False, return_tensors="pt")
    inputs = inputs.to(device)
    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        embeddings = model(**inputs, output_hidden_states=True, return_dict=True, sent_emb=True).pooler_output
    return embeddings
def luotuo_en_embedding(texts):
    """Embed *texts* with the ``silk-road/luotuo-bert-en`` model.

    The model and its tokenizer are loaded on first use and kept in the
    module-level ``_luotuo_model_en`` / ``_luotuo_en_tokenizer``.

    Returns:
        The model's ``pooler_output`` for the tokenized inputs.
    """
    # this function implemented by Cheng
    global _luotuo_model_en
    global _luotuo_en_tokenizer

    repo_id = "silk-road/luotuo-bert-en"
    if _luotuo_model_en is None:
        # First use: load the tokenizer, then the model.
        _luotuo_en_tokenizer = AutoTokenizer.from_pretrained(repo_id)
        _luotuo_model_en = AutoModel.from_pretrained(repo_id).to(device)
    elif _luotuo_en_tokenizer is None:
        # Model already present but the tokenizer is missing.
        _luotuo_en_tokenizer = AutoTokenizer.from_pretrained(repo_id)

    encoded = _luotuo_en_tokenizer(texts, padding=True, truncation=False, return_tensors="pt")
    encoded = encoded.to(device)

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        outputs = _luotuo_model_en(**encoded, output_hidden_states=True, return_dict=True, sent_emb=True)
        embeddings = outputs.pooler_output
    return embeddings
def get_embedding_for_chinese(model, texts):
    """Embed one text or a list of texts with the Luotuo-Bert *model*.

    Each text is cut to its first 510 characters before embedding. Inputs of
    64 or more texts are embedded in chunks of 64 and the results are
    concatenated along dimension 0.

    Args:
        model: The Luotuo-Bert model; it is moved to the module-level
            ``device`` before use.
        texts: A single string or a list of strings. A list passed by the
            caller is not modified.

    Returns:
        The embeddings produced by ``luotuo_embedding`` for all texts.
    """
    max_chars = 510
    chunk_size = 64

    model = model.to(device)
    # Accept either a single string or a list of strings.
    batch = texts if isinstance(texts, list) else [texts]
    # Truncate into a new list so the caller's list is left untouched.
    batch = [entry[:max_chars] for entry in batch]

    if len(batch) < chunk_size:
        return luotuo_embedding(model, batch)

    chunks = [
        luotuo_embedding(model, batch[start:start + chunk_size])
        for start in range(0, len(batch), chunk_size)
    ]
    return torch.cat(chunks, dim=0)
def is_chinese_or_english(text):
    """Classify *text* as ``"chinese"`` or ``"english"``.

    Every character in the range U+4E00..U+9FA5 adds 4 to the Chinese score
    and every ASCII letter (A-Z, a-z) adds 1 to the English score; all other
    characters are ignored. Ties, including text with neither kind of
    character, are reported as ``"chinese"``.
    """
    chars = tuple(text)
    # Characters whose code point falls in the CJK range checked here.
    chinese_score = 4 * sum(1 for ch in chars if '\u4e00' <= ch <= '\u9fa5')
    # Upper- and lower-case ASCII letters only.
    english_score = sum(
        1 for ch in chars if ('A' <= ch <= 'Z') or ('a' <= ch <= 'z')
    )
    return "chinese" if chinese_score >= english_score else "english"
def get_embedding_for_english(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding of *text*.

    Newlines in *text* are replaced with spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The ``embedding`` field of the first item in the response's ``data``.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    first_item = response['data'][0]
    return first_item['embedding']
import os
def luotuo_openai_embedding(texts, is_chinese=None):
    """Embed *texts* with Luotuo-Bert (Chinese) or the OpenAI API (English).

    The Luotuo-Bert path is used when the ``OPENAI_API_KEY`` environment
    variable is not set or when the input is detected as Chinese; otherwise
    the OpenAI embedding is used.

    Args:
        texts: A string or a list of strings.
        is_chinese: Accepted for compatibility; currently not used.

    Returns:
        For a string, a single embedding as a list of floats. For a list, a
        list of embeddings, one per text (an empty list for empty input).
    """
    openai_key = os.environ.get("OPENAI_API_KEY")

    if isinstance(texts, list):
        # Nothing to embed; also avoids random.randint(0, -1) raising
        # ValueError on an empty list.
        if not texts:
            return []
        # NOTE: the language is detected from one randomly chosen entry, so a
        # mixed-language list may be routed differently from call to call.
        index = random.randint(0, len(texts) - 1)
        if openai_key is None or is_chinese_or_english(texts[index]) == "chinese":
            return [embed.cpu().tolist() for embed in get_embedding_for_chinese(get_luotuo_model(), texts)]
        return [get_embedding_for_english(text) for text in texts]

    if openai_key is None or is_chinese_or_english(texts) == "chinese":
        return get_embedding_for_chinese(get_luotuo_model(), texts)[0].cpu().tolist()
    return get_embedding_for_english(texts)
def get_cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors as a Python number.

    Both inputs are converted to tensors on the module-level ``device`` and
    compared along dimension 0.
    """
    lhs = torch.tensor(v1)
    lhs = lhs.to(device)
    rhs = torch.tensor(v2)
    rhs = rhs.to(device)
    similarity = torch.cosine_similarity(lhs, rhs, dim=0)
    return similarity.item()
|