File size: 5,974 Bytes
d319ff8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
import re
def extract_speaker(text):
    """Split a leading ``<name> :`` prefix off *text*.

    The text must start with a run of non-colon characters followed by a
    space and a colon. When it does, the part before `` :`` is the speaker
    name and the remainder of that first line (whitespace-stripped) is the
    content.

    Returns:
        A ``(name, content)`` tuple when the prefix is present, otherwise
        ``(None, text)`` with the input returned unchanged.
    """
    found = re.match(r'^([^:]+) :(.*)', text)
    if found is None:
        # No "<name> :" prefix: hand the original text back untouched.
        return None, text
    name, content = found.group(1), found.group(2)
    return (name, content.strip())
def get_line_recall(query, line):
    """Return the fraction of characters of *query* that also occur in *line*.

    Every character of ``query`` is counted (duplicates included) and
    checked for membership in the set of characters of ``line``.

    Returns:
        ``0`` when either argument is empty, otherwise a float in
        ``[0, 1]`` equal to ``hits / len(query)``.
    """
    if not query or not line:
        return 0
    available = set(line)
    hits = 0
    for ch in query:
        if ch in available:
            hits += 1
    return hits / len(query)
def get_max_recall_in_lines(query, lines):
    """Find the line with the highest character recall for *query*.

    Each entry of ``lines`` is scored with ``get_line_recall``; on ties the
    earliest line wins.

    Returns:
        A ``(recall, index)`` tuple for the best line, or ``(-1, -1)`` when
        ``lines`` is empty.
    """
    best = (-1, -1)
    seen_any = False
    for idx, candidate in enumerate(lines):
        score = get_line_recall(query, candidate)
        # Strict ">" keeps the first of several equally good lines.
        if not seen_any or score > best[0]:
            best = (score, idx)
            seen_any = True
    return best
def extract_dialogues_from_response(text):
    """Collect ``dialogue | relation | speaker`` rows from *text*.

    The text is scanned line by line. A line is kept only when splitting it
    on ``|`` yields exactly three fields, the third field is not the literal
    header word ``speaker``, and the (case-insensitive) second field is one
    of ``said by``, ``thought by``, ``described by`` or ``from``.

    Returns:
        A list of dicts with keys ``dialogue``, ``speaker`` and ``said_by``
        holding the whitespace-stripped fields, in order of appearance.
    """
    accepted_relations = ["said by", "thought by", "described by", "from"]
    collected = []
    for raw_line in text.split('\n'):
        fields = [field.strip() for field in raw_line.split('|')]
        if len(fields) != 3:
            continue
        utterance, relation, speaker = fields
        # A third column reading "speaker" marks the table header row.
        if speaker == "speaker":
            continue
        if relation.lower() not in accepted_relations:
            continue
        collected.append({
            'dialogue': utterance,
            'speaker': speaker,
            "said_by": relation,
        })
    return collected
def extract_dialogues_from_glm_response(text):
    """Collect ``id | dialogue | relation | speaker`` rows from *text*.

    The text is scanned line by line. A line is kept only when splitting it
    on ``|`` yields exactly four fields, the fourth field is not the literal
    header word ``speaker``, and the (case-insensitive) third field is one
    of ``said by``, ``thought by``, ``described by`` or ``from``.

    The first field is parsed as an integer id. When it is not a valid
    integer, the row's zero-based position in the returned list is used
    instead, so ``id`` is always an ``int``. (Previously this fallback
    stored the builtin function ``id`` in the result.)

    Returns:
        A list of dicts with keys ``id``, ``dialogue``, ``speaker`` and
        ``said_by``, in order of appearance.
    """
    valid_said_by = ("said by", "thought by", "described by", "from")
    extracted_dialogues = []
    for line in text.split('\n'):
        parts = [part.strip() for part in line.split('|')]
        if len(parts) != 4:
            continue
        raw_id, dialogue, said_by, speaker = parts
        # A fourth column reading "speaker" marks the table header row.
        if speaker == "speaker":
            continue
        if said_by.lower() not in valid_said_by:
            continue
        try:
            id_num = int(raw_id)
        except ValueError:
            # Unparseable id column: fall back to the row's position.
            id_num = len(extracted_dialogues)
        extracted_dialogues.append({
            'id': id_num,
            'dialogue': dialogue,
            'speaker': speaker,
            "said_by": said_by,
        })
    return extracted_dialogues
def has_dialogue_sentences(text: str) -> int:
    """Classify whether *text* contains a quoted, dialogue-like span.

    Returns:
        ``2`` when a span enclosed in one of the paired quote styles
        (``“ ”``, ``‘ ’``, ``「 」``) contains one of the listed symbols or
        is at least 10 characters long;
        ``1`` when the text has an even number of ASCII ``"`` characters
        and some consecutive pair of them encloses one of the symbols;
        ``0`` otherwise.
    """
    bracket_pairs = (
        ("“", "”"),
        ("‘", "’"),
        ("「", "」"),
    )
    # Punctuation whose presence inside a quote marks it as dialogue.
    symbols = ['。', '!', '?', '*', '.', '?', '!', '"', '”', ',', '~', ')', ')', '…', ']', '♪', ',']

    def contains_symbol(fragment):
        for mark in symbols:
            if mark in fragment:
                return True
        return False

    # Pass 1: paired (directional) quotes.
    for opener, closer in bracket_pairs:
        open_pos = text.find(opener)
        while open_pos != -1:
            close_pos = text.find(closer, open_pos + 1)
            if close_pos == -1:
                # Unterminated quote: nothing more to inspect for this style.
                break
            inner = text[open_pos + 1:close_pos]
            if contains_symbol(inner) or len(inner) >= 10:
                return 2
            open_pos = text.find(opener, close_pos + 1)

    # Pass 2: ASCII double quotes, only when they can be paired up evenly.
    quote_positions = [pos for pos, ch in enumerate(text) if ch == '"']
    if len(quote_positions) % 2 != 0:
        return 0
    for left, right in zip(quote_positions[0::2], quote_positions[1::2]):
        if contains_symbol(text[left + 1:right]):
            return 1
    return 0
def replace_recalled_dialogue(raw_text, response_text):
    """Rewrite lines of *raw_text* as ``speaker : dialogue`` entries.

    Dialogues are parsed from ``response_text`` with
    ``extract_dialogues_from_response``. Each dialogue (with ``*`` and ``"``
    removed) is matched to the line of ``raw_text`` with the highest
    character recall, and the match is accepted only when that recall
    exceeds 0.3.

    Lines with no accepted match are kept as-is (after stripping
    whitespace and ideographic spaces). A matched line is replaced by one
    ``speaker : dialogue`` line per dialogue assigned to it, with ASCII and
    curly double quotes removed from the dialogue. The exception is a line
    matched by exactly one dialogue whose speaker is empty or contains
    ``未知``: that line is kept unchanged.

    Returns:
        The rebuilt text, newline-joined and stripped.
    """
    dialogues = extract_dialogues_from_response(response_text)

    lines = [line.strip().strip("\u3000") for line in raw_text.split("\n")]
    # line2ids[i] lists the indices of the dialogues assigned to line i.
    line2ids = [[] for _ in lines]

    for dia_id, dialogue in enumerate(dialogues):
        cleaned = dialogue['dialogue'].replace("*", "").replace('"', "")
        recall, lid = get_max_recall_in_lines(cleaned, lines)
        if recall > 0.3:
            line2ids[lid].append(dia_id)

    output = []
    for lid, line in enumerate(lines):
        matched = line2ids[lid]
        if not matched:
            output.append(line)
            continue

        if len(matched) == 1:
            # Inspect the speaker of the dialogue matched to THIS line
            # (the old code always looked at dialogues[0]).
            speaker = dialogues[matched[0]]['speaker']
            if "未知" in speaker or speaker.strip() == "":
                output.append(line)
                continue

        for dia_id in matched:
            speaker = dialogues[dia_id]['speaker']
            spoken = dialogues[dia_id]['dialogue']
            spoken = spoken.replace('"', "").replace('“', "").replace('”', "")
            output.append(speaker + " : " + spoken)

    return "\n".join(output).strip()
|