File size: 5,974 Bytes
d319ff8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176




import re

def extract_speaker(text):
    """Split a leading ``<name> :`` prefix off *text*.

    The prefix must sit at the very start of the string: one or more
    non-colon characters, then a space, then a colon.

    Returns:
        ``(name, content)`` when the prefix is present, where ``content`` is
        the text after the colon up to the next line break, with surrounding
        whitespace removed; ``(None, text)`` when there is no such prefix.
    """
    found = re.match(r'^([^:]+) :(.*)', text)
    if found is None:
        # No speaker prefix: hand the text back untouched.
        return None, text
    name, remainder = found.groups()
    return (name, remainder.strip())


def get_line_recall(query, line):
    """Return the fraction of characters of *query* that also occur in *line*.

    Every character of ``query`` is counted on its own (repeats included) and
    is tested for membership in the set of characters of ``line``.  Returns
    ``0`` when either argument is empty.
    """
    if not (query and line):
        return 0
    available = set(line)
    hits = 0
    for ch in query:
        if ch in available:
            hits += 1
    return hits / len(query)


def get_max_recall_in_lines(query, lines):
    """Find the entry of *lines* with the highest recall for *query*.

    Returns:
        A ``(recall, index)`` tuple for the best-scoring line (the earliest
        one when several share the top score), or ``(-1, -1)`` when ``lines``
        is empty.
    """
    best = None
    for position, candidate in enumerate(lines):
        score = get_line_recall(query, candidate)
        # Strict comparison keeps the first line among equal scores.
        if best is None or score > best[0]:
            best = (score, position)
    if best is None:
        return (-1, -1)
    return best

def extract_dialogues_from_response(text):
    """Parse ``dialogue | relation | speaker`` rows out of *text*.

    Each line is split on ``|`` and every field is stripped.  A line is kept
    only when it has exactly three fields, its last field is not the literal
    header word ``speaker``, and its middle field (case-insensitively) is one
    of ``said by``, ``thought by``, ``described by`` or ``from``.

    Returns:
        A list of dicts with the keys ``dialogue``, ``speaker`` and
        ``said_by`` (the middle field as written, stripped), in input order.
    """
    accepted_relations = ["said by", "thought by", "described by", "from"]

    dialogues = []
    for row in text.split('\n'):
        fields = [field.strip() for field in row.split('|')]
        if len(fields) != 3:
            continue

        content, relation, speaker = fields
        # Skip the table header row and rows with an unknown relation.
        if speaker == "speaker" or relation.lower() not in accepted_relations:
            continue

        dialogues.append({
            'dialogue': content,
            'speaker': speaker,
            "said_by": relation,
        })

    return dialogues


def extract_dialogues_from_glm_response(text):
    """Parse ``id | dialogue | relation | speaker`` rows out of *text*.

    Each line is split on ``|`` and every field is stripped.  A line is kept
    only when it has exactly four fields, its last field is not the literal
    header word ``speaker``, and its third field (case-insensitively) is one
    of ``said by``, ``thought by``, ``described by`` or ``from``.

    Returns:
        A list of dicts with the keys ``id``, ``dialogue``, ``speaker`` and
        ``said_by``, in input order.  ``id`` is the first field parsed as an
        integer; when that field is not a valid integer, the 0-based position
        of the row in the returned list is used instead, so ``id`` is always
        an ``int``.
    """
    valid_said_by = ("said by", "thought by", "described by", "from")

    extracted_dialogues = []
    for line in text.split('\n'):
        parts = [part.strip() for part in line.split('|')]
        if len(parts) != 4:
            continue

        raw_id, dialogue, said_by, speaker = parts

        # Skip the table header row.
        if speaker == "speaker":
            continue
        if said_by.lower() not in valid_said_by:
            continue

        try:
            id_num = int(raw_id)
        except ValueError:
            # Previously this fell back to the builtin function ``id``
            # (there is no local of that name), which put a function object
            # into the result.  Use the row's running index instead.
            id_num = len(extracted_dialogues)

        extracted_dialogues.append({
            'id': id_num,
            'dialogue': dialogue,
            'speaker': speaker,
            "said_by": said_by,
        })

    return extracted_dialogues


def has_dialogue_sentences(text: str) -> int:
    """Classify whether *text* contains quoted, dialogue-like content.

    Returns:
        ``2`` if some span enclosed in paired quotes (“…”, ‘…’ or 「…」)
        contains one of the punctuation symbols below or is at least 10
        characters long;
        ``1`` if the text has an even number of straight double quotes and
        one of the consecutive pairs encloses one of the symbols;
        ``0`` otherwise.
    """
    # Opening/closing quote pairs that are checked first.
    paired_quotes = [
        ("“", "”"),
        ("‘", "’"),
        ("「", "」"),
    ]
    # Punctuation that marks a quoted span as speech.  The full-width forms
    # are written as escapes so they cannot be silently folded into their
    # ASCII look-alikes; previously the list only repeated the half-width
    # '!', '?', ',' and ')' and therefore never matched the full-width ones.
    symbols = (
        '。', '!', '?', '*', '.', '"', '”', ',', '~', ')', '…', ']', '♪',
        '\uff01',  # full-width exclamation mark
        '\uff1f',  # full-width question mark
        '\uff0c',  # full-width comma
        '\uff09',  # full-width right parenthesis
    )

    def _has_symbol(content):
        return any(symbol in content for symbol in symbols)

    # Pass 1: paired quotes.
    for start_quote, end_quote in paired_quotes:
        start_index = text.find(start_quote)
        while start_index != -1:
            end_index = text.find(end_quote, start_index + 1)
            if end_index == -1:
                # Unterminated quote: nothing more to check for this pair.
                break
            quote_content = text[start_index + 1:end_index]
            if len(quote_content) >= 10 or _has_symbol(quote_content):
                return 2
            start_index = text.find(start_quote, end_index + 1)

    # Pass 2: straight double quotes, only when they can be paired up.
    quote_positions = [i for i, char in enumerate(text) if char == '"']
    if len(quote_positions) % 2 == 0:
        for start_index, end_index in zip(quote_positions[::2], quote_positions[1::2]):
            if _has_symbol(text[start_index + 1:end_index]):
                return 1

    return 0

def replace_recalled_dialogue(raw_text, response_text):
    """Rewrite lines of *raw_text* as ``speaker : dialogue`` where recognised.

    Dialogues are parsed from ``response_text`` with
    ``extract_dialogues_from_response``.  Each dialogue (with ``*`` and ``"``
    removed) is attributed to the line of ``raw_text`` with the highest
    character recall, provided that recall exceeds 0.3.  A line with
    attributed dialogues is replaced by one ``speaker : dialogue`` line per
    dialogue; every other line is kept as is (after stripping whitespace).

    A line is also kept unchanged when its single attributed dialogue has an
    unknown speaker (speaker contains "未知" or is blank).

    Returns:
        The rewritten text, lines joined with ``\\n`` and outer whitespace
        stripped.
    """
    dialogues = extract_dialogues_from_response(response_text)

    lines = [line.strip().strip("\u3000") for line in raw_text.split("\n")]

    # line2ids[i] holds the indices of the dialogues attributed to line i.
    line2ids = [[] for _ in lines]
    for dia_id, dialogue in enumerate(dialogues):
        remove_symbol_text = dialogue['dialogue'].replace("*", "").replace('"', "")
        recall, lid = get_max_recall_in_lines(remove_symbol_text, lines)
        if recall > 0.3:
            line2ids[lid].append(dia_id)

    output_lines = []
    for line, matched_ids in zip(lines, line2ids):
        if not matched_ids:
            output_lines.append(line)
            continue

        if len(matched_ids) == 1:
            # Look at the dialogue matched to THIS line; the previous code
            # always inspected dialogues[0] regardless of which one matched.
            only_speaker = dialogues[matched_ids[0]]['speaker']
            if "未知" in only_speaker or only_speaker.strip() == "":
                output_lines.append(line)
                continue

        for dia_id in matched_ids:
            speaker = dialogues[dia_id]['speaker']
            spoken = dialogues[dia_id]['dialogue']
            spoken = spoken.replace('"', "").replace('“', "").replace('”', "")
            output_lines.append(speaker + " : " + spoken)

    return "\n".join(output_lines).strip()