Spaces:

silk-road
/

Zero-Haruhi-50_Novels-Playground

Runtime error

File size: 5,974 Bytes

aef3deb





import re

def extract_speaker(text):
    """Split a leading ``<name> :`` prefix off *text*.

    The name is everything before the first " :" separator that the
    pattern can match (it may not contain a colon itself).

    Returns:
        ``(name, content)`` when the prefix is present, where ``content``
        is the text after the separator up to the next newline, stripped
        of surrounding whitespace; ``(None, text)`` otherwise.
    """
    found = re.match(r'^([^:]+) :(.*)', text)
    if found is None:
        # No speaker prefix: hand the text back untouched.
        return None, text

    name, remainder = found.groups()
    return name, remainder.strip()


def get_line_recall(query, line):
    """Return the fraction of characters of *query* that occur in *line*.

    Every character of ``query`` is counted (duplicates included), so the
    result is ``hits / len(query)``. Returns ``0`` when either argument
    is empty.
    """
    if not (query and line):
        return 0

    available = frozenset(line)
    hits = 0
    for ch in query:
        if ch in available:
            hits += 1
    return hits / len(query)


def get_max_recall_in_lines(query, lines):
    """Find the line with the highest recall for *query*.

    Returns:
        ``(recall, index)`` for the best-scoring line; on ties the
        earliest line wins. ``(-1, -1)`` when *lines* is empty.
    """
    best = None
    for idx, candidate in enumerate(lines):
        score = get_line_recall(query, candidate)
        # Strict comparison keeps the first line among equal scores.
        if best is None or score > best[0]:
            best = (score, idx)

    if best is None:
        return (-1, -1)
    return best

def extract_dialogues_from_response(text):
    """Parse ``dialogue | relation | speaker`` rows out of *text*.

    Each line is split on '|'. Only lines with exactly three cells are
    considered; a row is kept when its last cell is not the literal
    header word "speaker" and its middle cell (case-insensitively) is one
    of the accepted relations.

    Returns:
        A list of dicts with keys ``'dialogue'``, ``'speaker'`` and
        ``'said_by'`` (the relation cell as written, whitespace-trimmed).
    """
    accepted_relations = ("said by", "thought by", "described by", "from")
    results = []

    for raw_line in text.split('\n'):
        cells = [cell.strip() for cell in raw_line.split('|')]
        if len(cells) != 3:
            continue

        utterance, relation, who = cells

        # Skip the table header row.
        if who == "speaker":
            continue

        # Cells are already trimmed, so only case needs normalising.
        if relation.lower() not in accepted_relations:
            continue

        results.append({
            'dialogue': utterance,
            'speaker': who,
            "said_by": relation,
        })

    return results


def extract_dialogues_from_glm_response(text):
    """Parse ``id | dialogue | relation | speaker`` rows out of *text*.

    Each line is split on '|'. Only lines with exactly four cells are
    considered; a row is kept when its last cell is not the literal
    header word "speaker" and its third cell (case-insensitively) is one
    of the accepted relations.

    Returns:
        A list of dicts with keys ``'id'``, ``'dialogue'``, ``'speaker'``
        and ``'said_by'``. ``'id'`` is the first cell parsed as an int;
        when that cell is not an integer, the 0-based position of the
        record in the returned list is used instead.
    """
    valid_said_by = ("said by", "thought by", "described by", "from")
    extracted_dialogues = []

    for line in text.split('\n'):
        # Split the line by '|' and strip whitespace from each part.
        parts = [part.strip() for part in line.split('|')]
        if len(parts) != 4:
            continue

        raw_id, dialogue, said_by, speaker = parts

        # Skip the table header row.
        if speaker == "speaker":
            continue

        if said_by.lower() not in valid_said_by:
            continue

        try:
            id_num = int(raw_id)
        except ValueError:
            # Fall back to an integer position rather than the bare name
            # ``id``, which in this scope is the builtin function.
            # NOTE(review): positional fallback is an assumption about
            # the intended behaviour — confirm against callers.
            id_num = len(extracted_dialogues)

        extracted_dialogues.append({
            'id': id_num,
            'dialogue': dialogue,
            'speaker': speaker,
            "said_by": said_by,
        })

    return extracted_dialogues


def has_dialogue_sentences(text: str) -> int:
    """Classify whether *text* contains a quoted, dialogue-like span.

    Returns:
        2 -- a span inside paired quotes (“”, ‘’ or 「」) contains one of
             the marker symbols or is at least 10 characters long;
        1 -- otherwise, the text has an even number of straight double
             quotes and one consecutive pair encloses a marker symbol;
        0 -- neither of the above.
    """
    quote_pairs = (
        ("“", "”"),
        ("‘", "’"),
        ("「", "」"),
    )
    # Punctuation (full-width and half-width) that marks a span as speech.
    # Every marker is a single character, so set membership is enough.
    markers = frozenset((
        '。', '!', '?', '*', '.', '？', '！', '"', '”', ',', '~', ')', '）',
        '…', ']', '♪', '，',
    ))

    def contains_marker(span):
        return any(ch in markers for ch in span)

    # Pass 1: spans enclosed by distinct opening/closing quotes.
    for opener, closer in quote_pairs:
        cursor = text.find(opener)
        while cursor != -1:
            close_at = text.find(closer, cursor + 1)
            if close_at == -1:
                # Unterminated quote: nothing more to check for this pair.
                break
            inner = text[cursor + 1:close_at]
            if len(inner) >= 10 or contains_marker(inner):
                return 2
            cursor = text.find(opener, close_at + 1)

    # Pass 2: straight double quotes, paired up in order of appearance.
    positions = [pos for pos, ch in enumerate(text) if ch == '"']
    if len(positions) % 2 == 0:  # only meaningful when quotes are balanced
        for open_at, close_at in zip(positions[0::2], positions[1::2]):
            if contains_marker(text[open_at + 1:close_at]):
                return 1

    return 0

def replace_recalled_dialogue(raw_text, response_text):
    """Rewrite lines of *raw_text* as ``speaker : dialogue`` where matched.

    Dialogues are parsed from *response_text* with
    ``extract_dialogues_from_response``. Each dialogue (with '*' and '"'
    removed) is assigned to the line of *raw_text* with the highest
    character recall, provided that recall exceeds 0.3.

    A line with assigned dialogues is replaced by one
    ``speaker : dialogue`` line per dialogue (straight and curly double
    quotes removed from the dialogue). Exception: when exactly one
    dialogue is assigned and its speaker is blank or contains "未知"
    ("unknown"), the original line is kept. Lines with no assigned
    dialogue are kept as-is.

    Returns:
        The rewritten text, lines joined by newlines, with leading and
        trailing whitespace stripped.
    """
    dialogues = extract_dialogues_from_response(response_text)

    lines = [line.strip().strip("\u3000") for line in raw_text.split("\n")]

    # For every line, the indices of the dialogues matched to it.
    line2ids = [[] for _ in lines]

    for dia_id, dialogue in enumerate(dialogues):
        remove_symbol_text = dialogue['dialogue'].replace("*", "").replace('"', "")
        recall, lid = get_max_recall_in_lines(remove_symbol_text, lines)
        if recall > 0.3:
            line2ids[lid].append(dia_id)

    output_lines = []

    for line, dia_ids in zip(lines, line2ids):
        if not dia_ids:
            output_lines.append(line)
            continue

        if len(dia_ids) == 1:
            # Inspect the speaker of the dialogue matched to THIS line,
            # not the first dialogue of the whole response.
            only_speaker = dialogues[dia_ids[0]]['speaker']
            if "未知" in only_speaker or only_speaker.strip() == "":
                output_lines.append(line)
                continue

        for dia_id in dia_ids:
            speaker = dialogues[dia_id]['speaker']
            spoken = dialogues[dia_id]['dialogue']
            spoken = spoken.replace('"', "").replace('“', "").replace('”', "")
            output_lines.append(speaker + " : " + spoken)

    return "\n".join(output_lines).strip()