JotunnBurton committed on
Commit
a126173
·
verified ·
1 Parent(s): 6ccb2b2

Upload re_matching.py

Browse files
Files changed (1) hide show
  1. re_matching.py +81 -0
re_matching.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+
4
def extract_language_and_text_updated(speaker, dialogue):
    """Split one speaker's dialogue into language-tagged segments.

    Args:
        speaker: Speaker token; its first and last characters (the enclosing
            brackets when called from the matchers in this module) are dropped.
        dialogue: Text containing ``<lang>text`` segments.

    Returns:
        A list of ``(LANG, text)`` tuples, with the language tag upper-cased
        and the text stripped of surrounding whitespace, followed by the bare
        speaker name as the final element.
    """
    # A <language> tag and the text that follows it, up to the next "<".
    tag_and_text = re.compile(r"<(\S+?)>([^<]+)", re.DOTALL)
    segments = [
        (found.group(1).upper(), found.group(2).strip())
        for found in tag_and_text.finditer(dialogue)
    ]
    # The speaker name always goes last, after every language segment.
    return segments + [speaker[1:-1]]
13
+
14
+
15
def validate_text(input_text):
    """Check that the input contains at least one well-formed speaker block.

    A speaker block is a bracketed speaker token such as ``[name]`` followed
    by one or more ``<lang>text`` segments.

    Args:
        input_text: Raw text to validate.

    Returns:
        A ``(is_valid, message)`` tuple: ``(True, "Input is valid.")`` on
        success, otherwise ``False`` together with an error message.
    """
    # Speaker token followed by one or more <lang>text segments.
    pattern_speaker = r"(\[\S+?\])((?:\s*<\S+?>[^<\[\]]+?)+)"
    matches = re.findall(pattern_speaker, input_text, re.DOTALL)

    # Nothing in the input looks like "[speaker]<lang>text".
    if not matches:
        return (
            False,
            "Error: No valid speaker format detected. Please check your input.",
        )

    # Re-parse each matched speaker block and make sure it really yields
    # language segments.
    for speaker, dialogue in matches:
        parsed = extract_language_and_text_updated(speaker, dialogue)
        # The helper appends the speaker name as the last element, so the
        # list as a whole is never empty; only the entries before it are
        # language segments. Testing the whole list could never fail.
        if not parsed[:-1]:
            return (
                False,
                "Error: Invalid format detected in dialogue content. Please check your input.",
            )

    return True, "Input is valid."
39
+
40
+
41
def text_matching(text: str) -> list:
    """Parse text into one entry per speaker block.

    Args:
        text: Input made of bracketed speaker tokens, each followed by its
            dialogue.

    Returns:
        A list with one element per matched speaker block, each being the
        result of ``extract_language_and_text_updated`` for that block.
    """
    # A bracketed speaker token, then at least one character of dialogue,
    # lazily extended up to the next speaker token or the end of the text.
    blocks = re.compile(r"(\[\S+?\])(.+?)(?=\[\S+?\]|$)", re.DOTALL).findall(text)
    return [extract_language_and_text_updated(who, said) for who, said in blocks]
48
+
49
+
50
def cut_para(text):
    """Split text into paragraphs at newlines.

    Args:
        text: The text to split.

    Returns:
        A list of the newline-separated pieces with surrounding whitespace
        stripped; pieces that are empty after stripping are omitted.
    """
    trimmed = (piece.strip() for piece in re.split("[\n]", text))
    # Drop pieces that were empty or whitespace-only.
    return [piece for piece in trimmed if piece]
56
+
57
+
58
def cut_sent(para):
    """Split a paragraph into sentences at sentence-ending punctuation.

    A newline is inserted after each sentence terminator and the result is
    split on newlines. Closing quotes that directly follow a terminator stay
    attached to the sentence they close.

    Args:
        para: The paragraph text.

    Returns:
        A list of sentence strings. An empty paragraph yields ``[""]``.
    """
    # Raw strings keep the regex escapes (\?, \., \…) from being read as
    # invalid string escapes, which warn on recent Python versions.
    # Every rule captures (sentence end, next character) and a newline is
    # inserted between the two. Rules are applied in this order.
    rules = (
        r"([。!;?\?])([^”’])",  # single-character terminators
        r"(\.{6})([^”’])",  # English ellipsis (six dots)
        r"(\…{2})([^”’])",  # Chinese ellipsis (two ellipsis characters)
        r"([。!?\?][”’])([^,。!?\?])",  # terminator followed by a closing quote
    )
    for rule in rules:
        para = re.sub(rule, r"\1\n\2", para)
    # Remove any trailing whitespace/newlines so no empty tail is produced.
    para = para.rstrip()
    return para.split("\n")
65
+
66
+
67
if __name__ == "__main__":
    # First sample: one speaker with no dialogue, then speakers mixing
    # several language-tagged segments.
    text = """
[说话人1]
[说话人2]<zh>你好吗?<jp>元気ですか?<jp>こんにちは,世界。<zh>你好吗?
[说话人3]<zh>谢谢。<jp>どういたしまして。
"""
    text_matching(text)

    # Second sample: run it through the parser, then print the validator's
    # verdict.
    test_text = """
[说话人1]<zh>你好,こんにちは!<jp>こんにちは,世界。
[说话人2]<zh>你好吗?
"""
    text_matching(test_text)
    print(validate_text(test_text))