Spaces:
Running
Running
long text
Browse files
app.py
CHANGED
@@ -1,9 +1,12 @@
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
|
3 |
-
import gradio as gr
|
4 |
import operator
|
5 |
-
import
|
|
|
|
|
6 |
from transformers import BertTokenizer, BertForMaskedLM
|
|
|
|
|
7 |
|
8 |
|
9 |
pretrained_model_name_or_path = "Macropodus/macbert4mdcspell_v2"
|
@@ -119,11 +122,73 @@ def func_macro_correct(text):
|
|
119 |
else:
|
120 |
corrected_text, details = get_errors_from_diff_length(corrected_text, text, know_tokens=vocab)
|
121 |
print(text, ' => ', corrected_text, details)
|
122 |
-
return corrected_text + ' ' + str(details)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
|
125 |
if __name__ == '__main__':
|
126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
127 |
|
128 |
examples = [
|
129 |
"夫谷之雨,犹复云之亦从的起,因与疾风俱飘,参于天,集于的。",
|
@@ -141,13 +206,11 @@ if __name__ == '__main__':
|
|
141 |
]
|
142 |
|
143 |
gr.Interface(
|
144 |
-
|
145 |
inputs='text',
|
146 |
outputs='text',
|
147 |
title="Chinese Spelling Correction Model Macropodus/macbert4mdcspell_v2",
|
148 |
description="Copy or input error Chinese text. Submit and the machine will correct text.",
|
149 |
article="Link to <a href='https://github.com/yongzhuo/macro-correct' style='color:blue;' target='_blank\'>Github REPO: macro-correct</a>",
|
150 |
examples=examples
|
151 |
-
).launch()
|
152 |
-
|
153 |
-
|
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
|
|
|
3 |
import operator
|
4 |
+
import copy
|
5 |
+
import re
|
6 |
+
|
7 |
from transformers import BertTokenizer, BertForMaskedLM
|
8 |
+
import gradio as gr
|
9 |
+
import torch
|
10 |
|
11 |
|
12 |
pretrained_model_name_or_path = "Macropodus/macbert4mdcspell_v2"
|
|
|
122 |
else:
|
123 |
corrected_text, details = get_errors_from_diff_length(corrected_text, text, know_tokens=vocab)
|
124 |
print(text, ' => ', corrected_text, details)
|
125 |
+
# return corrected_text + ' ' + str(details)
|
126 |
+
line_dict = {"source": text, "target": corrected_text, "errors": details}
|
127 |
+
return line_dict
|
128 |
+
|
129 |
+
|
130 |
+
def cut_sent_by_stay(text, return_length=True, add_semicolon=False):
    """Split text into sentences, keeping every punctuation mark in place.

    A sentence runs up to and including the first sentence-ending mark
    (ideographic full stop, ellipsis, full-width or ASCII exclamation and
    question marks, and optionally semicolons), plus any further enders,
    closing quote/bracket marks or newlines that directly follow it. The
    sentences are contiguous slices of ``text``, so joining them yields
    ``text`` again and every span indexes straight into ``text``.

    Args:
        text: The text to split.
        return_length: When true, also return the span of each sentence.
        add_semicolon: When true, full-width and ASCII semicolons end a
            sentence as well.

    Returns:
        ``(sentences, spans)`` when ``return_length`` is true, where
        ``spans[i]`` is ``[start, end]`` (end exclusive) and
        ``text[start:end] == sentences[i]``; otherwise only ``sentences``.
    """
    # Punctuation is spelled with escapes so that tooling which folds
    # full-width characters to ASCII cannot silently turn them into regex
    # metacharacters ("?" and ")") and break the pattern.
    # Enders: ideographic full stop, ellipsis, full-width ! ?, ASCII ! ?
    enders = "\u3002\u2026\uff01\uff1f!?"
    if add_semicolon:
        # Full-width and ASCII semicolon.
        enders += "\uff1b;"
    # Marks that stay glued to the preceding ender: right double quote,
    # right double angle bracket, full-width and ASCII ")", newline.
    closers = "\u201d\u300b\uff09)\n"

    ender_set = re.escape(enders)
    tail_set = re.escape(enders + closers)
    # Either "body + ender + trailing marks", or the ender-free remainder at
    # the very end of the text. Matching whole sentences (instead of
    # splitting and re-counting offsets) keeps the spans exact even when a
    # newline or closing quote follows the terminator.
    sentence_re = re.compile(
        "[^" + ender_set + "]*[" + ender_set + "][" + tail_set + "]*"
        "|[^" + ender_set + "]+"
    )

    text_cut = []
    text_length_s = []
    for match in sentence_re.finditer(text):
        text_cut.append(match.group())
        text_length_s.append([match.start(), match.end()])

    if return_length:
        return text_cut, text_length_s
    return text_cut
|
160 |
+
|
161 |
+
|
162 |
+
def func_macro_correct_long(text):
    """Correct a long text sentence by sentence.

    The text is split with ``cut_sent_by_stay`` (semicolons count as
    sentence ends), each sentence is passed to ``func_macro_correct``, and
    the corrected sentences are concatenated in order.

    Args:
        text: The text to correct.

    Returns:
        The corrected text, a newline, and ``str()`` of the collected error
        list. Each collected error is ``[error[0], error[1], position]``,
        where ``position`` is the sentence start offset plus ``error[-1]``.
    """
    sentences, spans = cut_sent_by_stay(text, return_length=True, add_semicolon=True)
    corrected_parts = []
    errors_new = []
    for span, sentence in zip(spans, sentences):
        result = func_macro_correct(sentence)
        corrected_parts.append(result.get("target"))
        sentence_start = span[0]
        for error in result.get("errors"):
            # The last item of an error is used as its offset inside the
            # sentence; shift it by the sentence start to index the full text.
            # NOTE(review): presumably error is (wrong, right, ..., pos) —
            # confirm against func_macro_correct's helpers.
            errors_new.append([error[0], error[1], sentence_start + error[-1]])
    return "".join(corrected_parts) + '\n' + str(errors_new)
|
178 |
|
179 |
|
180 |
if __name__ == '__main__':
|
181 |
+
text = """网购的烦脑
|
182 |
+
emer 发布于 2025-7-3 18:20 阅读:73
|
183 |
+
|
184 |
+
最近网购遇到件恼火的事。我在网店看中件羽戎服,店家保正是正品,还承诺七天无里由退换。收到货后却发现袖口有开线,更糟的是拉链老是卡住。
|
185 |
+
|
186 |
+
联系客服时,对方态度敷衔,先说让我自行缝补,后又说要扣除运废才给退。我在评沦区如实描述经历,结果发现好多消废者都有类似遭遇。
|
187 |
+
|
188 |
+
这次购物让我明白,不能光看店家的宣全,要多查考真实评价。现在我已经学精了,下单前总会反复合对商品信息。
|
189 |
+
网购的烦恼发布于2025-7-310期阅读:最近网购遇到件恼火的事。我在网店看中件羽绒服,店家保证是正品,还承诺七天无理由退换。收到货后却发现袖口有开线,更糟的是拉链老是卡住。联系客服时,对方态度敷衍,先说让我自行缝补,后又说要扣除运废才给退。我在评论区如实描述经历,结果发现好多消废者都有类似遭遇。这次购物让我明白,不能光看店家的宣全,要多查考真实评价。现在我已经学精了,下单前总会反复核对商品信息。
|
190 |
+
网购的烦恼e发布于2025-7-3期期阅读:最近网购遇到件恼火的事。我在网店看中件羽绒服,店家保证是正品,还承诺七天无理由退换。收到货后却发现袖口有开线,更糟的是拉链老是卡住。联系客服时,对方态度敷衍,先说让我自行缝补,后又说要扣除运废才给退。我在评论区如实描述经历,结果发现好多消废者都有类似遭遇。这次购物让我明白,不能光看店家的宣全,要多查考真实评价。现在我已经学精了,下单前总会反复核对商品信息。网购的烦恼发布于2025-7-310期阅读:最近网购遇到件恼火的事。我在网店看中件羽绒服,店家保证是正品,还承诺七天无理由退换。收到货后却发现袖口有开线,更糟的是拉链老是卡住。联系客服时,对方态度敷衍,先说让我自行缝补,后又说要扣除运废才给退。我在评论区如实描述经历,结果发现好多消废者都有类似遭遇。这次购物让我明白,不能光看店家的宣全���要多查考真实评价。现在我已经学精了,下单前总会反复核对商品信息。"""
|
191 |
+
print(func_macro_correct_long(text))
|
192 |
|
193 |
examples = [
|
194 |
"夫谷之雨,犹复云之亦从的起,因与疾风俱飘,参于天,集于的。",
|
|
|
206 |
]
|
207 |
|
208 |
gr.Interface(
|
209 |
+
func_macro_correct_long,
|
210 |
inputs='text',
|
211 |
outputs='text',
|
212 |
title="Chinese Spelling Correction Model Macropodus/macbert4mdcspell_v2",
|
213 |
description="Copy or input error Chinese text. Submit and the machine will correct text.",
|
214 |
article="Link to <a href='https://github.com/yongzhuo/macro-correct' style='color:blue;' target='_blank\'>Github REPO: macro-correct</a>",
|
215 |
examples=examples
|
216 |
+
).launch()
|
|
|
|