Macropodus committed on
Commit
760a845
·
verified ·
1 Parent(s): 6b2b9b7
Files changed (1) hide show
  1. app.py +71 -8
app.py CHANGED
@@ -1,9 +1,12 @@
1
  # -*- coding: utf-8 -*-
2
 
3
- import gradio as gr
4
  import operator
5
- import torch
 
 
6
  from transformers import BertTokenizer, BertForMaskedLM
 
 
7
 
8
 
9
  pretrained_model_name_or_path = "Macropodus/macbert4mdcspell_v2"
@@ -119,11 +122,73 @@ def func_macro_correct(text):
119
  else:
120
  corrected_text, details = get_errors_from_diff_length(corrected_text, text, know_tokens=vocab)
121
  print(text, ' => ', corrected_text, details)
122
- return corrected_text + ' ' + str(details)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
 
125
  if __name__ == '__main__':
126
- print(func_macro_correct('他法语说的很好,的语也不错'))
 
 
 
 
 
 
 
 
 
 
127
 
128
  examples = [
129
  "夫谷之雨,犹复云之亦从的起,因与疾风俱飘,参于天,集于的。",
@@ -141,13 +206,11 @@ if __name__ == '__main__':
141
  ]
142
 
143
  gr.Interface(
144
- func_macro_correct,
145
  inputs='text',
146
  outputs='text',
147
  title="Chinese Spelling Correction Model Macropodus/macbert4mdcspell_v2",
148
  description="Copy or input error Chinese text. Submit and the machine will correct text.",
149
  article="Link to <a href='https://github.com/yongzhuo/macro-correct' style='color:blue;' target='_blank\'>Github REPO: macro-correct</a>",
150
  examples=examples
151
- ).launch() # .launch(server_name="0.0.0.0", server_port=8036, share=False, debug=True)
152
-
153
-
 
1
  # -*- coding: utf-8 -*-
2
 
 
3
  import operator
4
+ import copy
5
+ import re
6
+
7
  from transformers import BertTokenizer, BertForMaskedLM
8
+ import gradio as gr
9
+ import torch
10
 
11
 
12
  pretrained_model_name_or_path = "Macropodus/macbert4mdcspell_v2"
 
122
  else:
123
  corrected_text, details = get_errors_from_diff_length(corrected_text, text, know_tokens=vocab)
124
  print(text, ' => ', corrected_text, details)
125
+ # return corrected_text + ' ' + str(details)
126
+ line_dict = {"source": text, "target": corrected_text, "errors": details}
127
+ return line_dict
128
+
129
+
130
# Punctuation is spelled with \u escapes on purpose: the full-width and ASCII
# forms look alike and are easily swapped by editors or text normalisers, and
# the ASCII forms of "?" and ")" are regex metacharacters.
# Ideographic full stop, ellipsis, full-width "!" and "?", ASCII "!" and "?".
_SENT_TERMINATORS = "\u3002\u2026\uff01\uff1f!?"
# Full-width and ASCII semicolon; only terminators when add_semicolon=True.
_SENT_SEMICOLONS = "\uff1b;"
# Characters that stay glued to a sentence when they directly follow its
# terminator: right double quote, right double angle bracket, full-width and
# ASCII right parenthesis, newline.
_SENT_CLOSERS = "\u201d\u300b\uff09)\n"


def cut_sent_by_stay(text, return_length=True, add_semicolon=False):
    """Split ``text`` into sentences while keeping the original punctuation.

    A sentence runs up to the first terminator (full stop, ellipsis,
    exclamation or question mark, in full-width or ASCII form) and then
    absorbs every directly following terminator or closing character
    (closing quote/bracket/parenthesis, newline). No character is dropped or
    duplicated, so ``"".join(sentences) == text`` always holds.

    Args:
        text: The string to split.
        return_length: When true, also return the ``[start, end)`` offsets of
            every sentence inside ``text``.
        add_semicolon: When true, semicolons (full-width and ASCII) also end
            a sentence.

    Returns:
        ``(sentences, spans)`` when ``return_length`` is true, where ``spans``
        is a list of ``[start, end]`` lists parallel to ``sentences``;
        otherwise just ``sentences``. Empty input yields empty lists.
    """
    terminators = _SENT_TERMINATORS
    if add_semicolon:
        terminators += _SENT_SEMICOLONS
    trailing = terminators + _SENT_CLOSERS
    # A character class needs no alternation ordering and, via re.escape,
    # cannot be broken by metacharacters such as "?".
    terminator_re = re.compile("[" + re.escape(terminators) + "]")

    sentences = []
    spans = []
    total = len(text)
    start = 0
    while start < total:
        hit = terminator_re.search(text, start)
        end = hit.start() if hit else total
        # Absorb the terminator itself plus any run of trailing punctuation.
        # Tracking `end` directly in `text` keeps the offsets exact, instead
        # of re-deriving them from the lengths of split pieces.
        while end < total and text[end] in trailing:
            end += 1
        sentences.append(text[start:end])
        spans.append([start, end])
        start = end

    if return_length:
        return sentences, spans
    return sentences
160
+
161
+
162
def func_macro_correct_long(text):
    """Correct a long text by running the corrector sentence by sentence.

    The input is split with ``cut_sent_by_stay`` (semicolons enabled), every
    piece is passed to ``func_macro_correct``, and the results are merged.

    Returns:
        A single string: the concatenated "target" texts of all pieces, a
        newline, then ``str()`` of the merged error list. Each merged error
        is ``[error[0], error[1], piece_start + error[-1]]``.
    """
    pieces, spans = cut_sent_by_stay(text, return_length=True, add_semicolon=True)
    corrected = ""
    merged_errors = []
    for piece, span in zip(pieces, spans):
        result = func_macro_correct(piece)
        corrected += result.get("target")
        # NOTE(review): error[-1] is presumably the offset inside the piece;
        # adding the piece's start rebases it onto the full text - confirm
        # against func_macro_correct's error format.
        merged_errors.extend(
            [err[0], err[1], span[0] + err[-1]] for err in result.get("errors")
        )
    return corrected + '\n' + str(merged_errors)
178
 
179
 
180
  if __name__ == '__main__':
181
+ text = """网购的烦脑
182
+ emer 发布于 2025-7-3 18:20 阅读:73
183
+
184
+ 最近网购遇到件恼火的事。我在网店看中件羽戎服,店家保正是正品,还承诺七天无里由退换。收到货后却发现袖口有开线,更糟的是拉链老是卡住。
185
+
186
+ 联系客服时,对方态度敷衔,先说让我自行缝补,后又说要扣除运废才给退。我在评沦区如实描述经历,结果发现好多消废者都有类似遭遇。
187
+
188
+ 这次购物让我明白,不能光看店家的宣全,要多查考真实评价。现在我已经学精了,下单前总会反复合对商品信息。
189
+ 网购的烦恼发布于2025-7-310期阅读:最近网购遇到件恼火的事。我在网店看中件羽绒服,店家保证是正品,还承诺七天无理由退换。收到货后却发现袖口有开线,更糟的是拉链老是卡住。联系客服时,对方态度敷衍,先说让我自行缝补,后又说要扣除运废才给退。我在评论区如实描述经历,结果发现好多消废者都有类似遭遇。这次购物让我明白,不能光看店家的宣全,要多查考真实评价。现在我已经学精了,下单前总会反复核对商品信息。
190
+ 网购的烦恼e发布于2025-7-3期期阅读:最近网购遇到件恼火的事。我在网店看中件羽绒服,店家保证是正品,还承诺七天无理由退换。收到货后却发现袖口有开线,更糟的是拉链老是卡住。联系客服时,对方态度敷衍,先说让我自行缝补,后又说要扣除运废才给退。我在评论区如实描述经历,结果发现好多消废者都有类似遭遇。这次购物让我明白,不能光看店家的宣全,要多查考真实评价。现在我已经学精了,下单前总会反复核对商品信息。网购的烦恼发布于2025-7-310期阅读:最近网购遇到件恼火的事。我在网店看中件羽绒服,店家保证是正品,还承诺七天无理由退换。收到货后却发现袖口有开线,更糟的是拉链老是卡住。联系客服时,对方态度敷衍,先说让我自行缝补,后又说要扣除运废才给退。我在评论区如实描述经历,结果发现好多消废者都有类似遭遇。这次购物让我明白,不能光看店家的宣全���要多查考真实评价。现在我已经学精了,下单前总会反复核对商品信息。"""
191
+ print(func_macro_correct_long(text))
192
 
193
  examples = [
194
  "夫谷之雨,犹复云之亦从的起,因与疾风俱飘,参于天,集于的。",
 
206
  ]
207
 
208
  gr.Interface(
209
+ func_macro_correct_long,
210
  inputs='text',
211
  outputs='text',
212
  title="Chinese Spelling Correction Model Macropodus/macbert4mdcspell_v2",
213
  description="Copy or input error Chinese text. Submit and the machine will correct text.",
214
  article="Link to <a href='https://github.com/yongzhuo/macro-correct' style='color:blue;' target='_blank\'>Github REPO: macro-correct</a>",
215
  examples=examples
216
+ ).launch()