Spaces:

Macropodus
/

macbert4mdcspell_v2

Running

App Files Files Community

Macropodus committed about 22 hours ago

Commit

bd24f17

verified ·

1 Parent(s): 760a845

preprocess

Browse files

Files changed (1) hide show

app.py +53 -0

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ import re
 from transformers import BertTokenizer, BertForMaskedLM
 import gradio as gr
 import torch
@@ -20,6 +21,10 @@ vocab = tokenizer.vocab
 # tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
 # model = AutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)
 # vocab = tokenizer.vocab
 def func_macro_correct(text):
@@ -127,6 +132,27 @@ def func_macro_correct(text):
     return line_dict
 def cut_sent_by_stay(text, return_length=True, add_semicolon=False):
     """  分句但是保存原标点符号  """
     if add_semicolon:
@@ -157,6 +183,27 @@ def cut_sent_by_stay(text, return_length=True, add_semicolon=False):
     if return_length:
         return text_cut, text_length_s
     return text_cut
 def func_macro_correct_long(text):
@@ -165,6 +212,12 @@ def func_macro_correct_long(text):
     text_correct = ""
     errors_new = []
     for idx, text in enumerate(texts):
         text_out = func_macro_correct(text)
         source = text_out.get("source")
         target = text_out.get("target")

 from transformers import BertTokenizer, BertForMaskedLM
 import gradio as gr
+import opencc
 import torch
 # tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
 # model = AutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)
 # vocab = tokenizer.vocab
# OpenCC converter loaded with the "t2s.json" (Traditional -> Simplified) configuration.
converter_t2s = opencc.OpenCC("t2s.json")
# Import-time smoke test of the converter. The input "汉字" is already Simplified,
# so a t2s conversion presumably returns it unchanged (it does not produce "漢字").
# NOTE(review): `context` is not referenced anywhere in the visible code.
context = converter_t2s.convert("汉字")
# ASCII punctuation -> Chinese punctuation. Every entry maps one character to one
# character, so replacing with this table never changes the text length.
PUN_EN2ZH_DICT = {",": "，", ";": "；", "!": "！", "?": "？", ":": "：", "(": "（", ")": "）", "_": "—"}
# Curly quotes and em dashes -> ASCII stand-ins, applied before the text reaches the
# model. Each replacement has the same length as its key.
# NOTE(review): when applied in insertion order, "—" is replaced before "——" is
# tried, so the "——" entry can never match (the result is "__" either way).
# NOTE(review): the single curly quotes map to a double quote '"' - confirm intended.
PUN_BERT_DICT = {"“": '"', "”": '"', "‘": '"', "’": '"', "—": "_", "——": "__"}
 def func_macro_correct(text):
     return line_dict
def transfor_english_symbol_to_chinese(text, kv_dict=PUN_EN2ZH_DICT):
    """Convert English punctuation in ``text`` to Chinese punctuation.

    Steps, in order:
      1. Apply every ``key -> value`` replacement from ``kv_dict``.
      2. Turn a trailing ASCII ``.`` into ``。`` (only the last character).
      3. Replace straight double quotes alternately with ``“`` and ``”``
         (1st, 3rd, ... occurrence opens; 2nd, 4th, ... closes).
      4. Do the same for straight single quotes with ``‘`` and ``’``.

    With the default table every substitution is one character for one
    character, so the text length (and therefore character positions) is
    preserved.

    NOTE(review): the visible caller runs ``string_q2b`` right after this
    function, which maps full-width forms such as ``，；！？：（）`` back to
    ASCII - confirm that ordering is intended.

    Args:
        text: Input string.
        kv_dict: Replacement table applied in iteration order.

    Returns:
        The converted string.
    """
    for src, dst in kv_dict.items():
        text = text.replace(src, dst)

    # Only a full stop in the very last position is converted.
    if text.endswith("."):
        text = text[:-1] + "。"

    # Opening/closing pair for each straight quote, indexed by occurrence parity.
    quote_pairs = {"\"": "“”", "'": "‘’"}
    seen = dict.fromkeys(quote_pairs, 0)
    pieces = []
    for ch in text:
        if ch in quote_pairs:
            pieces.append(quote_pairs[ch][seen[ch] % 2])
            seen[ch] += 1
        else:
            pieces.append(ch)
    return "".join(pieces)
 def cut_sent_by_stay(text, return_length=True, add_semicolon=False):
     """  分句但是保存原标点符号  """
     if add_semicolon:
     if return_length:
         return text_cut, text_length_s
     return text_cut
def transfor_bert_unk_pun_to_know(text, kv_dict=PUN_BERT_DICT):
    """Replace punctuation listed in ``kv_dict`` with its mapped substitute.

    With the default table, curly quotes and em dashes are swapped for ASCII
    stand-ins of the same length, so character positions are preserved.

    Args:
        text: Input string.
        kv_dict: Replacement table applied in iteration order.

    Returns:
        The string after all replacements.
    """
    converted = text
    for unknown, known in kv_dict.items():
        converted = converted.replace(unknown, known)
    return converted
def tradition_to_simple(text):
    """Convert Traditional Chinese text to Simplified Chinese.

    Delegates to the module-level ``converter_t2s`` OpenCC converter.
    """
    simplified = converter_t2s.convert(text)
    return simplified
def string_q2b(ustring):
    """Convert every full-width character in ``ustring`` to half-width.

    Each character is passed through ``q2b``; characters without a
    half-width counterpart are kept as they are.
    """
    return "".join(map(q2b, ustring))
def q2b(uchar):
    """Convert a single full-width character to its half-width form.

    The ideographic space (U+3000) becomes an ASCII space. Any other
    character is shifted down by 0xFEE0; if the shifted code point falls in
    the printable ASCII range 0x20-0x7E the shifted character is returned,
    otherwise the input character is returned unchanged.
    """
    code_point = ord(uchar)
    if code_point == 0x3000:
        return chr(0x0020)
    shifted = code_point - 0xfee0
    if 0x0020 <= shifted <= 0x7e:
        return chr(shifted)
    # Not a full-width form: leave the character as it is.
    return uchar
 def func_macro_correct_long(text):
     text_correct = ""
     errors_new = []
     for idx, text in enumerate(texts):
+        # 前处理
+        text = transfor_english_symbol_to_chinese(text)
+        text = string_q2b(text)
+        text = tradition_to_simple(text)
+        text = transfor_bert_unk_pun_to_know(text)
         text_out = func_macro_correct(text)
         source = text_out.get("source")
         target = text_out.get("target")