Create README.md
Browse files
README.md
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: mit
|
3 |
+
language:
|
4 |
+
- ru
|
5 |
+
---
|
6 |
+
|
7 |
+
RUPunct_big — самая большая модель из семейства RUPunct. Подходит для большинства задач.
|
8 |
+
|
9 |
+
Код инференса:
|
10 |
+
```py
|
11 |
+
from transformers import pipeline
|
12 |
+
from transformers import AutoTokenizer
|
13 |
+
|
14 |
+
# Hugging Face Hub identifier of the checkpoint; used for both the tokenizer
# and the model weights.
pt = "RUPunct/RUPunct_big"

# NOTE(review): strip_accents=False presumably keeps letters such as "й"/"ё"
# intact during normalisation - confirm against the tokenizer documentation.
tk = AutoTokenizer.from_pretrained(
    pt,
    strip_accents=False,
    add_prefix_space=True,
)

# Token-classification pipeline. With aggregation_strategy="first" the
# sub-word pieces are merged back into words and, per the transformers docs,
# each word takes the tag of its first token.
classifier = pipeline(
    "ner",
    model=pt,
    tokenizer=tk,
    aggregation_strategy="first",
)
|
18 |
+
|
19 |
+
|
20 |
+
# Punctuation appended for each label suffix; "O" means "no punctuation".
_PUNCT_SUFFIX = {
    "O": "",
    "PERIOD": ".",
    "COMMA": ",",
    "QUESTION": "?",
    # A dash is separated from the preceding word by a space.
    "TIRE": " —",
    "DVOETOCHIE": ":",
    "VOSKL": "!",
    "PERIODCOMMA": ";",
    "DEFIS": "-",
    "MNOGOTOCHIE": "...",
    "QUESTIONVOSKL": "?!",
}

# Case transformation selected by the label prefix.
_CASE_TRANSFORM = {
    "LOWER": lambda word: word,      # leave the token exactly as given
    "UPPER": str.capitalize,         # first letter upper-case, rest lower-case
    "UPPER_TOTAL": str.upper,        # whole token upper-case
}


def process_token(token: str, label: str) -> str:
    """Apply the casing and punctuation encoded in *label* to *token*.

    A label has the form ``<CASE>_<PUNCT>``, where ``<CASE>`` is one of
    ``LOWER``, ``UPPER`` or ``UPPER_TOTAL`` and ``<PUNCT>`` is a key of
    ``_PUNCT_SUFFIX`` (for example ``UPPER_TOTAL_COMMA``).

    Args:
        token: The word to restore.
        label: The predicted class name for that word.

    Returns:
        The token with the case transformation applied and the punctuation
        mark appended. If the label is not recognised (for instance a bare
        ``"O"``), the token is returned unchanged so that callers can always
        concatenate the result.
    """
    # Split on the LAST underscore so that "UPPER_TOTAL_PERIOD" yields
    # ("UPPER_TOTAL", "PERIOD") rather than ("UPPER", "TOTAL_PERIOD").
    case_key, _, punct_key = label.rpartition("_")
    transform = _CASE_TRANSFORM.get(case_key)
    suffix = _PUNCT_SUFFIX.get(punct_key)
    if transform is None or suffix is None:
        return token
    return transform(token) + suffix
|
87 |
+
|
88 |
+
# Interactive prompt: read a line, restore its punctuation and casing, print it.
while True:
    try:
        input_text = input(":> ")
    except (EOFError, KeyboardInterrupt):
        # End of input (Ctrl+D) or Ctrl+C: leave the loop without a traceback.
        break

    preds = classifier(input_text)
    restored = []
    for item in preds:
        word = item["word"].strip()
        if item["word"] == ".":
            # A period that is already present in the input is emitted as-is;
            # no casing or extra punctuation is applied to it.
            restored.append(word)
            continue
        piece = process_token(word, item["entity_group"])
        # Fall back to the raw word if no restoration rule produced a string,
        # so that joining the pieces can never fail.
        restored.append(word if piece is None else piece)

    # Join with single spaces so the result has no stray leading space.
    print(">>>", " ".join(restored))
|
97 |
+
```
|