Upload 2 files
Browse files- snails_naturalness_classifier.py +121 -0
- tokenprocessing.py +27 -0
snails_naturalness_classifier.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Copyright 2024 Kyle Luoma
|
| 3 |
+
|
| 4 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
you may not use this file except in compliance with the License.
|
| 6 |
+
You may obtain a copy of the License at
|
| 7 |
+
|
| 8 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
|
| 10 |
+
Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
See the License for the specific language governing permissions and
|
| 14 |
+
limitations under the License.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from transformers import AutoTokenizer, CanineForSequenceClassification, pipeline
|
| 18 |
+
import torch
|
| 19 |
+
import pandas as pd
|
| 20 |
+
import tokenprocessing as tp
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class CanineIdentifierClassifier:
    """
    Classifies identifier (word) naturalness with the pre-trained SNAILS
    CANINE-based word-naturalness model.

    Labels: N1 = regular (natural), N2 = low naturalness, N3 = least natural.

    Attributes:
        model_name (str): Hugging Face model id used for classification.
        checkpoint (int): Checkpoint number of the model (metadata only).
        id2label (dict): Mapping from label id to label name.
        label2id (dict): Mapping from label name to label id.
        classifier (pipeline): transformers sentiment-analysis pipeline.
        identifiers (pd.DataFrame): Default identifiers used by do_batch_job.
    """

    def __init__(self, identifiers: pd.DataFrame = None):
        """
        Initialize label mappings and load the inference pipeline.

        Args:
            identifiers (pd.DataFrame, optional): Identifiers to classify in
                do_batch_job when no frame is passed explicitly. Defaults to
                an empty DataFrame.
        """
        self.model_name = "kyleluoma/SNAILS-word-naturalness-classifier"
        self.checkpoint = 5590
        self.id2label = {0: "N1", 1: "N2", 2: "N3"}
        self.label2id = {"N1": 0, "N2": 1, "N3": 2}
        # device=0 pins inference to the first CUDA device; requires a GPU.
        self.classifier = pipeline(
            "sentiment-analysis",
            model=self.model_name,
            device=0
        )
        # Fix: the original used a mutable default argument
        # (identifiers=pd.DataFrame()), which is shared across calls; build
        # the empty frame per-instance instead.
        self.identifiers = pd.DataFrame() if identifiers is None else identifiers

    def do_batch_job(self, ident_df: pd.DataFrame = None, save_as_excel: bool = False, make_tag: bool = True):
        """
        Classify every row of a DataFrame and store predictions in-place.

        Adds a "prediction" column to the frame that is processed.

        Args:
            ident_df (pd.DataFrame, optional): Frame with a "text" column to
                classify. Falls back to self.identifiers when None.
            save_as_excel (bool, optional): If True, writes the text,
                prediction, and category columns to an .xlsx file.
                Defaults to False.
            make_tag (bool, optional): If True, appends the character-class
                tag from tokenprocessing to each text before classification.
                Defaults to True.

        Returns:
            None
        """
        auto_scores = []

        # Fix: "ident_df == None" element-wise compares a DataFrame and
        # raises ValueError in a boolean context whenever a frame is passed;
        # an identity check is the correct test.
        if ident_df is None:
            ident_df = self.identifiers

        for row in ident_df.itertuples():
            if make_tag:
                # Fix: original called the bare global name "classifier"
                # (NameError when used as a library) instead of the
                # instance attribute.
                # NOTE(review): no space before the tag here, while
                # classify_identifier inserts " " before it — confirm which
                # form matches the model's training data.
                pred = self.classifier(row.text + tp.make_token_tag(row.text))
            else:
                pred = self.classifier(row.text)
            print(pred)
            auto_scores.append(pred[0]['label'])

        ident_df["prediction"] = auto_scores

        if save_as_excel:
            # NOTE(review): model_name contains "/" so this path nests an
            # extra directory level; the directories must already exist or
            # to_excel will raise.
            ident_df[['text', 'prediction', 'category']].to_excel(
                f"./classifier-inference-results/{self.model_name}-cp-{self.checkpoint}.xlsx",
                index=False
            )

    def classify_identifier(self, identifier: str, make_tag: bool = True):
        """
        Classify a single identifier string.

        Args:
            identifier (str): The identifier to classify (coerced to str).
            make_tag (bool, optional): If True, appends " " plus the
                character-class tag before classification. Defaults to True.

        Returns:
            list: Pipeline output, e.g. [{"label": "N1", "score": ...}].
        """
        identifier = str(identifier)
        if make_tag:
            identifier += (" " + tp.make_token_tag(identifier))
        pred = self.classifier(identifier)
        return pred
+
if __name__ == "__main__":
    # Quick smoke test: classify three spellings of the same concept,
    # from fully natural down to heavily abbreviated.
    demo = CanineIdentifierClassifier()
    for ident in ("WinterWeather", "WntrWthr", "WWth"):
        print(demo.classify_identifier(ident))
|
tokenprocessing.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def make_token_tag(identifier):
|
| 2 |
+
"""
|
| 3 |
+
Feature engineering for identifiers, tags each character as a vowel, consonant, number, special character, or other.
|
| 4 |
+
|
| 5 |
+
Args:
|
| 6 |
+
identifier (str): The identifier to tag.
|
| 7 |
+
|
| 8 |
+
Returns:
|
| 9 |
+
str: A string of tag characters the same length as the input string.
|
| 10 |
+
"""
|
| 11 |
+
vowels = ["a", "e", "i", "o", "u"]
|
| 12 |
+
special = ["-", "_", "@"]
|
| 13 |
+
numbers = ["1", "2", "3", "4", "5", "6", "7", "8", "9"]
|
| 14 |
+
consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"]
|
| 15 |
+
tags = ""
|
| 16 |
+
for c in identifier.lower():
|
| 17 |
+
if c in vowels:
|
| 18 |
+
tags += "^"
|
| 19 |
+
elif c in special:
|
| 20 |
+
tags += "$"
|
| 21 |
+
elif c in numbers:
|
| 22 |
+
tags += "#"
|
| 23 |
+
elif c in consonants:
|
| 24 |
+
tags += "+"
|
| 25 |
+
else:
|
| 26 |
+
tags += "*"
|
| 27 |
+
return tags
|