Fix broken tokenization
- app/cli.py +23 -16
- app/constants.py +4 -0
- app/data.py +6 -6
- app/model.py +1 -2
app/cli.py
CHANGED
@@ -139,14 +139,16 @@ def evaluate(
     import gc
 
     import joblib
+    import pandas as pd
 
-    from app.constants import
+    from app.constants import TOKENIZER_CACHE_PATH
     from app.data import load_data, tokenize
     from app.model import evaluate_model
     from app.utils import deserialize, serialize
 
-    cached_data_path =
+    cached_data_path = TOKENIZER_CACHE_PATH / f"{dataset}_tokenized.pkl"
     use_cached_data = False
+
     if cached_data_path.exists():
         use_cached_data = force_cache or click.confirm(
             f"Found existing tokenized data for '{dataset}'. Use it?",
@@ -159,20 +161,22 @@ def evaluate(
 
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
-        token_data = deserialize(cached_data_path)
+        token_data = pd.Series(deserialize(cached_data_path))
         click.echo(DONE_STR)
     else:
-        click.echo("Tokenizing data... "
+        click.echo("Tokenizing data... ")
         token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
-        click.echo(DONE_STR)
 
-        click.echo("Caching tokenized data... "
-        serialize(token_data, cached_data_path)
-        click.echo(DONE_STR)
+        click.echo("Caching tokenized data... ")
+        serialize(token_data, cached_data_path, show_progress=True)
 
     del text_data
     gc.collect()
 
+    click.echo("Size of vocabulary: ", nl=False)
+    vocab = token_data.explode().value_counts()
+    click.secho(str(len(vocab)), fg="blue")
+
     click.echo("Loading model... ", nl=False)
     model = joblib.load(model_path)
     click.echo(DONE_STR)
@@ -266,8 +270,9 @@ def train(
     import gc
 
     import joblib
+    import pandas as pd
 
-    from app.constants import
+    from app.constants import MODEL_DIR, TOKENIZER_CACHE_PATH
    from app.data import load_data, tokenize
     from app.model import train_model
     from app.utils import deserialize, serialize
@@ -276,7 +281,7 @@ def train(
     if model_path.exists() and not overwrite:
         click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)
 
-    cached_data_path =
+    cached_data_path = TOKENIZER_CACHE_PATH / f"{dataset}_tokenized.pkl"
     use_cached_data = False
 
     if cached_data_path.exists():
@@ -291,20 +296,22 @@ def train(
 
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
-        token_data = deserialize(cached_data_path)
+        token_data = pd.Series(deserialize(cached_data_path))
         click.echo(DONE_STR)
     else:
-        click.echo("Tokenizing data... "
+        click.echo("Tokenizing data... ")
         token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
-        click.echo(DONE_STR)
 
-        click.echo("Caching tokenized data... "
-        serialize(token_data, cached_data_path)
-        click.echo(DONE_STR)
+        click.echo("Caching tokenized data... ")
+        serialize(token_data, cached_data_path, show_progress=True)
 
     del text_data
     gc.collect()
 
+    click.echo("Size of vocabulary: ", nl=False)
+    vocab = token_data.explode().value_counts()
+    click.secho(str(len(vocab)), fg="blue")
+
     click.echo("Training model... ")
     model, accuracy = train_model(
         token_data,
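Both commands now cache the tokenized corpus under a per-dataset filename and print the vocabulary size before the model is loaded or trained. The vocabulary count leans on pandas: deserialize() may hand back a plain sequence of token lists, so it is wrapped in a pd.Series, explode() flattens the per-document lists into one long Series of tokens, and value_counts() yields one row per distinct token. A minimal sketch of that idiom with made-up documents (the sample data is illustrative, not from the Space):

import pandas as pd

# Each element is one tokenized document, the shape the cache and tokenize() produce.
token_data = pd.Series([
    ["good", "movie", "good"],
    ["bad", "plot", "movie"],
])

# explode() flattens the per-document lists into a single Series of tokens;
# value_counts() then has one entry per distinct token.
vocab = token_data.explode().value_counts()

print(len(vocab))      # 4 distinct tokens: good, movie, bad, plot
print(vocab["movie"])  # appears twice

len(vocab) is the number the CLI prints in blue via click.secho.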
app/constants.py
CHANGED
@@ -7,6 +7,8 @@ CACHE_DIR = Path(os.getenv("CACHE_DIR", ".cache"))
 DATA_DIR = Path(os.getenv("DATA_DIR", "data"))
 MODEL_DIR = Path(os.getenv("MODEL_DIR", "models"))
 
+TOKENIZER_CACHE_PATH = CACHE_DIR / "tokenizer"
+
 SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
 SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
 
@@ -25,3 +27,5 @@ SLANGMAP_URL = "Https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-
 CACHE_DIR.mkdir(exist_ok=True, parents=True)
 DATA_DIR.mkdir(exist_ok=True, parents=True)
 MODEL_DIR.mkdir(exist_ok=True, parents=True)
+
+TOKENIZER_CACHE_PATH.mkdir(exist_ok=True, parents=True)
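The tokenizer cache lives under the existing CACHE_DIR, which is already configurable through the CACHE_DIR environment variable, and the mkdir call guarantees the directory exists at import time. A small sketch of how the pieces fit together, assuming the per-dataset filename pattern used in app/cli.py above (the "sentiment140" value is only an example):

import os
from pathlib import Path

# Mirrors app/constants.py: the cache root can be moved via an env var.
CACHE_DIR = Path(os.getenv("CACHE_DIR", ".cache"))
TOKENIZER_CACHE_PATH = CACHE_DIR / "tokenizer"
TOKENIZER_CACHE_PATH.mkdir(exist_ok=True, parents=True)

# Per-dataset cache file, as constructed in app/cli.py.
dataset = "sentiment140"  # illustrative value
cached_data_path = TOKENIZER_CACHE_PATH / f"{dataset}_tokenized.pkl"
print(cached_data_path)   # .cache/tokenizer/sentiment140_tokenized.pkl on POSIX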
app/data.py
CHANGED
@@ -89,7 +89,7 @@ def _clean(text: str) -> str:
 
     # Remove acronyms and abbreviations
     # text = re.sub(r"(?:[a-z]\.){2,}", "", text)
-    text = re.sub(r"(?:[a-z]\.?)(?:[a-z]\.)", "", text)
+    text = re.sub(r"\b(?:[a-z]\.?)(?:[a-z]\.)\b", "", text)
 
     # Remove honorifics
     text = re.sub(r"\b(?:mr|mrs|ms|dr|prof|sr|jr)\.?\b", "", text)
@@ -118,7 +118,7 @@ def _clean(text: str) -> str:
     return text.strip()
 
 
-def _lemmatize(doc: Doc, threshold: int = 2) -> Sequence[str]:
+def _lemmatize(doc: Doc, threshold: int = 3) -> Sequence[str]:
     """Lemmatize the provided text using spaCy.
 
     Args:
@@ -136,8 +136,8 @@ def _lemmatize(doc: Doc, threshold: int = 2) -> Sequence[str]:
         and not token.like_email  # Ignore email addresses
         and not token.like_url  # Ignore URLs
         and not token.like_num  # Ignore numbers
-        and
-        and
+        and token.is_alpha  # Ignore non-alphabetic tokens
+        and (len(tok := token.lemma_.lower().strip()) >= threshold)  # Ignore short tokens
     ]
 
 
@@ -145,7 +145,7 @@ def tokenize(
     text_data: Sequence[str],
     batch_size: int = 512,
     n_jobs: int = 4,
-    character_threshold: int =
+    character_threshold: int = 3,
     show_progress: bool = True,
 ) -> Sequence[Sequence[str]]:
     """Tokenize the provided text using spaCy.
@@ -174,7 +174,7 @@ def tokenize(
         [
            _lemmatize(doc, character_threshold)
            for doc in tqdm(
-                nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs, disable=["parser", "ner"
+                nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs, disable=["parser", "ner"]),
                total=len(text_data),
                desc="Lemmatization",
                unit="doc",
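The acronym pattern now requires word boundaries on both sides, so it can no longer match the tail of an ordinary word that happens to sit before a sentence-final period, which is what was clipping tokens before. A quick before/after comparison (the sample sentence is illustrative):

import re

old = r"(?:[a-z]\.?)(?:[a-z]\.)"      # pattern before the fix
new = r"\b(?:[a-z]\.?)(?:[a-z]\.)\b"  # pattern after the fix

text = "the plot was great."
print(re.sub(old, "", text))  # 'the plot was gre'    -- the old pattern eats the word ending
print(re.sub(new, "", text))  # 'the plot was great.' -- boundaries keep whole words intact

The lemma filter is tightened in the same spirit: token.is_alpha drops anything containing digits or punctuation, and the added length check keeps only lemmas of at least threshold characters, with the default raised from 2 to 3.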
app/model.py
CHANGED
@@ -10,7 +10,6 @@ from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer,
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
 from sklearn.pipeline import Pipeline
-from sklearn.svm import LinearSVC
 from tqdm import tqdm
 
 from app.constants import CACHE_DIR
@@ -132,7 +131,7 @@ def train_model(
     vectorizer = _get_vectorizer(vectorizer, max_features)
     classifiers = [
         (LogisticRegression(max_iter=1000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
-        (LinearSVC(max_iter=10000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
+        # (LinearSVC(max_iter=10000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
         # (KNeighborsClassifier(), {"n_neighbors": np.arange(1, 10)}),
         # (RandomForestClassifier(random_state=rs), {"n_estimators": np.arange(50, 500, 50)}),
         # (