Tymec committed
Commit 447f97e
1 Parent(s): e1645d7

Fix broken tokenization

Files changed (4)
  1. app/cli.py +23 -16
  2. app/constants.py +4 -0
  3. app/data.py +6 -6
  4. app/model.py +1 -2
app/cli.py CHANGED
@@ -139,14 +139,16 @@ def evaluate(
     import gc
 
     import joblib
+    import pandas as pd
 
-    from app.constants import CACHE_DIR
+    from app.constants import TOKENIZER_CACHE_PATH
     from app.data import load_data, tokenize
     from app.model import evaluate_model
     from app.utils import deserialize, serialize
 
-    cached_data_path = CACHE_DIR / f"{dataset}_tokenized.pkl"
+    cached_data_path = TOKENIZER_CACHE_PATH / f"{dataset}_tokenized.pkl"
     use_cached_data = False
+
     if cached_data_path.exists():
         use_cached_data = force_cache or click.confirm(
             f"Found existing tokenized data for '{dataset}'. Use it?",
@@ -159,20 +161,22 @@ def evaluate(
 
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
-        token_data = deserialize(cached_data_path)
+        token_data = pd.Series(deserialize(cached_data_path))
         click.echo(DONE_STR)
     else:
-        click.echo("Tokenizing data... ", nl=False)
+        click.echo("Tokenizing data... ")
         token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
-        click.echo(DONE_STR)
 
-        click.echo("Caching tokenized data... ", nl=False)
-        serialize(token_data, cached_data_path)
-        click.echo(DONE_STR)
+        click.echo("Caching tokenized data... ")
+        serialize(token_data, cached_data_path, show_progress=True)
 
     del text_data
     gc.collect()
 
+    click.echo("Size of vocabulary: ", nl=False)
+    vocab = token_data.explode().value_counts()
+    click.secho(str(len(vocab)), fg="blue")
+
     click.echo("Loading model... ", nl=False)
     model = joblib.load(model_path)
     click.echo(DONE_STR)
@@ -266,8 +270,9 @@ def train(
     import gc
 
     import joblib
+    import pandas as pd
 
-    from app.constants import CACHE_DIR, MODEL_DIR
+    from app.constants import MODEL_DIR, TOKENIZER_CACHE_PATH
     from app.data import load_data, tokenize
     from app.model import train_model
     from app.utils import deserialize, serialize
@@ -276,7 +281,7 @@ def train(
     if model_path.exists() and not overwrite:
         click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)
 
-    cached_data_path = CACHE_DIR / f"{dataset}_tokenized.pkl"
+    cached_data_path = TOKENIZER_CACHE_PATH / f"{dataset}_tokenized.pkl"
     use_cached_data = False
 
     if cached_data_path.exists():
@@ -291,20 +296,22 @@ def train(
 
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
-        token_data = deserialize(cached_data_path)
+        token_data = pd.Series(deserialize(cached_data_path))
         click.echo(DONE_STR)
     else:
-        click.echo("Tokenizing data... ", nl=False)
+        click.echo("Tokenizing data... ")
        token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
-        click.echo(DONE_STR)
 
-        click.echo("Caching tokenized data... ", nl=False)
-        serialize(token_data, cached_data_path)
-        click.echo(DONE_STR)
+        click.echo("Caching tokenized data... ")
+        serialize(token_data, cached_data_path, show_progress=True)
 
     del text_data
     gc.collect()
 
+    click.echo("Size of vocabulary: ", nl=False)
+    vocab = token_data.explode().value_counts()
+    click.secho(str(len(vocab)), fg="blue")
+
     click.echo("Training model... ")
     model, accuracy = train_model(
         token_data,
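For reference, a minimal sketch of what the new vocabulary logging in `evaluate`/`train` computes, assuming `token_data` is a `pandas.Series` whose elements are lists of token strings (either fresh from `tokenize` or rebuilt from the cache via `pd.Series(deserialize(...))`); the sample documents below are illustrative only:

import pandas as pd

# Illustrative stand-in for the tokenized dataset: one list of tokens per document.
token_data = pd.Series([
    ["good", "movie", "great", "acting"],
    ["bad", "movie", "boring"],
    ["great", "soundtrack"],
])

# Same computation as the new CLI logging: flatten every per-document token list
# into a single Series, then count how often each distinct token occurs.
vocab = token_data.explode().value_counts()

print(len(vocab))    # vocabulary size reported by the CLI -> 7 here
print(vocab.head())  # most frequent tokens first ("movie" and "great" appear twice)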
app/constants.py CHANGED
@@ -7,6 +7,8 @@ CACHE_DIR = Path(os.getenv("CACHE_DIR", ".cache"))
 DATA_DIR = Path(os.getenv("DATA_DIR", "data"))
 MODEL_DIR = Path(os.getenv("MODEL_DIR", "models"))
 
+TOKENIZER_CACHE_PATH = CACHE_DIR / "tokenizer"
+
 SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
 SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
 
@@ -25,3 +27,5 @@ SLANGMAP_URL = "Https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-
 CACHE_DIR.mkdir(exist_ok=True, parents=True)
 DATA_DIR.mkdir(exist_ok=True, parents=True)
 MODEL_DIR.mkdir(exist_ok=True, parents=True)
+
+TOKENIZER_CACHE_PATH.mkdir(exist_ok=True, parents=True)
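A quick sketch of how the new cache location resolves with the defaults in `app/constants.py` (assuming `CACHE_DIR` falls back to `.cache` when the environment variable is unset); the `sentiment140` file name is just an example of the `f"{dataset}_tokenized.pkl"` pattern used in `app/cli.py`:

import os
from pathlib import Path

# Mirrors the constants: CACHE_DIR defaults to ".cache" unless overridden via the environment.
CACHE_DIR = Path(os.getenv("CACHE_DIR", ".cache"))
TOKENIZER_CACHE_PATH = CACHE_DIR / "tokenizer"
TOKENIZER_CACHE_PATH.mkdir(exist_ok=True, parents=True)

# The CLI stores one pickle per dataset under this directory.
print(TOKENIZER_CACHE_PATH / "sentiment140_tokenized.pkl")  # .cache/tokenizer/sentiment140_tokenized.pkl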
app/data.py CHANGED
@@ -89,7 +89,7 @@ def _clean(text: str) -> str:
 
     # Remove acronyms and abbreviations
     # text = re.sub(r"(?:[a-z]\.){2,}", "", text)
-    text = re.sub(r"(?:[a-z]\.?)(?:[a-z]\.)", "", text)
+    text = re.sub(r"\b(?:[a-z]\.?)(?:[a-z]\.)\b", "", text)
 
     # Remove honorifics
     text = re.sub(r"\b(?:mr|mrs|ms|dr|prof|sr|jr)\.?\b", "", text)
@@ -118,7 +118,7 @@ def _clean(text: str) -> str:
     return text.strip()
 
 
-def _lemmatize(doc: Doc, threshold: int = 2) -> Sequence[str]:
+def _lemmatize(doc: Doc, threshold: int = 3) -> Sequence[str]:
     """Lemmatize the provided text using spaCy.
 
     Args:
@@ -136,8 +136,8 @@ def _lemmatize(doc: Doc, threshold: int = 2) -> Sequence[str]:
         and not token.like_email  # Ignore email addresses
         and not token.like_url  # Ignore URLs
         and not token.like_num  # Ignore numbers
-        and not token.is_alpha  # Ignore non-alphabetic tokens
-        and not (len(tok := token.lemma_.lower().strip()) < threshold)  # Ignore short tokens
+        and token.is_alpha  # Ignore non-alphabetic tokens
+        and (len(tok := token.lemma_.lower().strip()) >= threshold)  # Ignore short tokens
     ]
 
 
@@ -145,7 +145,7 @@ def tokenize(
     text_data: Sequence[str],
     batch_size: int = 512,
     n_jobs: int = 4,
-    character_threshold: int = 2,
+    character_threshold: int = 3,
     show_progress: bool = True,
 ) -> Sequence[Sequence[str]]:
     """Tokenize the provided text using spaCy.
@@ -174,7 +174,7 @@ def tokenize(
     [
         _lemmatize(doc, character_threshold)
         for doc in tqdm(
-            nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs, disable=["parser", "ner", "tok2vec"]),
+            nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs, disable=["parser", "ner"]),
             total=len(text_data),
             desc="Lemmatization",
             unit="doc",
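The core of the "broken tokenization" fix is the inverted filter in `_lemmatize`: the old predicate kept only non-alphabetic lemmas of at least `threshold` characters and threw the real words away. A standalone sketch of just the two changed conditions, using a hypothetical `Tok` stand-in instead of spaCy tokens (the other filters, `like_email`, `like_url`, etc., are omitted here):

from dataclasses import dataclass

@dataclass
class Tok:
    # Minimal stand-in for the spaCy token attributes touched by this change.
    lemma_: str
    is_alpha: bool

def keep_old(token: Tok, threshold: int = 2) -> bool:
    # Old (buggy) predicate: only non-alphabetic lemmas of length >= threshold survive.
    return not token.is_alpha and not (len(token.lemma_.lower().strip()) < threshold)

def keep_new(token: Tok, threshold: int = 3) -> bool:
    # Fixed predicate: keep alphabetic lemmas of at least `threshold` characters.
    return token.is_alpha and len(token.lemma_.lower().strip()) >= threshold

tokens = [Tok("movie", True), Tok("be", True), Tok("123", False), Tok("!!", False)]

print([t.lemma_ for t in tokens if keep_old(t)])  # ['123', '!!']  (the real words were dropped)
print([t.lemma_ for t in tokens if keep_new(t)])  # ['movie']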
app/model.py CHANGED
@@ -10,7 +10,6 @@ from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer,
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
 from sklearn.pipeline import Pipeline
-from sklearn.svm import LinearSVC
 from tqdm import tqdm
 
 from app.constants import CACHE_DIR
@@ -132,7 +131,7 @@ def train_model(
     vectorizer = _get_vectorizer(vectorizer, max_features)
     classifiers = [
         (LogisticRegression(max_iter=1000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
-        (LinearSVC(max_iter=10000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
+        # (LinearSVC(max_iter=10000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
         # (KNeighborsClassifier(), {"n_neighbors": np.arange(1, 10)}),
         # (RandomForestClassifier(random_state=rs), {"n_estimators": np.arange(50, 500, 50)}),
         # (
  # (