Tymec committed
Commit 183f8cd
1 Parent(s): 2b747dc

Swap test dataset
Files changed (5)
  1. app/cli.py +13 -11
  2. app/constants.py +9 -9
  3. app/data.py +3 -30
  4. app/utils.py +1 -0
  5. data/test.csv +0 -0
app/cli.py CHANGED
@@ -141,13 +141,13 @@ def evaluate(
     import joblib
     import pandas as pd
 
-    from app.constants import TOKENIZER_CACHE_PATH
+    from app.constants import TOKENIZER_CACHE_DIR
     from app.data import load_data, tokenize
     from app.model import evaluate_model
     from app.utils import deserialize, serialize
 
-    token_cache_path = TOKENIZER_CACHE_PATH / f"{dataset}_tokenized.pkl"
-    label_cache_path = TOKENIZER_CACHE_PATH / f"{dataset}_labels.pkl"
+    token_cache_path = TOKENIZER_CACHE_DIR / f"{dataset}_tokenized.pkl"
+    label_cache_path = TOKENIZER_CACHE_DIR / f"{dataset}_labels.pkl"
     use_cached_data = False
 
     if token_cache_path.exists():
@@ -168,8 +168,6 @@ def evaluate(
 
     click.echo("Tokenizing data... ")
     token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
-
-    click.echo("Caching tokenized data... ")
     serialize(token_data, token_cache_path, show_progress=True)
     joblib.dump(label_data, label_cache_path, compress=3)
 
@@ -184,7 +182,13 @@ def evaluate(
     model = joblib.load(model_path)
     click.echo(DONE_STR)
 
-    click.echo("Evaluating model... ", nl=False)
+    if cv == 1:
+        click.echo("Evaluating model... ", nl=False)
+        acc = model.score(token_data, label_data)
+        click.secho(f"{acc:.2%}", fg="blue")
+        return
+
+    click.echo("Evaluating model... ")
     acc_mean, acc_std = evaluate_model(
         model,
         token_data,
@@ -282,7 +286,7 @@ def train(
     import joblib
     import pandas as pd
 
-    from app.constants import MODEL_DIR, TOKENIZER_CACHE_PATH
+    from app.constants import MODEL_DIR, TOKENIZER_CACHE_DIR
     from app.data import load_data, tokenize
     from app.model import train_model
     from app.utils import deserialize, serialize
@@ -291,8 +295,8 @@ def train(
     if model_path.exists() and not overwrite:
        click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)
 
-    token_cache_path = TOKENIZER_CACHE_PATH / f"{dataset}_tokenized.pkl"
-    label_cache_path = TOKENIZER_CACHE_PATH / f"{dataset}_labels.pkl"
+    token_cache_path = TOKENIZER_CACHE_DIR / f"{dataset}_tokenized.pkl"
+    label_cache_path = TOKENIZER_CACHE_DIR / f"{dataset}_labels.pkl"
     use_cached_data = False
 
     if token_cache_path.exists():
@@ -313,8 +317,6 @@ def train(
 
     click.echo("Tokenizing data... ")
    token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
-
-    click.echo("Caching tokenized data... ")
     serialize(token_data, token_cache_path, show_progress=True)
     joblib.dump(label_data, label_cache_path, compress=3)

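The new cv == 1 branch in evaluate short-circuits cross-validation: it scores the persisted model once on the tokenized data instead of calling evaluate_model. Below is a minimal sketch of the two paths, assuming the joblib-loaded model is a scikit-learn estimator and using cross_val_score as a stand-in for evaluate_model (whose internals are not part of this diff); the toy texts and labels are placeholders for load_data()/tokenize() output.

# Hedged sketch, not the repo's code: toy data, sklearn stand-ins.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

texts = ["good movie", "bad movie", "great film", "awful film"]
labels = [1, 0, 1, 0]
model = make_pipeline(TfidfVectorizer(), LogisticRegression()).fit(texts, labels)

cv = 1
if cv == 1:
    # Fast path added by this commit: one accuracy number, no folds
    print(f"{model.score(texts, labels):.2%}")
else:
    # Cross-validated path: mean and std over cv folds, as evaluate_model reports
    scores = cross_val_score(model, texts, labels, cv=cv)
    print(f"{scores.mean():.2%} +/- {scores.std():.2%}")
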
app/constants.py CHANGED
@@ -4,10 +4,16 @@ import os
 from pathlib import Path
 
 CACHE_DIR = Path(os.getenv("CACHE_DIR", ".cache"))
+CACHE_DIR.mkdir(exist_ok=True, parents=True)
+
 DATA_DIR = Path(os.getenv("DATA_DIR", "data"))
+DATA_DIR.mkdir(exist_ok=True, parents=True)
+
 MODEL_DIR = Path(os.getenv("MODEL_DIR", "models"))
+MODEL_DIR.mkdir(exist_ok=True, parents=True)
 
-TOKENIZER_CACHE_PATH = CACHE_DIR / "tokenizer"
+TOKENIZER_CACHE_DIR = CACHE_DIR / "tokenizer"
+TOKENIZER_CACHE_DIR.mkdir(exist_ok=True, parents=True)
 
 SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
 SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
@@ -19,13 +25,7 @@ IMDB50K_PATH = DATA_DIR / "imdb50k.csv"
 IMDB50K_URL = "https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
 
 TEST_DATASET_PATH = DATA_DIR / "test.csv"
-TEST_DATASET_URL = "https://huggingface.co/datasets/Sp1786/multiclass-sentiment-analysis-dataset"
+TEST_DATASET_URL = "https://github.com/Tymec/sentiment-analysis/blob/main/data/test.csv?raw=true"
 
 SLANGMAP_PATH = DATA_DIR / "slang.json"
-SLANGMAP_URL = "Https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing"
-
-CACHE_DIR.mkdir(exist_ok=True, parents=True)
-DATA_DIR.mkdir(exist_ok=True, parents=True)
-MODEL_DIR.mkdir(exist_ok=True, parents=True)
-
-TOKENIZER_CACHE_PATH.mkdir(exist_ok=True, parents=True)
+SLANGMAP_URL = "https://github.com/Tymec/sentiment-analysis/blob/main/data/slang.json?raw=true"

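Directory creation now happens immediately after each path constant is defined rather than in a trailing block, so a constant can never be referenced before its directory exists, and both dataset URLs now point at raw GitHub blobs. A hedged sketch of fetching the swapped-in test set with only the stdlib; the download step is an assumption (the repo may expect a manual download), while the URL and path mirror app/constants.py.

import urllib.request
from pathlib import Path

TEST_DATASET_URL = "https://github.com/Tymec/sentiment-analysis/blob/main/data/test.csv?raw=true"
TEST_DATASET_PATH = Path("data") / "test.csv"

if not TEST_DATASET_PATH.exists():
    TEST_DATASET_PATH.parent.mkdir(exist_ok=True, parents=True)
    # ?raw=true redirects to raw.githubusercontent.com; urlretrieve follows it
    urllib.request.urlretrieve(TEST_DATASET_URL, TEST_DATASET_PATH)
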
app/data.py CHANGED
@@ -55,7 +55,6 @@ def slang() -> tuple[Pattern, dict[str, str]]:
         FileNotFoundError: If the file is not found
     """
     if not SLANGMAP_PATH.exists():
-        # msg = f"Missing slang mapping file: {SLANG_PATH}"
         msg = (
             f"Slang mapping file not found at: '{SLANGMAP_PATH}'\n"
             "Please download the file from:\n"
@@ -89,7 +88,6 @@ def _clean(text: str) -> str:
     text = slang_pattern.sub(lambda x: slang_mapping[x.group()], text)
 
     # Remove acronyms and abbreviations
-    # text = re.sub(r"(?:[a-z]\.){2,}", "", text)
     text = re.sub(r"\b(?:[a-z]\.?)(?:[a-z]\.)\b", "", text)
 
     # Remove honorifics
@@ -161,15 +159,6 @@ def tokenize(
     Returns:
         Tokenized text data
     """
-    # text_data = [
-    #     _clean(text)
-    #     for text in tqdm(
-    #         text_data,
-    #         desc="Cleaning",
-    #         unit="doc",
-    #         disable=not show_progress,
-    #     )
-    # ]
     text_data = Parallel(n_jobs=n_jobs)(
         delayed(_clean)(text)
         for text in tqdm(
@@ -310,12 +299,9 @@ def load_imdb50k() -> tuple[list[str], list[int]]:
     return data["review"].tolist(), data["sentiment"].tolist()
 
 
-def load_test(include_neutral: bool = False) -> tuple[list[str], list[int]]:
+def load_test() -> tuple[list[str], list[int]]:
     """Load the test dataset and make it suitable for use.
 
-    Args:
-        include_neutral: Whether to include neutral sentiment
-
     Returns:
         Text and label data
@@ -334,21 +320,8 @@ def load_test(include_neutral: bool = False) -> tuple[list[str], list[int]]:
     # Load the dataset
     data = pd.read_csv(TEST_DATASET_PATH)
 
-    # Ignore rows with neutral sentiment
-    if not include_neutral:
-        data = data[data["label"] != 1]
-
-    # Map sentiment values
-    data["label"] = data["label"].map(
-        {
-            0: 0,  # Negative
-            1: 2,  # Neutral
-            2: 1,  # Positive
-        },
-    )
-
     # Return as lists
-    return data["text"].tolist(), data["label"].tolist()
+    return data["text"].tolist(), data["sentiment"].tolist()
 
 
 def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k", "test"]) -> tuple[list[str], list[int]]:
@@ -371,7 +344,7 @@ def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k", "test
         case "imdb50k":
             return load_imdb50k()
         case "test":
-            return load_test(include_neutral=False)
+            return load_test()
         case _:
            msg = f"Unknown dataset: {dataset}"
            raise ValueError(msg)
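
After the swap, load_test() drops the neutral-row filter and label remapping that the Sp1786 dataset required: the new data/test.csv is assumed to ship a text column and an already-binary sentiment column, matching what the other loaders return. A minimal sketch of the simplified loader, with column meanings inferred from the diff rather than verified against the CSV:

import pandas as pd

def load_test() -> tuple[list[str], list[int]]:
    # New test set is already binary, so no filtering or remapping is needed
    data = pd.read_csv("data/test.csv")  # assumed columns: text, sentiment
    return data["text"].tolist(), data["sentiment"].tolist()
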
app/utils.py CHANGED
@@ -23,6 +23,7 @@ def serialize(data: Sequence[str | int], path: Path, max_size: int = 100_000, sh
     for i, chunk in enumerate(
         tqdm(
             [data[i : i + max_size] for i in range(0, len(data), max_size)],
+            desc="Serializing",
             unit="chunk",
             disable=not show_progress,
         ),
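
The only functional change here is the desc="Serializing" label on the progress bar, which previously showed no description. For context, a sketch of the chunked-serialization pattern the loop implements; the per-chunk joblib.dump call and the file-naming scheme are hypothetical, since only the loop header appears in this diff:

from pathlib import Path

import joblib
from tqdm import tqdm

def serialize(data, path: Path, max_size: int = 100_000, show_progress: bool = False) -> None:
    # Split into fixed-size chunks so large token lists are dumped incrementally
    chunks = [data[i : i + max_size] for i in range(0, len(data), max_size)]
    for i, chunk in enumerate(tqdm(chunks, desc="Serializing", unit="chunk", disable=not show_progress)):
        # Hypothetical naming: tokens.pkl -> tokens.pkl.0, tokens.pkl.1, ...
        joblib.dump(chunk, path.with_suffix(f"{path.suffix}.{i}"), compress=3)
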
data/test.csv ADDED
Binary file (22.7 kB)