Spaces:
Running
Running
Swap test dataset
Browse files
- app/cli.py +13 -11
- app/constants.py +9 -9
- app/data.py +3 -30
- app/utils.py +1 -0
- data/test.csv +0 -0
app/cli.py
CHANGED
@@ -141,13 +141,13 @@ def evaluate(
|
|
141 |
import joblib
|
142 |
import pandas as pd
|
143 |
|
144 |
-
from app.constants import
|
145 |
from app.data import load_data, tokenize
|
146 |
from app.model import evaluate_model
|
147 |
from app.utils import deserialize, serialize
|
148 |
|
149 |
-
token_cache_path =
|
150 |
-
label_cache_path =
|
151 |
use_cached_data = False
|
152 |
|
153 |
if token_cache_path.exists():
|
@@ -168,8 +168,6 @@ def evaluate(
|
|
168 |
|
169 |
click.echo("Tokenizing data... ")
|
170 |
token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
|
171 |
-
|
172 |
-
click.echo("Caching tokenized data... ")
|
173 |
serialize(token_data, token_cache_path, show_progress=True)
|
174 |
joblib.dump(label_data, label_cache_path, compress=3)
|
175 |
|
@@ -184,7 +182,13 @@ def evaluate(
|
|
184 |
model = joblib.load(model_path)
|
185 |
click.echo(DONE_STR)
|
186 |
|
187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
acc_mean, acc_std = evaluate_model(
|
189 |
model,
|
190 |
token_data,
|
@@ -282,7 +286,7 @@ def train(
|
|
282 |
import joblib
|
283 |
import pandas as pd
|
284 |
|
285 |
-
from app.constants import MODEL_DIR,
|
286 |
from app.data import load_data, tokenize
|
287 |
from app.model import train_model
|
288 |
from app.utils import deserialize, serialize
|
@@ -291,8 +295,8 @@ def train(
|
|
291 |
if model_path.exists() and not overwrite:
|
292 |
click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)
|
293 |
|
294 |
-
token_cache_path =
|
295 |
-
label_cache_path =
|
296 |
use_cached_data = False
|
297 |
|
298 |
if token_cache_path.exists():
|
@@ -313,8 +317,6 @@ def train(
|
|
313 |
|
314 |
click.echo("Tokenizing data... ")
|
315 |
token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
|
316 |
-
|
317 |
-
click.echo("Caching tokenized data... ")
|
318 |
serialize(token_data, token_cache_path, show_progress=True)
|
319 |
joblib.dump(label_data, label_cache_path, compress=3)
|
320 |
|
|
|
141 |
import joblib
|
142 |
import pandas as pd
|
143 |
|
144 |
+
from app.constants import TOKENIZER_CACHE_DIR
|
145 |
from app.data import load_data, tokenize
|
146 |
from app.model import evaluate_model
|
147 |
from app.utils import deserialize, serialize
|
148 |
|
149 |
+
token_cache_path = TOKENIZER_CACHE_DIR / f"{dataset}_tokenized.pkl"
|
150 |
+
label_cache_path = TOKENIZER_CACHE_DIR / f"{dataset}_labels.pkl"
|
151 |
use_cached_data = False
|
152 |
|
153 |
if token_cache_path.exists():
|
|
|
168 |
|
169 |
click.echo("Tokenizing data... ")
|
170 |
token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
|
|
|
|
|
171 |
serialize(token_data, token_cache_path, show_progress=True)
|
172 |
joblib.dump(label_data, label_cache_path, compress=3)
|
173 |
|
|
|
182 |
model = joblib.load(model_path)
|
183 |
click.echo(DONE_STR)
|
184 |
|
185 |
+
if cv == 1:
|
186 |
+
click.echo("Evaluating model... ", nl=False)
|
187 |
+
acc = model.score(token_data, label_data)
|
188 |
+
click.secho(f"{acc:.2%}", fg="blue")
|
189 |
+
return
|
190 |
+
|
191 |
+
click.echo("Evaluating model... ")
|
192 |
acc_mean, acc_std = evaluate_model(
|
193 |
model,
|
194 |
token_data,
|
|
|
286 |
import joblib
|
287 |
import pandas as pd
|
288 |
|
289 |
+
from app.constants import MODEL_DIR, TOKENIZER_CACHE_DIR
|
290 |
from app.data import load_data, tokenize
|
291 |
from app.model import train_model
|
292 |
from app.utils import deserialize, serialize
|
|
|
295 |
if model_path.exists() and not overwrite:
|
296 |
click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)
|
297 |
|
298 |
+
token_cache_path = TOKENIZER_CACHE_DIR / f"{dataset}_tokenized.pkl"
|
299 |
+
label_cache_path = TOKENIZER_CACHE_DIR / f"{dataset}_labels.pkl"
|
300 |
use_cached_data = False
|
301 |
|
302 |
if token_cache_path.exists():
|
|
|
317 |
|
318 |
click.echo("Tokenizing data... ")
|
319 |
token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
|
|
|
|
|
320 |
serialize(token_data, token_cache_path, show_progress=True)
|
321 |
joblib.dump(label_data, label_cache_path, compress=3)
|
322 |
|
app/constants.py
CHANGED
@@ -4,10 +4,16 @@ import os
|
|
4 |
from pathlib import Path
|
5 |
|
6 |
CACHE_DIR = Path(os.getenv("CACHE_DIR", ".cache"))
|
|
|
|
|
7 |
DATA_DIR = Path(os.getenv("DATA_DIR", "data"))
|
|
|
|
|
8 |
MODEL_DIR = Path(os.getenv("MODEL_DIR", "models"))
|
|
|
9 |
|
10 |
-
|
|
|
11 |
|
12 |
SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
|
13 |
SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
|
@@ -19,13 +25,7 @@ IMDB50K_PATH = DATA_DIR / "imdb50k.csv"
|
|
19 |
IMDB50K_URL = "https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
|
20 |
|
21 |
TEST_DATASET_PATH = DATA_DIR / "test.csv"
|
22 |
-
TEST_DATASET_URL = "https://
|
23 |
|
24 |
SLANGMAP_PATH = DATA_DIR / "slang.json"
|
25 |
-
SLANGMAP_URL = "
|
26 |
-
|
27 |
-
CACHE_DIR.mkdir(exist_ok=True, parents=True)
|
28 |
-
DATA_DIR.mkdir(exist_ok=True, parents=True)
|
29 |
-
MODEL_DIR.mkdir(exist_ok=True, parents=True)
|
30 |
-
|
31 |
-
TOKENIZER_CACHE_PATH.mkdir(exist_ok=True, parents=True)
|
|
|
4 |
from pathlib import Path
|
5 |
|
6 |
CACHE_DIR = Path(os.getenv("CACHE_DIR", ".cache"))
|
7 |
+
CACHE_DIR.mkdir(exist_ok=True, parents=True)
|
8 |
+
|
9 |
DATA_DIR = Path(os.getenv("DATA_DIR", "data"))
|
10 |
+
DATA_DIR.mkdir(exist_ok=True, parents=True)
|
11 |
+
|
12 |
MODEL_DIR = Path(os.getenv("MODEL_DIR", "models"))
|
13 |
+
MODEL_DIR.mkdir(exist_ok=True, parents=True)
|
14 |
|
15 |
+
TOKENIZER_CACHE_DIR = CACHE_DIR / "tokenizer"
|
16 |
+
TOKENIZER_CACHE_DIR.mkdir(exist_ok=True, parents=True)
|
17 |
|
18 |
SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
|
19 |
SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
|
|
|
25 |
IMDB50K_URL = "https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
|
26 |
|
27 |
TEST_DATASET_PATH = DATA_DIR / "test.csv"
|
28 |
+
TEST_DATASET_URL = "https://github.com/Tymec/sentiment-analysis/blob/main/data/test.csv?raw=true"
|
29 |
|
30 |
SLANGMAP_PATH = DATA_DIR / "slang.json"
|
31 |
+
SLANGMAP_URL = "https://github.com/Tymec/sentiment-analysis/blob/main/data/slang.json?raw=true"
|
|
|
|
|
|
|
|
|
|
|
|
app/data.py
CHANGED
@@ -55,7 +55,6 @@ def slang() -> tuple[Pattern, dict[str, str]]:
|
|
55 |
FileNotFoundError: If the file is not found
|
56 |
"""
|
57 |
if not SLANGMAP_PATH.exists():
|
58 |
-
# msg = f"Missing slang mapping file: {SLANG_PATH}"
|
59 |
msg = (
|
60 |
f"Slang mapping file not found at: '{SLANGMAP_PATH}'\n"
|
61 |
"Please download the file from:\n"
|
@@ -89,7 +88,6 @@ def _clean(text: str) -> str:
|
|
89 |
text = slang_pattern.sub(lambda x: slang_mapping[x.group()], text)
|
90 |
|
91 |
# Remove acronyms and abbreviations
|
92 |
-
# text = re.sub(r"(?:[a-z]\.){2,}", "", text)
|
93 |
text = re.sub(r"\b(?:[a-z]\.?)(?:[a-z]\.)\b", "", text)
|
94 |
|
95 |
# Remove honorifics
|
@@ -161,15 +159,6 @@ def tokenize(
|
|
161 |
Returns:
|
162 |
Tokenized text data
|
163 |
"""
|
164 |
-
# text_data = [
|
165 |
-
# _clean(text)
|
166 |
-
# for text in tqdm(
|
167 |
-
# text_data,
|
168 |
-
# desc="Cleaning",
|
169 |
-
# unit="doc",
|
170 |
-
# disable=not show_progress,
|
171 |
-
# )
|
172 |
-
# ]
|
173 |
text_data = Parallel(n_jobs=n_jobs)(
|
174 |
delayed(_clean)(text)
|
175 |
for text in tqdm(
|
@@ -310,12 +299,9 @@ def load_imdb50k() -> tuple[list[str], list[int]]:
|
|
310 |
return data["review"].tolist(), data["sentiment"].tolist()
|
311 |
|
312 |
|
313 |
-
def load_test(include_neutral: bool = False) -> tuple[list[str], list[int]]:
|
314 |
"""Load the test dataset and make it suitable for use.
|
315 |
|
316 |
-
Args:
|
317 |
-
include_neutral: Whether to include neutral sentiment
|
318 |
-
|
319 |
Returns:
|
320 |
Text and label data
|
321 |
|
@@ -334,21 +320,8 @@ def load_test(include_neutral: bool = False) -> tuple[list[str], list[int]]:
|
|
334 |
# Load the dataset
|
335 |
data = pd.read_csv(TEST_DATASET_PATH)
|
336 |
|
337 |
-
# Ignore rows with neutral sentiment
|
338 |
-
if not include_neutral:
|
339 |
-
data = data[data["label"] != 1]
|
340 |
-
|
341 |
-
# Map sentiment values
|
342 |
-
data["label"] = data["label"].map(
|
343 |
-
{
|
344 |
-
0: 0, # Negative
|
345 |
-
1: 2, # Neutral
|
346 |
-
2: 1, # Positive
|
347 |
-
},
|
348 |
-
)
|
349 |
-
|
350 |
# Return as lists
|
351 |
-
return data["text"].tolist(), data["label"].tolist()
|
352 |
|
353 |
|
354 |
def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k", "test"]) -> tuple[list[str], list[int]]:
|
@@ -371,7 +344,7 @@ def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k", "test
|
|
371 |
case "imdb50k":
|
372 |
return load_imdb50k()
|
373 |
case "test":
|
374 |
-
return load_test(
|
375 |
case _:
|
376 |
msg = f"Unknown dataset: {dataset}"
|
377 |
raise ValueError(msg)
|
|
|
55 |
FileNotFoundError: If the file is not found
|
56 |
"""
|
57 |
if not SLANGMAP_PATH.exists():
|
|
|
58 |
msg = (
|
59 |
f"Slang mapping file not found at: '{SLANGMAP_PATH}'\n"
|
60 |
"Please download the file from:\n"
|
|
|
88 |
text = slang_pattern.sub(lambda x: slang_mapping[x.group()], text)
|
89 |
|
90 |
# Remove acronyms and abbreviations
|
|
|
91 |
text = re.sub(r"\b(?:[a-z]\.?)(?:[a-z]\.)\b", "", text)
|
92 |
|
93 |
# Remove honorifics
|
|
|
159 |
Returns:
|
160 |
Tokenized text data
|
161 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
text_data = Parallel(n_jobs=n_jobs)(
|
163 |
delayed(_clean)(text)
|
164 |
for text in tqdm(
|
|
|
299 |
return data["review"].tolist(), data["sentiment"].tolist()
|
300 |
|
301 |
|
302 |
+
def load_test() -> tuple[list[str], list[int]]:
|
303 |
"""Load the test dataset and make it suitable for use.
|
304 |
|
|
|
|
|
|
|
305 |
Returns:
|
306 |
Text and label data
|
307 |
|
|
|
320 |
# Load the dataset
|
321 |
data = pd.read_csv(TEST_DATASET_PATH)
|
322 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
323 |
# Return as lists
|
324 |
+
return data["text"].tolist(), data["sentiment"].tolist()
|
325 |
|
326 |
|
327 |
def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k", "test"]) -> tuple[list[str], list[int]]:
|
|
|
344 |
case "imdb50k":
|
345 |
return load_imdb50k()
|
346 |
case "test":
|
347 |
+
return load_test()
|
348 |
case _:
|
349 |
msg = f"Unknown dataset: {dataset}"
|
350 |
raise ValueError(msg)
|
app/utils.py
CHANGED
@@ -23,6 +23,7 @@ def serialize(data: Sequence[str | int], path: Path, max_size: int = 100_000, sh
|
|
23 |
for i, chunk in enumerate(
|
24 |
tqdm(
|
25 |
[data[i : i + max_size] for i in range(0, len(data), max_size)],
|
|
|
26 |
unit="chunk",
|
27 |
disable=not show_progress,
|
28 |
),
|
|
|
23 |
for i, chunk in enumerate(
|
24 |
tqdm(
|
25 |
[data[i : i + max_size] for i in range(0, len(data), max_size)],
|
26 |
+
desc="Serializing",
|
27 |
unit="chunk",
|
28 |
disable=not show_progress,
|
29 |
),
|
data/test.csv
ADDED
Binary file (22.7 kB). View file
|
|