Swap test dataset
Files changed:
- app/cli.py +13 -11
- app/constants.py +9 -9
- app/data.py +3 -30
- app/utils.py +1 -0
- data/test.csv +0 -0 (binary, added)
app/cli.py CHANGED

@@ -141,13 +141,13 @@ def evaluate(
     import joblib
     import pandas as pd
 
-    from app.constants import
+    from app.constants import TOKENIZER_CACHE_DIR
     from app.data import load_data, tokenize
     from app.model import evaluate_model
     from app.utils import deserialize, serialize
 
-    token_cache_path =
-    label_cache_path =
+    token_cache_path = TOKENIZER_CACHE_DIR / f"{dataset}_tokenized.pkl"
+    label_cache_path = TOKENIZER_CACHE_DIR / f"{dataset}_labels.pkl"
     use_cached_data = False
 
     if token_cache_path.exists():

@@ -168,8 +168,6 @@ def evaluate(
 
         click.echo("Tokenizing data... ")
         token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
-
-        click.echo("Caching tokenized data... ")
         serialize(token_data, token_cache_path, show_progress=True)
         joblib.dump(label_data, label_cache_path, compress=3)
 

@@ -184,7 +182,13 @@ def evaluate(
     model = joblib.load(model_path)
     click.echo(DONE_STR)
 
-
+    if cv == 1:
+        click.echo("Evaluating model... ", nl=False)
+        acc = model.score(token_data, label_data)
+        click.secho(f"{acc:.2%}", fg="blue")
+        return
+
+    click.echo("Evaluating model... ")
     acc_mean, acc_std = evaluate_model(
         model,
         token_data,

@@ -282,7 +286,7 @@ def train(
     import joblib
     import pandas as pd
 
-    from app.constants import MODEL_DIR,
+    from app.constants import MODEL_DIR, TOKENIZER_CACHE_DIR
     from app.data import load_data, tokenize
     from app.model import train_model
     from app.utils import deserialize, serialize

@@ -291,8 +295,8 @@ def train(
     if model_path.exists() and not overwrite:
         click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)
 
-    token_cache_path =
-    label_cache_path =
+    token_cache_path = TOKENIZER_CACHE_DIR / f"{dataset}_tokenized.pkl"
+    label_cache_path = TOKENIZER_CACHE_DIR / f"{dataset}_labels.pkl"
     use_cached_data = False
 
     if token_cache_path.exists():

@@ -313,8 +317,6 @@ def train(
 
         click.echo("Tokenizing data... ")
        token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
-
-        click.echo("Caching tokenized data... ")
         serialize(token_data, token_cache_path, show_progress=True)
         joblib.dump(label_data, label_cache_path, compress=3)
 
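Note (not part of the diff): the new `cv == 1` branch in `evaluate` skips cross-validation and scores the model once on the cached token/label data. A minimal sketch of the intended control flow, assuming `evaluate_model` wraps scikit-learn's `cross_val_score` (its implementation is not shown in this commit, so the name `evaluate_sketch` and the signature below are illustrative only):

# Sketch only; the real evaluate_model may differ.
from sklearn.model_selection import cross_val_score

def evaluate_sketch(model, token_data, label_data, cv: int = 5) -> tuple[float, float]:
    if cv == 1:
        # Fast path added in this commit: one pass over the full dataset, no CV.
        return model.score(token_data, label_data), 0.0
    scores = cross_val_score(model, token_data, label_data, cv=cv)
    return scores.mean(), scores.std()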
app/constants.py CHANGED

@@ -4,10 +4,16 @@ import os
 from pathlib import Path
 
 CACHE_DIR = Path(os.getenv("CACHE_DIR", ".cache"))
+CACHE_DIR.mkdir(exist_ok=True, parents=True)
+
 DATA_DIR = Path(os.getenv("DATA_DIR", "data"))
+DATA_DIR.mkdir(exist_ok=True, parents=True)
+
 MODEL_DIR = Path(os.getenv("MODEL_DIR", "models"))
+MODEL_DIR.mkdir(exist_ok=True, parents=True)
 
-
+TOKENIZER_CACHE_DIR = CACHE_DIR / "tokenizer"
+TOKENIZER_CACHE_DIR.mkdir(exist_ok=True, parents=True)
 
 SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
 SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"

@@ -19,13 +25,7 @@ IMDB50K_PATH = DATA_DIR / "imdb50k.csv"
 IMDB50K_URL = "https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
 
 TEST_DATASET_PATH = DATA_DIR / "test.csv"
-TEST_DATASET_URL = "https://
+TEST_DATASET_URL = "https://github.com/Tymec/sentiment-analysis/blob/main/data/test.csv?raw=true"
 
 SLANGMAP_PATH = DATA_DIR / "slang.json"
-SLANGMAP_URL = "
-
-CACHE_DIR.mkdir(exist_ok=True, parents=True)
-DATA_DIR.mkdir(exist_ok=True, parents=True)
-MODEL_DIR.mkdir(exist_ok=True, parents=True)
-
-TOKENIZER_CACHE_PATH.mkdir(exist_ok=True, parents=True)
+SLANGMAP_URL = "https://github.com/Tymec/sentiment-analysis/blob/main/data/slang.json?raw=true"
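Note (not part of the diff): each `mkdir` now runs right after its directory constant is defined, so simply importing `app.constants` creates `.cache/`, `data/`, `models/`, and the tokenizer cache directory. The new `?raw=true` GitHub URLs resolve to the raw files; a hypothetical fetch helper (not part of this repo) could consume them like this:

# Hypothetical helper, shown for illustration only.
from urllib.request import urlretrieve

from app.constants import TEST_DATASET_PATH, TEST_DATASET_URL

if not TEST_DATASET_PATH.exists():
    # DATA_DIR already exists because app.constants created it at import time.
    urlretrieve(TEST_DATASET_URL, TEST_DATASET_PATH)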
app/data.py CHANGED

@@ -55,7 +55,6 @@ def slang() -> tuple[Pattern, dict[str, str]]:
         FileNotFoundError: If the file is not found
     """
     if not SLANGMAP_PATH.exists():
-        # msg = f"Missing slang mapping file: {SLANG_PATH}"
         msg = (
             f"Slang mapping file not found at: '{SLANGMAP_PATH}'\n"
             "Please download the file from:\n"

@@ -89,7 +88,6 @@ def _clean(text: str) -> str:
     text = slang_pattern.sub(lambda x: slang_mapping[x.group()], text)
 
     # Remove acronyms and abbreviations
-    # text = re.sub(r"(?:[a-z]\.){2,}", "", text)
     text = re.sub(r"\b(?:[a-z]\.?)(?:[a-z]\.)\b", "", text)
 
     # Remove honorifics

@@ -161,15 +159,6 @@ def tokenize(
     Returns:
         Tokenized text data
     """
-    # text_data = [
-    #     _clean(text)
-    #     for text in tqdm(
-    #         text_data,
-    #         desc="Cleaning",
-    #         unit="doc",
-    #         disable=not show_progress,
-    #     )
-    # ]
     text_data = Parallel(n_jobs=n_jobs)(
         delayed(_clean)(text)
         for text in tqdm(

@@ -310,12 +299,9 @@ def load_imdb50k() -> tuple[list[str], list[int]]:
     return data["review"].tolist(), data["sentiment"].tolist()
 
 
-def load_test(
+def load_test() -> tuple[list[str], list[int]]:
     """Load the test dataset and make it suitable for use.
 
-    Args:
-        include_neutral: Whether to include neutral sentiment
-
     Returns:
         Text and label data
 

@@ -334,21 +320,8 @@ def load_test(include_neutral: bool = False) -> tuple[list[str], list[int]]:
     # Load the dataset
     data = pd.read_csv(TEST_DATASET_PATH)
 
-    # Ignore rows with neutral sentiment
-    if not include_neutral:
-        data = data[data["label"] != 1]
-
-    # Map sentiment values
-    data["label"] = data["label"].map(
-        {
-            0: 0,  # Negative
-            1: 2,  # Neutral
-            2: 1,  # Positive
-        },
-    )
-
     # Return as lists
-    return data["text"].tolist(), data["
+    return data["text"].tolist(), data["sentiment"].tolist()
 
 
 def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k", "test"]) -> tuple[list[str], list[int]]:

@@ -371,7 +344,7 @@ def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k", "test
         case "imdb50k":
             return load_imdb50k()
         case "test":
-            return load_test(
+            return load_test()
         case _:
             msg = f"Unknown dataset: {dataset}"
             raise ValueError(msg)
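Note (not part of the diff): `load_test` now assumes the swapped data/test.csv already carries binary labels in `text` and `sentiment` columns, so the old neutral-row filtering and label remapping are gone. A small sketch of the schema the loader expects (the 0/1 encoding is an assumption inferred from the other loaders, not stated in this commit):

import pandas as pd

# Two illustrative rows in the shape load_test() reads back.
sample = pd.DataFrame(
    {
        "text": ["loved it", "waste of time"],
        "sentiment": [1, 0],  # assumed: 1 = positive, 0 = negative
    }
)
sample.to_csv("data/test.csv", index=False)

text_data, label_data = sample["text"].tolist(), sample["sentiment"].tolist()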
app/utils.py CHANGED

@@ -23,6 +23,7 @@ def serialize(data: Sequence[str | int], path: Path, max_size: int = 100_000, sh
     for i, chunk in enumerate(
         tqdm(
             [data[i : i + max_size] for i in range(0, len(data), max_size)],
+            desc="Serializing",
             unit="chunk",
             disable=not show_progress,
         ),
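Note (not part of the diff): the only change here is a `desc` label, so the chunked serializer's tqdm bar now reads "Serializing". For context, a minimal sketch of the surrounding function, assuming each chunk is dumped with joblib under a per-chunk filename (the real loop body and on-disk layout are not shown in this commit):

import joblib
from pathlib import Path
from typing import Sequence

from tqdm import tqdm

def serialize(data: Sequence, path: Path, max_size: int = 100_000, show_progress: bool = False) -> None:
    # Split the data into fixed-size chunks and dump each one separately.
    for i, chunk in enumerate(
        tqdm(
            [data[i : i + max_size] for i in range(0, len(data), max_size)],
            desc="Serializing",  # label added in this commit
            unit="chunk",
            disable=not show_progress,
        ),
    ):
        joblib.dump(chunk, path.with_suffix(f".{i}{path.suffix}"), compress=3)  # assumed chunk naming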
data/test.csv ADDED

Binary file (22.7 kB)