Tymec committed
Commit: b0ade1a
Parent: 228859a

Add more vectorizers, classifiers and CLI options

Files changed (3):
  1. app/cli.py +46 -33
  2. app/constants.py +3 -3
  3. app/model.py +149 -73
app/cli.py CHANGED
@@ -104,29 +104,36 @@ def predict(model_path: Path, text: list[str]) -> None:
     type=click.IntRange(1, 50),
 )
 @click.option(
-    "--batch-size",
+    "--token-batch-size",
     default=512,
     help="Size of the batches used in tokenization",
     show_default=True,
 )
 @click.option(
-    "--processes",
+    "--token-jobs",
     default=4,
-    help="Number of parallel jobs to run",
+    help="Number of parallel jobs to run for tokenization",
     show_default=True,
 )
 @click.option(
-    "--verbose",
+    "--eval-jobs",
+    default=1,
+    help="Number of parallel jobs to run for evaluation",
+    show_default=True,
+)
+@click.option(
+    "--force-cache",
     is_flag=True,
-    help="Show verbose output",
+    help="Always use the cached tokenized data (if available)",
 )
 def evaluate(
     dataset: Literal["test", "sentiment140", "amazonreviews", "imdb50k"],
     model_path: Path,
     cv: int,
-    batch_size: int,
-    processes: int,
-    verbose: bool,
+    token_batch_size: int,
+    token_jobs: int,
+    eval_jobs: int,
+    force_cache: bool,
 ) -> None:
     """Evaluate the model on the the specified dataset"""
     import gc
@@ -141,7 +148,10 @@ def evaluate(
     cached_data_path = CACHE_DIR / f"{dataset}_tokenized.pkl"
     use_cached_data = False
     if cached_data_path.exists():
-        use_cached_data = click.confirm(f"Found existing tokenized data for '{dataset}'. Use it?", default=True)
+        use_cached_data = force_cache or click.confirm(
+            f"Found existing tokenized data for '{dataset}'. Use it?",
+            default=True,
+        )
 
     click.echo("Loading dataset... ", nl=False)
     text_data, label_data = load_data(dataset)
@@ -149,16 +159,14 @@ def evaluate(
 
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
-        # token_data = joblib.load(cached_data_path)
         token_data = deserialize(cached_data_path)
         click.echo(DONE_STR)
     else:
         click.echo("Tokenizing data... ", nl=False)
-        token_data = tokenize(text_data, batch_size=batch_size, n_jobs=processes, show_progress=True)
+        token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
         click.echo(DONE_STR)
 
         click.echo("Caching tokenized data... ", nl=False)
-        # joblib.dump(token_data, cached_data_path, compress=3)
         serialize(token_data, cached_data_path)
         click.echo(DONE_STR)
 
@@ -175,8 +183,7 @@ def evaluate(
         token_data,
         label_data,
         folds=cv,
-        n_jobs=processes,
-        verbose=verbose,
+        n_jobs=eval_jobs,
     )
     click.secho(f"{acc_mean:.2%} ± {acc_std:.2%}", fg="blue")
 
@@ -188,10 +195,16 @@ def evaluate(
     help="Dataset to train the model on",
     type=click.Choice(["sentiment140", "amazonreviews", "imdb50k"]),
 )
+@click.option(
+    "--vectorizer",
+    default="tfidf",
+    help="Vectorizer to use",
+    type=click.Choice(["tfidf", "count", "hashing"]),
+)
 @click.option(
     "--max-features",
     default=20000,
-    help="Maximum number of features",
+    help="Maximum number of features (should be greater than 2^15 when using hashing vectorizer)",
     show_default=True,
     type=click.IntRange(1, None),
 )
@@ -203,15 +216,21 @@ def evaluate(
     type=click.IntRange(1, 50),
 )
 @click.option(
-    "--batch-size",
+    "--token-batch-size",
     default=512,
     help="Size of the batches used in tokenization",
     show_default=True,
 )
 @click.option(
-    "--processes",
+    "--token-jobs",
     default=4,
-    help="Number of parallel jobs to run",
+    help="Number of parallel jobs to run for tokenization",
+    show_default=True,
+)
+@click.option(
+    "--train-jobs",
+    default=1,
+    help="Number of parallel jobs to run for training",
     show_default=True,
 )
 @click.option(
@@ -231,33 +250,29 @@ def evaluate(
     is_flag=True,
     help="Always use the cached tokenized data (if available)",
 )
-@click.option(
-    "--verbose",
-    is_flag=True,
-    help="Show verbose output",
-)
 def train(
     dataset: Literal["sentiment140", "amazonreviews", "imdb50k"],
+    vectorizer: Literal["tfidf", "count", "hashing"],
     max_features: int,
     cv: int,
-    batch_size: int,
-    processes: int,
+    token_batch_size: int,
+    token_jobs: int,
+    train_jobs: int,
     seed: int,
     overwrite: bool,
     force_cache: bool,
-    verbose: bool,
 ) -> None:
     """Train the model on the provided dataset"""
     import gc
 
     import joblib
 
-    from app.constants import CACHE_DIR, MODELS_DIR
+    from app.constants import CACHE_DIR, MODEL_DIR
     from app.data import load_data, tokenize
     from app.model import train_model
     from app.utils import deserialize, serialize
 
-    model_path = MODELS_DIR / f"{dataset}_tfidf_ft-{max_features}.pkl"
+    model_path = MODEL_DIR / f"{dataset}_{vectorizer}_ft{max_features}.pkl"
     if model_path.exists() and not overwrite:
         click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)
 
@@ -276,16 +291,14 @@ def train(
 
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
-        # token_data = joblib.load(cached_data_path)
         token_data = deserialize(cached_data_path)
         click.echo(DONE_STR)
     else:
         click.echo("Tokenizing data... ", nl=False)
-        token_data = tokenize(text_data, batch_size=batch_size, n_jobs=processes, show_progress=True)
+        token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
         click.echo(DONE_STR)
 
         click.echo("Caching tokenized data... ", nl=False)
-        # joblib.dump(token_data, cached_data_path, compress=3)
         serialize(token_data, cached_data_path)
         click.echo(DONE_STR)
 
@@ -296,11 +309,11 @@ def train(
     model, accuracy = train_model(
         token_data,
         label_data,
+        vectorizer=vectorizer,
         max_features=max_features,
         folds=cv,
-        n_jobs=processes,
+        n_jobs=train_jobs,
        seed=seed,
-        verbose=verbose,
     )
     click.echo("Model accuracy: ", nl=False)
     click.secho(f"{accuracy:.2%}", fg="blue")
app/constants.py CHANGED
@@ -5,12 +5,12 @@ from pathlib import Path
 
 CACHE_DIR = Path(os.getenv("CACHE_DIR", ".cache"))
 DATA_DIR = Path(os.getenv("DATA_DIR", "data"))
-MODELS_DIR = Path(os.getenv("MODELS_DIR", "models"))
+MODEL_DIR = Path(os.getenv("MODEL_DIR", "models"))
 
 SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
 SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
 
-AMAZONREVIEWS_PATH = DATA_DIR / "amazonreviews.train.txt.bz2"
+AMAZONREVIEWS_PATH = DATA_DIR / "amazonreviews.txt.bz2"
 AMAZONREVIEWS_URL = "https://www.kaggle.com/datasets/bittlingmayer/amazonreviews"
 
 IMDB50K_PATH = DATA_DIR / "imdb50k.csv"
@@ -21,4 +21,4 @@ TEST_DATASET_URL = "https://huggingface.co/datasets/Sp1786/multiclass-sentiment-
 
 CACHE_DIR.mkdir(exist_ok=True, parents=True)
 DATA_DIR.mkdir(exist_ok=True, parents=True)
-MODELS_DIR.mkdir(exist_ok=True, parents=True)
+MODEL_DIR.mkdir(exist_ok=True, parents=True)
app/model.py CHANGED
@@ -1,20 +1,23 @@
 from __future__ import annotations
 
-import os
-from typing import TYPE_CHECKING
+import warnings
+from typing import TYPE_CHECKING, Literal, Sequence
 
 import numpy as np
 from joblib import Memory
-from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.exceptions import ConvergenceWarning
+from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
 from sklearn.pipeline import Pipeline
+from sklearn.svm import LinearSVC
+from tqdm import tqdm
 
 from app.constants import CACHE_DIR
 from app.data import tokenize
 
 if TYPE_CHECKING:
-    from sklearn.base import BaseEstimator
+    from sklearn.base import BaseEstimator, TransformerMixin
 
 __all__ = ["train_model", "evaluate_model", "infer_model"]
 
@@ -31,96 +34,170 @@ def _identity(x: list[str]) -> list[str]:
     return x
 
 
+def _get_vectorizer(
+    name: Literal["tfidf", "count", "hashing"],
+    n_features: int,
+    df: tuple[float, float] = (0.1, 0.9),
+    ngram: tuple[int, int] = (1, 2),
+) -> TransformerMixin:
+    """Get the appropriate vectorizer.
+
+    Args:
+        name: Type of vectorizer
+        n_features: Maximum number of features
+        df: Document frequency range [min_df, max_df] (ignored for HashingVectorizer)
+        ngram: N-gram range [min_n, max_n]
+
+    Returns:
+        Vectorizer instance
+
+    Raises:
+        ValueError: If the vectorizer is not recognized
+    """
+    shared_params = {
+        "ngram_range": ngram,
+        # disable text processing
+        "tokenizer": _identity,
+        "preprocessor": _identity,
+        "lowercase": False,
+        "token_pattern": None,
+    }
+
+    match name:
+        case "tfidf":
+            return TfidfVectorizer(
+                max_features=n_features,
+                min_df=df[0],
+                max_df=df[1],
+                **shared_params,
+            )
+        case "count":
+            return CountVectorizer(
+                max_features=n_features,
+                min_df=df[0],
+                max_df=df[1],
+                **shared_params,
+            )
+        case "hashing":
+            if n_features < 2**15:
+                warnings.warn(
+                    "HashingVectorizer may perform poorly with small n_features, default is 2^20.",
+                    stacklevel=2,
+                )
+
+            return HashingVectorizer(
+                n_features=n_features,
+                **shared_params,
+            )
+        case _:
+            msg = f"Unknown vectorizer: {name}"
+            raise ValueError(msg)
+
+
 def train_model(
-    token_data: list[str],
+    token_data: Sequence[Sequence[str]],
     label_data: list[int],
+    vectorizer: Literal["tfidf", "count", "hashing"],
     max_features: int,
     folds: int = 5,
     n_jobs: int = 4,
     seed: int = 42,
-    verbose: bool = False,
 ) -> tuple[BaseEstimator, float]:
     """Train the sentiment analysis model.
 
     Args:
-        model: Untrained model
         token_data: Tokenized text data
         label_data: Label data
+        vectorizer: Which vectorizer to use
         max_features: Maximum number of features
         folds: Number of cross-validation folds
         n_jobs: Number of parallel jobs
        seed: Random seed (None for random seed)
-        verbose: Whether to output additional information
 
     Returns:
         Trained model and accuracy
+
+    Raises:
+        ValueError: If the vectorizer is not recognized
     """
+    rs = None if seed == -1 else seed
+
     text_train, text_test, label_train, label_test = train_test_split(
         token_data,
         label_data,
         test_size=0.2,
-        random_state=seed,
+        random_state=rs,
    )
 
-    model = Pipeline(
-        [
-            (
-                "vectorizer",
-                TfidfVectorizer(
-                    max_features=max_features,
-                    ngram_range=(1, 2),
-                    # disable text processing
-                    tokenizer=_identity,
-                    preprocessor=_identity,
-                    lowercase=False,
-                    token_pattern=None,
-                    min_df=0.1,
-                    max_df=0.9,
-                ),
-            ),
-            (
-                "classifier",
-                LogisticRegression(
-                    max_iter=1000,
-                    random_state=None if seed == -1 else seed,
-                ),
-            ),
-        ],
-        memory=Memory(CACHE_DIR, verbose=0),
-        verbose=verbose,
-    )
-
-    param_distributions = {
-        "classifier__C": np.logspace(-4, 4, 20),
-        "classifier__solver": ["liblinear", "saga"],
-    }
-
-    search = RandomizedSearchCV(
-        model,
-        param_distributions,
-        cv=folds,
-        random_state=seed,
-        n_jobs=n_jobs,
-        verbose=2 if verbose else 0,
-        scoring="accuracy",
-        n_iter=10,
-    )
-
-    os.environ["PYTHONWARNINGS"] = "ignore"
-    search.fit(text_train, label_train)
-    del os.environ["PYTHONWARNINGS"]
-
-    best_model = search.best_estimator_
-    return best_model, best_model.score(text_test, label_test)
+    vectorizer = _get_vectorizer(vectorizer, max_features)
+    classifiers = [
+        (LogisticRegression(max_iter=1000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
+        (LinearSVC(max_iter=10000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
+        # (KNeighborsClassifier(), {"n_neighbors": np.arange(1, 10)}),
+        # (RandomForestClassifier(random_state=rs), {"n_estimators": np.arange(50, 500, 50)}),
+        # (
+        #     VotingClassifier(
+        #         estimators=[
+        #             ("lr", LogisticRegression(max_iter=1000, random_state=rs)),
+        #             ("knn", KNeighborsClassifier()),
+        #             ("rf", RandomForestClassifier(random_state=rs)),
+        #         ],
+        #     ),
+        #     {
+        #         "lr__C": np.logspace(-4, 4, 20),
+        #         "knn__n_neighbors": np.arange(1, 10),
+        #         "rf__n_estimators": np.arange(50, 500, 50),
+        #     },
+        # ),
+    ]
+
+    models = []
+    for clf, param_dist in (pbar := tqdm(classifiers, unit="clf")):
+        param_dist = {f"classifier__{k}": v for k, v in param_dist.items()}
+
+        model = Pipeline(
+            [("vectorizer", vectorizer), ("classifier", clf)],
+            memory=Memory(CACHE_DIR, verbose=0),
+        )
+
+        search = RandomizedSearchCV(
+            model,
+            param_dist,
+            cv=folds,
+            random_state=rs,
+            n_jobs=n_jobs,
+            # verbose=2,
+            scoring="accuracy",
+            n_iter=7,
+        )
+
+        pbar.set_description(f"Searching for {clf.__class__.__name__}")
+
+        with warnings.catch_warnings():
+            warnings.filterwarnings("once", category=ConvergenceWarning)
+            warnings.filterwarnings("ignore", category=UserWarning, message="Persisting input arguments took")
+
+            search.fit(text_train, label_train)
+
+        best_model = search.best_estimator_
+        acc = best_model.score(text_test, label_test)
+        models.append((best_model, acc))
+
+    print("Final results:")
+    print("--------------")
+    print("\n".join(f"{model.named_steps['classifier'].__class__.__name__}: {acc:.2%}" for model, acc in models))
+
+    best_model, best_acc = max(models, key=lambda x: x[1])
+    print(f"Settled on {best_model.named_steps['classifier'].__class__.__name__}")
+    return best_model, best_acc
 
 
 def evaluate_model(
     model: BaseEstimator,
-    token_data: list[str],
+    token_data: Sequence[Sequence[str]],
     label_data: list[int],
     folds: int = 5,
     n_jobs: int = 4,
-    verbose: bool = False,
 ) -> tuple[float, float]:
     """Evaluate the model using cross-validation.
 
@@ -130,22 +207,21 @@ def evaluate_model(
         label_data: Label data
         folds: Number of cross-validation folds
         n_jobs: Number of parallel jobs
-        verbose: Whether to output additional information
 
     Returns:
         Mean accuracy and standard deviation
     """
-    os.environ["PYTHONWARNINGS"] = "ignore"
-    scores = cross_val_score(
-        model,
-        token_data,
-        label_data,
-        cv=folds,
-        scoring="accuracy",
-        n_jobs=n_jobs,
-        verbose=2 if verbose else 0,
-    )
-    del os.environ["PYTHONWARNINGS"]
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=UserWarning)
+        scores = cross_val_score(
+            model,
+            token_data,
+            label_data,
+            cv=folds,
+            scoring="accuracy",
+            n_jobs=n_jobs,
+            verbose=2,
+        )
    return scores.mean(), scores.std()
 
 
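For illustration only, a toy call against the new train_model signature. The synthetic pre-tokenized corpus exists purely to make the snippet self-contained; real token_data and label_data would come from app.data.load_data and app.data.tokenize, as in cli.py.

from app.model import train_model

# Toy, perfectly separable pre-tokenized corpus. "film" occurs in every document,
# so the default max_df=0.9 prunes it, leaving "good"/"bad" (and their bigrams).
token_data = [["good", "film"]] * 50 + [["bad", "film"]] * 50
label_data = [1] * 50 + [0] * 50

model, accuracy = train_model(
    token_data,
    label_data,
    vectorizer="count",   # new argument: "tfidf", "count" or "hashing"
    max_features=1000,
    folds=3,
    n_jobs=1,
    seed=42,
)
print(f"Held-out accuracy: {accuracy:.2%}")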