Tymec committed
Commit 447f97e
1 Parent(s): e1645d7

Fix broken tokenization

Files changed (4)
  1. app/cli.py +23 -16
  2. app/constants.py +4 -0
  3. app/data.py +6 -6
  4. app/model.py +1 -2
app/cli.py CHANGED
@@ -139,14 +139,16 @@ def evaluate(
     import gc
 
     import joblib
+    import pandas as pd
 
-    from app.constants import CACHE_DIR
+    from app.constants import TOKENIZER_CACHE_PATH
     from app.data import load_data, tokenize
     from app.model import evaluate_model
     from app.utils import deserialize, serialize
 
-    cached_data_path = CACHE_DIR / f"{dataset}_tokenized.pkl"
+    cached_data_path = TOKENIZER_CACHE_PATH / f"{dataset}_tokenized.pkl"
     use_cached_data = False
+
     if cached_data_path.exists():
         use_cached_data = force_cache or click.confirm(
             f"Found existing tokenized data for '{dataset}'. Use it?",
@@ -159,20 +161,22 @@ def evaluate(
 
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
-        token_data = deserialize(cached_data_path)
+        token_data = pd.Series(deserialize(cached_data_path))
         click.echo(DONE_STR)
     else:
-        click.echo("Tokenizing data... ", nl=False)
+        click.echo("Tokenizing data... ")
         token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
-        click.echo(DONE_STR)
 
-        click.echo("Caching tokenized data... ", nl=False)
-        serialize(token_data, cached_data_path)
-        click.echo(DONE_STR)
+        click.echo("Caching tokenized data... ")
+        serialize(token_data, cached_data_path, show_progress=True)
 
     del text_data
     gc.collect()
 
+    click.echo("Size of vocabulary: ", nl=False)
+    vocab = token_data.explode().value_counts()
+    click.secho(str(len(vocab)), fg="blue")
+
     click.echo("Loading model... ", nl=False)
     model = joblib.load(model_path)
     click.echo(DONE_STR)
@@ -266,8 +270,9 @@ def train(
     import gc
 
     import joblib
+    import pandas as pd
 
-    from app.constants import CACHE_DIR, MODEL_DIR
+    from app.constants import MODEL_DIR, TOKENIZER_CACHE_PATH
     from app.data import load_data, tokenize
     from app.model import train_model
     from app.utils import deserialize, serialize
@@ -276,7 +281,7 @@ def train(
     if model_path.exists() and not overwrite:
         click.confirm(f"Model file '{model_path}' already exists. Overwrite?", abort=True)
 
-    cached_data_path = CACHE_DIR / f"{dataset}_tokenized.pkl"
+    cached_data_path = TOKENIZER_CACHE_PATH / f"{dataset}_tokenized.pkl"
     use_cached_data = False
 
     if cached_data_path.exists():
@@ -291,20 +296,22 @@ def train(
 
     if use_cached_data:
         click.echo("Loading cached data... ", nl=False)
-        token_data = deserialize(cached_data_path)
+        token_data = pd.Series(deserialize(cached_data_path))
         click.echo(DONE_STR)
     else:
-        click.echo("Tokenizing data... ", nl=False)
+        click.echo("Tokenizing data... ")
        token_data = tokenize(text_data, batch_size=token_batch_size, n_jobs=token_jobs, show_progress=True)
-        click.echo(DONE_STR)
 
-        click.echo("Caching tokenized data... ", nl=False)
-        serialize(token_data, cached_data_path)
-        click.echo(DONE_STR)
+        click.echo("Caching tokenized data... ")
+        serialize(token_data, cached_data_path, show_progress=True)
 
     del text_data
     gc.collect()
 
+    click.echo("Size of vocabulary: ", nl=False)
+    vocab = token_data.explode().value_counts()
+    click.secho(str(len(vocab)), fg="blue")
+
     click.echo("Training model... ")
     model, accuracy = train_model(
         token_data,
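For reference, a minimal sketch of what the new vocabulary logging in `evaluate`/`train` computes, assuming `token_data` is a `pandas.Series` whose elements are lists of token strings (either fresh from `tokenize` or rebuilt from the cache via `pd.Series(deserialize(...))`); the sample documents below are illustrative only:

import pandas as pd

# Illustrative stand-in for the tokenized dataset: one list of tokens per document.
token_data = pd.Series([
    ["good", "movie", "great", "acting"],
    ["bad", "movie", "boring"],
    ["great", "soundtrack"],
])

# Same computation as the new CLI logging: flatten every per-document token list
# into a single Series, then count how often each distinct token occurs.
vocab = token_data.explode().value_counts()

print(len(vocab))    # vocabulary size reported by the CLI -> 7 here
print(vocab.head())  # most frequent tokens first ("movie" and "great" appear twice)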
app/constants.py CHANGED
@@ -7,6 +7,8 @@ CACHE_DIR = Path(os.getenv("CACHE_DIR", ".cache"))
 DATA_DIR = Path(os.getenv("DATA_DIR", "data"))
 MODEL_DIR = Path(os.getenv("MODEL_DIR", "models"))
 
+TOKENIZER_CACHE_PATH = CACHE_DIR / "tokenizer"
+
 SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
 SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
 
@@ -25,3 +27,5 @@ SLANGMAP_URL = "Https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-
 CACHE_DIR.mkdir(exist_ok=True, parents=True)
 DATA_DIR.mkdir(exist_ok=True, parents=True)
 MODEL_DIR.mkdir(exist_ok=True, parents=True)
+
+TOKENIZER_CACHE_PATH.mkdir(exist_ok=True, parents=True)
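A quick sketch of how the new cache location resolves with the defaults in `app/constants.py` (assuming `CACHE_DIR` falls back to `.cache` when the environment variable is unset); the `sentiment140` file name is just an example of the `f"{dataset}_tokenized.pkl"` pattern used in `app/cli.py`:

import os
from pathlib import Path

# Mirrors the constants: CACHE_DIR defaults to ".cache" unless overridden via the environment.
CACHE_DIR = Path(os.getenv("CACHE_DIR", ".cache"))
TOKENIZER_CACHE_PATH = CACHE_DIR / "tokenizer"
TOKENIZER_CACHE_PATH.mkdir(exist_ok=True, parents=True)

# The CLI stores one pickle per dataset under this directory.
print(TOKENIZER_CACHE_PATH / "sentiment140_tokenized.pkl")  # .cache/tokenizer/sentiment140_tokenized.pkl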
app/data.py CHANGED
@@ -89,7 +89,7 @@ def _clean(text: str) -> str:
 
     # Remove acronyms and abbreviations
     # text = re.sub(r"(?:[a-z]\.){2,}", "", text)
-    text = re.sub(r"(?:[a-z]\.?)(?:[a-z]\.)", "", text)
+    text = re.sub(r"\b(?:[a-z]\.?)(?:[a-z]\.)\b", "", text)
 
     # Remove honorifics
     text = re.sub(r"\b(?:mr|mrs|ms|dr|prof|sr|jr)\.?\b", "", text)
@@ -118,7 +118,7 @@ def _clean(text: str) -> str:
     return text.strip()
 
 
-def _lemmatize(doc: Doc, threshold: int = 2) -> Sequence[str]:
+def _lemmatize(doc: Doc, threshold: int = 3) -> Sequence[str]:
     """Lemmatize the provided text using spaCy.
 
     Args:
@@ -136,8 +136,8 @@ def _lemmatize(doc: Doc, threshold: int = 2) -> Sequence[str]:
         and not token.like_email  # Ignore email addresses
         and not token.like_url  # Ignore URLs
         and not token.like_num  # Ignore numbers
-        and not token.is_alpha  # Ignore non-alphabetic tokens
-        and not (len(tok := token.lemma_.lower().strip()) < threshold)  # Ignore short tokens
+        and token.is_alpha  # Ignore non-alphabetic tokens
+        and (len(tok := token.lemma_.lower().strip()) >= threshold)  # Ignore short tokens
     ]
 
 
@@ -145,7 +145,7 @@ def tokenize(
     text_data: Sequence[str],
     batch_size: int = 512,
     n_jobs: int = 4,
-    character_threshold: int = 2,
+    character_threshold: int = 3,
     show_progress: bool = True,
 ) -> Sequence[Sequence[str]]:
     """Tokenize the provided text using spaCy.
@@ -174,7 +174,7 @@ def tokenize(
     [
         _lemmatize(doc, character_threshold)
         for doc in tqdm(
-            nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs, disable=["parser", "ner", "tok2vec"]),
+            nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs, disable=["parser", "ner"]),
             total=len(text_data),
             desc="Lemmatization",
             unit="doc",
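The core of the "broken tokenization" fix is the inverted filter in `_lemmatize`: the old predicate kept only non-alphabetic lemmas of at least `threshold` characters and threw the real words away. A standalone sketch of just the two changed conditions, using a hypothetical `Tok` stand-in instead of spaCy tokens (the other filters, `like_email`, `like_url`, etc., are omitted here):

from dataclasses import dataclass

@dataclass
class Tok:
    # Minimal stand-in for the spaCy token attributes touched by this change.
    lemma_: str
    is_alpha: bool

def keep_old(token: Tok, threshold: int = 2) -> bool:
    # Old (buggy) predicate: only non-alphabetic lemmas of length >= threshold survive.
    return not token.is_alpha and not (len(token.lemma_.lower().strip()) < threshold)

def keep_new(token: Tok, threshold: int = 3) -> bool:
    # Fixed predicate: keep alphabetic lemmas of at least `threshold` characters.
    return token.is_alpha and len(token.lemma_.lower().strip()) >= threshold

tokens = [Tok("movie", True), Tok("be", True), Tok("123", False), Tok("!!", False)]

print([t.lemma_ for t in tokens if keep_old(t)])  # ['123', '!!']  (the real words were dropped)
print([t.lemma_ for t in tokens if keep_new(t)])  # ['movie']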
app/model.py CHANGED
@@ -10,7 +10,6 @@ from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer,
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
 from sklearn.pipeline import Pipeline
-from sklearn.svm import LinearSVC
 from tqdm import tqdm
 
 from app.constants import CACHE_DIR
@@ -132,7 +131,7 @@ def train_model(
     vectorizer = _get_vectorizer(vectorizer, max_features)
     classifiers = [
         (LogisticRegression(max_iter=1000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
-        (LinearSVC(max_iter=10000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
+        # (LinearSVC(max_iter=10000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
         # (KNeighborsClassifier(), {"n_neighbors": np.arange(1, 10)}),
         # (RandomForestClassifier(random_state=rs), {"n_estimators": np.arange(50, 500, 50)}),
         # (
  # (