Tymec committed on
Commit
3854a1f
1 Parent(s): 23e75e7

Create model in train_model

Browse files
Files changed (2) hide show
  1. app/cli.py +10 -4
  2. app/model.py +31 -40
app/cli.py CHANGED
@@ -230,7 +230,7 @@ def train(
230
 
231
  from app.constants import CACHE_DIR, MODELS_DIR
232
  from app.data import load_data, tokenize
233
- from app.model import create_model, train_model
234
 
235
  model_path = MODELS_DIR / f"{dataset}_tfidf_ft-{max_features}.pkl"
236
  if model_path.exists() and not force:
@@ -258,13 +258,19 @@ def train(
258
  del text_data
259
 
260
  click.echo("Training model... ")
261
- model = create_model(max_features, seed=None if seed == -1 else seed, verbose=verbose)
262
- trained_model, accuracy = train_model(model, token_data, label_data, folds=cv, seed=seed, verbose=verbose)
 
 
 
 
 
 
263
  click.echo("Model accuracy: ", nl=False)
264
  click.secho(f"{accuracy:.2%}", fg="blue")
265
 
266
  click.echo("Model saved to: ", nl=False)
267
- joblib.dump(trained_model, model_path, compress=3)
268
  click.secho(str(model_path), fg="blue")
269
 
270
 
 
230
 
231
  from app.constants import CACHE_DIR, MODELS_DIR
232
  from app.data import load_data, tokenize
233
+ from app.model import train_model
234
 
235
  model_path = MODELS_DIR / f"{dataset}_tfidf_ft-{max_features}.pkl"
236
  if model_path.exists() and not force:
 
258
  del text_data
259
 
260
  click.echo("Training model... ")
261
+ model, accuracy = train_model(
262
+ token_data,
263
+ label_data,
264
+ max_features=max_features,
265
+ folds=cv,
266
+ seed=seed,
267
+ verbose=verbose,
268
+ )
269
  click.echo("Model accuracy: ", nl=False)
270
  click.secho(f"{accuracy:.2%}", fg="blue")
271
 
272
  click.echo("Model saved to: ", nl=False)
273
+ joblib.dump(model, model_path, compress=3)
274
  click.secho(str(model_path), fg="blue")
275
 
276
 
app/model.py CHANGED
@@ -16,7 +16,7 @@ from app.data import tokenize
16
  if TYPE_CHECKING:
17
  from sklearn.base import BaseEstimator
18
 
19
- __all__ = ["create_model", "train_model", "evaluate_model", "infer_model"]
20
 
21
 
22
  def _identity(x: list[str]) -> list[str]:
@@ -31,46 +31,10 @@ def _identity(x: list[str]) -> list[str]:
31
  return x
32
 
33
 
34
- def create_model(
35
- max_features: int,
36
- seed: int | None = None,
37
- verbose: bool = False,
38
- ) -> Pipeline:
39
- """Create a sentiment analysis model.
40
-
41
- Args:
42
- max_features: Maximum number of features
43
- seed: Random seed (None for random seed)
44
- verbose: Whether to output additional information
45
-
46
- Returns:
47
- Untrained model
48
- """
49
- return Pipeline(
50
- [
51
- (
52
- "vectorizer",
53
- TfidfVectorizer(
54
- max_features=max_features,
55
- ngram_range=(1, 2),
56
- # disable text processing
57
- tokenizer=_identity,
58
- preprocessor=_identity,
59
- lowercase=False,
60
- token_pattern=None,
61
- ),
62
- ),
63
- ("classifier", LogisticRegression(max_iter=1000, random_state=seed)),
64
- ],
65
- memory=Memory(CACHE_DIR, verbose=0),
66
- verbose=verbose,
67
- )
68
-
69
-
70
  def train_model(
71
- model: BaseEstimator,
72
  token_data: list[str],
73
  label_data: list[int],
 
74
  folds: int = 5,
75
  seed: int = 42,
76
  verbose: bool = False,
@@ -81,6 +45,7 @@ def train_model(
81
  model: Untrained model
82
  token_data: Tokenized text data
83
  label_data: Label data
 
84
  folds: Number of cross-validation folds
85
  seed: Random seed (None for random seed)
86
  verbose: Whether to output additional information
@@ -100,6 +65,32 @@ def train_model(
100
  "classifier__solver": ["liblinear", "saga"],
101
  }
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  search = RandomizedSearchCV(
104
  model,
105
  param_distributions,
@@ -111,9 +102,9 @@ def train_model(
111
  verbose=verbose,
112
  )
113
 
114
- os.environ["PYTHONWARNINGS"] = "ignore"
115
  search.fit(text_train, label_train)
116
- del os.environ["PYTHONWARNINGS"]
117
 
118
  best_model = search.best_estimator_
119
  return best_model, best_model.score(text_test, label_test)
 
16
  if TYPE_CHECKING:
17
  from sklearn.base import BaseEstimator
18
 
19
+ __all__ = ["train_model", "evaluate_model", "infer_model"]
20
 
21
 
22
  def _identity(x: list[str]) -> list[str]:
 
31
  return x
32
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def train_model(
 
35
  token_data: list[str],
36
  label_data: list[int],
37
+ max_features: int,
38
  folds: int = 5,
39
  seed: int = 42,
40
  verbose: bool = False,
 
45
  model: Untrained model
46
  token_data: Tokenized text data
47
  label_data: Label data
48
+ max_features: Maximum number of features
49
  folds: Number of cross-validation folds
50
  seed: Random seed (None for random seed)
51
  verbose: Whether to output additional information
 
65
  "classifier__solver": ["liblinear", "saga"],
66
  }
67
 
68
+ model = Pipeline(
69
+ [
70
+ (
71
+ "vectorizer",
72
+ TfidfVectorizer(
73
+ max_features=max_features,
74
+ ngram_range=(1, 2),
75
+ # disable text processing
76
+ tokenizer=_identity,
77
+ preprocessor=_identity,
78
+ lowercase=False,
79
+ token_pattern=None,
80
+ ),
81
+ ),
82
+ (
83
+ "classifier",
84
+ LogisticRegression(
85
+ max_iter=1000,
86
+ random_state=None if seed == -1 else seed,
87
+ ),
88
+ ),
89
+ ],
90
+ memory=Memory(CACHE_DIR, verbose=0),
91
+ verbose=verbose,
92
+ )
93
+
94
  search = RandomizedSearchCV(
95
  model,
96
  param_distributions,
 
102
  verbose=verbose,
103
  )
104
 
105
+ # os.environ["PYTHONWARNINGS"] = "ignore"
106
  search.fit(text_train, label_train)
107
+ # del os.environ["PYTHONWARNINGS"]
108
 
109
  best_model = search.best_estimator_
110
  return best_model, best_model.score(text_test, label_test)