Spaces:
Runtime error
Runtime error
Improved models
Browse files- README.md +2 -2
- app/model.py +0 -6
- models/imdb50k_tfidf_ft20000.pkl +2 -2
- models/sentiment140_tfidf_ft20000.pkl +2 -2
README.md
CHANGED
|
@@ -211,8 +211,8 @@ graph LR
|
|
| 211 |
The following pre-trained models are available for use:
|
| 212 |
| Dataset | Vectorizer | Classifier | Features | Accuracy on test | Accuracy on self | Model |
|
| 213 |
| --- | --- | --- | --- | --- | --- | --- |
|
| 214 |
-
| `imdb50k` | `tfidf` | `LinearRegression` | 20 000 |
|
| 215 |
-
| `sentiment140` | `tfidf` | `LinearRegression` | 20 000 |
|
| 216 |
| `amazonreviews` | `tfidf` | `LinearRegression` | 20 000 | ❌ | ❌ | [Here](models/amazonreviews_tfidf_ft1048576.pkl) |
|
| 217 |
|
| 218 |
|
|
|
|
| 211 |
The following pre-trained models are available for use:
|
| 212 |
| Dataset | Vectorizer | Classifier | Features | Accuracy on test | Accuracy on self | Model |
|
| 213 |
| --- | --- | --- | --- | --- | --- | --- |
|
| 214 |
+
| `imdb50k` | `tfidf` | `LinearRegression` | 20 000 | 83.24% ± 0.99% | 89.24% ± 0.13% | [Here](models/imdb50k_tfidf_ft20000.pkl) |
|
| 215 |
+
| `sentiment140` | `tfidf` | `LinearRegression` | 20 000 | 83.24% ± 0.99% | 77.32% ± 0.28% | [Here](models/sentiment140_tfidf_ft20000.pkl) |
|
| 216 |
| `amazonreviews` | `tfidf` | `LinearRegression` | 20 000 | ❌ | ❌ | [Here](models/amazonreviews_tfidf_ft1048576.pkl) |
|
| 217 |
|
| 218 |
|
app/model.py
CHANGED
|
@@ -36,7 +36,6 @@ def _identity(x: list[str]) -> list[str]:
|
|
| 36 |
def _get_vectorizer(
|
| 37 |
name: Literal["tfidf", "count", "hashing"],
|
| 38 |
n_features: int,
|
| 39 |
-
df: tuple[float, float] = (1.0, 1.0),
|
| 40 |
ngram: tuple[int, int] = (1, 2),
|
| 41 |
) -> TransformerMixin:
|
| 42 |
"""Get the appropriate vectorizer.
|
|
@@ -44,7 +43,6 @@ def _get_vectorizer(
|
|
| 44 |
Args:
|
| 45 |
name: Type of vectorizer
|
| 46 |
n_features: Maximum number of features
|
| 47 |
-
df: Document frequency range [min_df, max_df] (ignored for HashingVectorizer)
|
| 48 |
ngram: N-gram range [min_n, max_n]
|
| 49 |
|
| 50 |
Returns:
|
|
@@ -66,15 +64,11 @@ def _get_vectorizer(
|
|
| 66 |
case "tfidf":
|
| 67 |
return TfidfVectorizer(
|
| 68 |
max_features=n_features,
|
| 69 |
-
min_df=df[0],
|
| 70 |
-
max_df=df[1],
|
| 71 |
**shared_params,
|
| 72 |
)
|
| 73 |
case "count":
|
| 74 |
return CountVectorizer(
|
| 75 |
max_features=n_features,
|
| 76 |
-
min_df=df[0],
|
| 77 |
-
max_df=df[1],
|
| 78 |
**shared_params,
|
| 79 |
)
|
| 80 |
case "hashing":
|
|
|
|
| 36 |
def _get_vectorizer(
|
| 37 |
name: Literal["tfidf", "count", "hashing"],
|
| 38 |
n_features: int,
|
|
|
|
| 39 |
ngram: tuple[int, int] = (1, 2),
|
| 40 |
) -> TransformerMixin:
|
| 41 |
"""Get the appropriate vectorizer.
|
|
|
|
| 43 |
Args:
|
| 44 |
name: Type of vectorizer
|
| 45 |
n_features: Maximum number of features
|
|
|
|
| 46 |
ngram: N-gram range [min_n, max_n]
|
| 47 |
|
| 48 |
Returns:
|
|
|
|
| 64 |
case "tfidf":
|
| 65 |
return TfidfVectorizer(
|
| 66 |
max_features=n_features,
|
|
|
|
|
|
|
| 67 |
**shared_params,
|
| 68 |
)
|
| 69 |
case "count":
|
| 70 |
return CountVectorizer(
|
| 71 |
max_features=n_features,
|
|
|
|
|
|
|
| 72 |
**shared_params,
|
| 73 |
)
|
| 74 |
case "hashing":
|
models/imdb50k_tfidf_ft20000.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0c916d380fc84a33f3cb5892cd10e4aaa29330cbbac4243860e91fe9392df897
|
| 3 |
+
size 398706
|
models/sentiment140_tfidf_ft20000.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1308cb96bbee2befeb585c99fb3ad78b4bbef0504fcb5070d8c738289c212431
|
| 3 |
+
size 397501
|