Spaces:
Sleeping
Sleeping
Parallelize text cleaning
Browse files
- app/data.py +13 -4
- app/utils.py +1 -1
app/data.py
CHANGED
@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, Literal, Sequence
|
|
9 |
import emoji
|
10 |
import pandas as pd
|
11 |
import spacy
|
|
|
12 |
from tqdm import tqdm
|
13 |
|
14 |
from app.constants import (
|
@@ -160,16 +161,24 @@ def tokenize(
|
|
160 |
Returns:
|
161 |
Tokenized text data
|
162 |
"""
|
163 |
-
text_data = [
|
164 |
-
_clean(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
165 |
for text in tqdm(
|
166 |
text_data,
|
167 |
desc="Cleaning",
|
168 |
unit="doc",
|
169 |
disable=not show_progress,
|
170 |
)
|
171 |
-
]
|
172 |
-
|
173 |
return pd.Series(
|
174 |
[
|
175 |
_lemmatize(doc, character_threshold)
|
|
|
9 |
import emoji
|
10 |
import pandas as pd
|
11 |
import spacy
|
12 |
+
from joblib import Parallel, delayed
|
13 |
from tqdm import tqdm
|
14 |
|
15 |
from app.constants import (
|
|
|
161 |
Returns:
|
162 |
Tokenized text data
|
163 |
"""
|
164 |
+
# text_data = [
|
165 |
+
# _clean(text)
|
166 |
+
# for text in tqdm(
|
167 |
+
# text_data,
|
168 |
+
# desc="Cleaning",
|
169 |
+
# unit="doc",
|
170 |
+
# disable=not show_progress,
|
171 |
+
# )
|
172 |
+
# ]
|
173 |
+
text_data = Parallel(n_jobs=n_jobs)(
|
174 |
+
delayed(_clean)(text)
|
175 |
for text in tqdm(
|
176 |
text_data,
|
177 |
desc="Cleaning",
|
178 |
unit="doc",
|
179 |
disable=not show_progress,
|
180 |
)
|
181 |
+
)
|
|
|
182 |
return pd.Series(
|
183 |
[
|
184 |
_lemmatize(doc, character_threshold)
|
app/utils.py
CHANGED
@@ -11,7 +11,7 @@ if TYPE_CHECKING:
|
|
11 |
__all__ = ["serialize", "deserialize"]
|
12 |
|
13 |
|
14 |
-
def serialize(data: Sequence[str | int], path: Path, max_size: int =
|
15 |
"""Serialize data to a file
|
16 |
|
17 |
Args:
|
|
|
11 |
__all__ = ["serialize", "deserialize"]
|
12 |
|
13 |
|
14 |
+
def serialize(data: Sequence[str | int], path: Path, max_size: int = 100_000, show_progress: bool = False) -> None:
|
15 |
"""Serialize data to a file
|
16 |
|
17 |
Args:
|