Spaces:
Running
Running
Update web.py
Browse files
web.py
CHANGED
|
@@ -399,21 +399,21 @@ def web_data():
|
|
| 399 |
),
|
| 400 |
P("We summarize other statistics-based rules originated from Gopher [7] in this section. The statistics can be used include:"),
|
| 401 |
Ul(
|
| 402 |
-
Li("the word count in the document", style = "margin-bottom:
|
| 403 |
-
Li("the mean word length", style = "margin-bottom:
|
| 404 |
-
Li("the number of sentences", style = "margin-bottom:
|
| 405 |
-
Li("the symbol-to-word ratio", style = "margin-bottom:
|
| 406 |
-
Li("the fraction of alphabetic words", style = "margin-bottom:
|
| 407 |
-
Li("and the number of stop words", style = "margin-bottom:
|
| 408 |
),
|
| 409 |
P("Specifically, we remove any document which satisfies any of the following criteria:"),
|
| 410 |
Ul(
|
| 411 |
-
Li("it contains less than 50 words or more than 100,000 words"),
|
| 412 |
-
Li("its mean word length is outside the range of 3 to 10"),
|
| 413 |
-
Li("it contains less than 3 sentences"),
|
| 414 |
-
Li("its symbol-to-word ratio is greater than 0.1"),
|
| 415 |
-
Li("the words that contain at least one alphabetic character are less than 80% of the whole words"),
|
| 416 |
-
Li("it contains less than two of the stop words (the, be, to, of, and, that, have, with"),
|
| 417 |
),
|
| 418 |
|
| 419 |
P("Following C4, we remove any page where the phrase “lorem ipsum” appears since some pages have placeholder “lorem ipsum” text."),
|
|
|
|
| 399 |
),
|
| 400 |
P("We summarize other statistics-based rules originated from Gopher [7] in this section. The statistics can be used include:"),
|
| 401 |
Ul(
|
| 402 |
+
Li("the word count in the document", style = "margin-bottom: 5px"),
|
| 403 |
+
Li("the mean word length", style = "margin-bottom: 5px"),
|
| 404 |
+
Li("the number of sentences", style = "margin-bottom: 5px"),
|
| 405 |
+
Li("the symbol-to-word ratio", style = "margin-bottom: 5px"),
|
| 406 |
+
Li("the fraction of alphabetic words", style = "margin-bottom: 5px"),
|
| 407 |
+
Li("and the number of stop words", style = "margin-bottom: 5px"),
|
| 408 |
),
|
| 409 |
P("Specifically, we remove any document which satisfies any of the following criteria:"),
|
| 410 |
Ul(
|
| 411 |
+
Li("it contains less than 50 words or more than 100,000 words", style = "margin-bottom: 5px"),
|
| 412 |
+
Li("its mean word length is outside the range of 3 to 10", style = "margin-bottom: 5px"),
|
| 413 |
+
Li("it contains less than 3 sentences", style = "margin-bottom: 5px"),
|
| 414 |
+
Li("its symbol-to-word ratio is greater than 0.1", style = "margin-bottom: 5px"),
|
| 415 |
+
Li("the words that contain at least one alphabetic character are less than 80% of the whole words", style = "margin-bottom: 5px"),
|
| 416 |
+
Li("it contains less than two of the stop words (the, be, to, of, and, that, have, with", style = "margin-bottom: 5px"),
|
| 417 |
),
|
| 418 |
|
| 419 |
P("Following C4, we remove any page where the phrase “lorem ipsum” appears since some pages have placeholder “lorem ipsum” text."),
|