Spaces:
Runtime error
Runtime error
Update web.py
Browse files
web.py
CHANGED
|
@@ -254,46 +254,14 @@ def web_data():
|
|
| 254 |
Li("Local Deduplication", style = "margin-bottom: 5px"),
|
| 255 |
Li("Each section is complete with code and comparisons to Dolma, DataTrove, and/or RedPajama-V-2", style = "margin-bottom: 5px"),
|
| 256 |
),
|
| 257 |
-
),
|
| 258 |
-
|
| 259 |
-
Div(
|
| 260 |
-
H2("Common Crawl Data Processing Summary"),
|
| 261 |
-
P(
|
| 262 |
-
"To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Starting from ",
|
| 263 |
-
A("Common Crawl", href="https://commoncrawl.org/"),
|
| 264 |
-
", our process can be summarized as five main steps: document preparation, line-level removal, document-level filtering, deduplication and PII removal.",
|
| 265 |
-
),
|
| 266 |
-
style="margin-top: 20px;",
|
| 267 |
-
),
|
| 268 |
-
Div(
|
| 269 |
-
Ul(
|
| 270 |
-
Li(
|
| 271 |
-
A(
|
| 272 |
-
"Raw Documentation",
|
| 273 |
-
href="https://drive.google.com/drive/folders/1mIJ-Zx8tRhohFdj4ByMToNz1u_9Saa8W?usp=drive_link",
|
| 274 |
-
)
|
| 275 |
-
),
|
| 276 |
-
Li(
|
| 277 |
-
A(
|
| 278 |
-
"Github link of Web Data Pipeline",
|
| 279 |
-
href="https://github.com/CIAI-LLM/WebDataProcessing.git",
|
| 280 |
-
)
|
| 281 |
-
),
|
| 282 |
-
),
|
| 283 |
-
style="""
|
| 284 |
-
background-color: #d4edda; /* Light green background */
|
| 285 |
-
border: 1px solid #c3e6cb; /* Green border */
|
| 286 |
-
border-radius: 5px;
|
| 287 |
-
padding: 15px 15px 0px 15px;
|
| 288 |
-
margin-bottom: 15px
|
| 289 |
-
""",
|
| 290 |
),
|
| 291 |
id="section1",),
|
| 292 |
Section(
|
| 293 |
H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
|
| 294 |
P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
|
| 295 |
table_div_filter_data,
|
| 296 |
-
P("The table below provides a comparison of the quality filters that have been applied to each dataset."),
|
| 297 |
table_div_qf_filter_data,
|
| 298 |
P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
|
| 299 |
Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
|
|
@@ -408,7 +376,7 @@ def web_data():
|
|
| 408 |
"""),
|
| 409 |
|
| 410 |
Details(
|
| 411 |
-
Summary("
|
| 412 |
Div (
|
| 413 |
DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
|
| 414 |
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
|
@@ -425,7 +393,7 @@ def web_data():
|
|
| 425 |
We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
|
| 426 |
"""),
|
| 427 |
Details(
|
| 428 |
-
Summary("6
|
| 429 |
Div (
|
| 430 |
DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
|
| 431 |
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
|
@@ -439,7 +407,7 @@ def web_data():
|
|
| 439 |
),
|
| 440 |
|
| 441 |
Details(
|
| 442 |
-
Summary("
|
| 443 |
Div(
|
| 444 |
DV(
|
| 445 |
"data/bad_url_doc.jsonl",
|
|
@@ -460,7 +428,7 @@ def web_data():
|
|
| 460 |
"""),
|
| 461 |
|
| 462 |
Details(
|
| 463 |
-
Summary("
|
| 464 |
Div (
|
| 465 |
DVS(
|
| 466 |
non_web_urls,
|
|
@@ -477,7 +445,7 @@ def web_data():
|
|
| 477 |
),
|
| 478 |
|
| 479 |
Details(
|
| 480 |
-
Summary("
|
| 481 |
Div (
|
| 482 |
DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
|
| 483 |
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
|
@@ -539,7 +507,7 @@ def web_data():
|
|
| 539 |
The additional keyword could be any one of “enable” / “disable” / “require” / “activate” / “browser”.
|
| 540 |
"""),
|
| 541 |
Details(
|
| 542 |
-
Summary("Javascript
|
| 543 |
Div (
|
| 544 |
DV(
|
| 545 |
"data/sample_java.jsonl",
|
|
@@ -589,7 +557,7 @@ def web_data():
|
|
| 589 |
the bad words from English but also consider the bad words from other languages.
|
| 590 |
"""),
|
| 591 |
Details(
|
| 592 |
-
Summary("
|
| 593 |
Div (
|
| 594 |
DVS(
|
| 595 |
json.load(open("data/toxic_lines.json")),
|
|
@@ -611,7 +579,7 @@ def web_data():
|
|
| 611 |
In this section, we introduce each quality signal used to filter out low-quality documents.
|
| 612 |
"""),
|
| 613 |
Details(
|
| 614 |
-
Summary("
|
| 615 |
Div (
|
| 616 |
DVS(
|
| 617 |
json.load(open("data/all_signals.json")),
|
|
@@ -732,7 +700,6 @@ def web_data():
|
|
| 732 |
We adjusted the method in Dolma for counting characters within lines by excluding whitespace. This modification
|
| 733 |
ensures consistency with the overall document character count calculation.
|
| 734 |
"""),
|
| 735 |
-
H3("TxT360 Implementation"),
|
| 736 |
Details(
|
| 737 |
Summary("TxT360 Implementation"),
|
| 738 |
Div(
|
|
@@ -1153,9 +1120,6 @@ def web_data():
|
|
| 1153 |
margin-bottom: 15px
|
| 1154 |
""",
|
| 1155 |
),
|
| 1156 |
-
H5(
|
| 1157 |
-
"Sample Documents Filtered by the Fraction of Characters in Duplicated N-grams (n=5,...,10)"
|
| 1158 |
-
),
|
| 1159 |
Details(
|
| 1160 |
Summary("Documents Filtered by Duplicated n-Grams (n=5,...,10)"),
|
| 1161 |
Div(
|
|
@@ -1300,13 +1264,22 @@ def web_data():
|
|
| 1300 |
Li("the words that contain at least one alphabetic character are less than 80% of the whole words", style = "margin-bottom: 5px"),
|
| 1301 |
Li("it contains fewer than two of the stop words (the, be, to, of, and, that, have, with)", style = "margin-bottom: 5px"),
|
| 1302 |
),
|
| 1303 |
-
H3("Word Count"),
|
| 1304 |
Details(
|
|
|
|
| 1305 |
Summary("Implementations from Dolma"),
|
| 1306 |
D_code("""
|
| 1307 |
words = text.split()
|
| 1308 |
word_count = len(words)
|
| 1309 |
""", block="block", language="python"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1310 |
),
|
| 1311 |
Details(
|
| 1312 |
Summary("Implementations from RedPajama-V2"),
|
|
|
|
| 254 |
Li("Local Deduplication", style = "margin-bottom: 5px"),
|
| 255 |
Li("Each section is complete with code and comparisons to Dolma, DataTrove, and/or RedPajama-V-2", style = "margin-bottom: 5px"),
|
| 256 |
),
|
| 257 |
+
P("To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Below is a comprehensive list of the datasets we reviewed and a comparison of the filters we have applied."),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
),
|
| 259 |
id="section1",),
|
| 260 |
Section(
|
| 261 |
H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
|
| 262 |
P("The following section provides explicit details covering the reasoning and decisions behind each of the filters we applied. The table below provides a high-level comparison of TxT360's filtering compared to other commonly used pretraining datasets."),
|
| 263 |
table_div_filter_data,
|
| 264 |
+
P("The table below provides a comparison of the quality filters that have been applied to each dataset. Of note, TxT360 does not use any machine learning (ML) based filters. ML filters are a useful and efficient filtering process that should be considered for any filtering project. However, we are leaving that option to TxT360's end users."),
|
| 265 |
table_div_qf_filter_data,
|
| 266 |
P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across dumps. "),
|
| 267 |
Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
|
|
|
|
| 376 |
"""),
|
| 377 |
|
| 378 |
Details(
|
| 379 |
+
Summary(" List of 24 URLs with 4k+ Matches"),
|
| 380 |
Div (
|
| 381 |
DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
|
| 382 |
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
|
|
|
| 393 |
We manually removed the following 6 domains from the UT1 blocklist so that they will not be removed from our dataset.
|
| 394 |
"""),
|
| 395 |
Details(
|
| 396 |
+
Summary("6 URLS Manually Removed from the Blocklist"),
|
| 397 |
Div (
|
| 398 |
DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
|
| 399 |
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
|
|
|
| 407 |
),
|
| 408 |
|
| 409 |
Details(
|
| 410 |
+
Summary("Blocked Document Examples from the URL Blocklist"),
|
| 411 |
Div(
|
| 412 |
DV(
|
| 413 |
"data/bad_url_doc.jsonl",
|
|
|
|
| 428 |
"""),
|
| 429 |
|
| 430 |
Details(
|
| 431 |
+
Summary("TxT360 Excluded URLs"),
|
| 432 |
Div (
|
| 433 |
DVS(
|
| 434 |
non_web_urls,
|
|
|
|
| 445 |
),
|
| 446 |
|
| 447 |
Details(
|
| 448 |
+
Summary("TxT360 Excluded URLs Example Documents"),
|
| 449 |
Div (
|
| 450 |
DV("data/sample_url_exclusion.json", 0, "Sample documents whose urls are in our curated url domain list"),
|
| 451 |
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
|
|
|
| 507 |
The additional keyword could be any one of “enable” / “disable” / “require” / “activate” / “browser”.
|
| 508 |
"""),
|
| 509 |
Details(
|
| 510 |
+
Summary("Javascript Documents Filtered by C4 but Kept in TxT360"),
|
| 511 |
Div (
|
| 512 |
DV(
|
| 513 |
"data/sample_java.jsonl",
|
|
|
|
| 557 |
the bad words from English but also consider the bad words from other languages.
|
| 558 |
"""),
|
| 559 |
Details(
|
| 560 |
+
Summary("Toxic Line Examples (WARNING: MAY CONTAIN OFFENSIVE MATERIAL)"),
|
| 561 |
Div (
|
| 562 |
DVS(
|
| 563 |
json.load(open("data/toxic_lines.json")),
|
|
|
|
| 579 |
In this section, we introduce each quality signal used to filter out low-quality documents.
|
| 580 |
"""),
|
| 581 |
Details(
|
| 582 |
+
Summary("Quality Signals Used For Filtering"),
|
| 583 |
Div (
|
| 584 |
DVS(
|
| 585 |
json.load(open("data/all_signals.json")),
|
|
|
|
| 700 |
We adjusted the method in Dolma for counting characters within lines by excluding whitespace. This modification
|
| 701 |
ensures consistency with the overall document character count calculation.
|
| 702 |
"""),
|
|
|
|
| 703 |
Details(
|
| 704 |
Summary("TxT360 Implementation"),
|
| 705 |
Div(
|
|
|
|
| 1120 |
margin-bottom: 15px
|
| 1121 |
""",
|
| 1122 |
),
|
|
|
|
|
|
|
|
|
|
| 1123 |
Details(
|
| 1124 |
Summary("Documents Filtered by Duplicated n-Grams (n=5,...,10)"),
|
| 1125 |
Div(
|
|
|
|
| 1264 |
Li("the words that contain at least one alphabetic character are less than 80% of the whole words", style = "margin-bottom: 5px"),
|
| 1265 |
Li("it contains fewer than two of the stop words (the, be, to, of, and, that, have, with)", style = "margin-bottom: 5px"),
|
| 1266 |
),
|
| 1267 |
+
H3("Word Count Filters"),
|
| 1268 |
Details(
|
| 1269 |
+
Div(
|
| 1270 |
Summary("Implementations from Dolma"),
|
| 1271 |
D_code("""
|
| 1272 |
words = text.split()
|
| 1273 |
word_count = len(words)
|
| 1274 |
""", block="block", language="python"),
|
| 1275 |
+
style="background-color: white; padding: 15px; margin-top: 10px; margin-bottom: 10px; border-radius: 8px; border: none; " # Styling for the DV2 part
|
| 1276 |
+
),
|
| 1277 |
+
style="""
|
| 1278 |
+
background-color: #EAFFF1; /* Light green background */
|
| 1279 |
+
padding: 15px;
|
| 1280 |
+
border-radius: 12px;
|
| 1281 |
+
margin-bottom: 15px
|
| 1282 |
+
""",
|
| 1283 |
),
|
| 1284 |
Details(
|
| 1285 |
Summary("Implementations from RedPajama-V2"),
|