Spaces:
Running
Running
change background to lighter color
Browse files
change pink to lighter one
add margin-bottom
web.py
CHANGED
|
@@ -240,6 +240,7 @@ def web_data():
|
|
| 240 |
border: 1px solid #c3e6cb; /* Green border */
|
| 241 |
border-radius: 5px;
|
| 242 |
padding: 15px 15px 0px 15px;
|
|
|
|
| 243 |
""",
|
| 244 |
),
|
| 245 |
H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
|
|
@@ -301,6 +302,7 @@ def web_data():
|
|
| 301 |
padding: 15px;
|
| 302 |
# border: 1px solid #949494; /* Grey border */
|
| 303 |
border-radius: 12px;
|
|
|
|
| 304 |
""", #https://colors.muz.li/palette/d3d3d3/949494/d3d3d3/d3d3d3/949494
|
| 305 |
),
|
| 306 |
#DV2("data/sample_wet.json", "data/sample_warc.json", 3),
|
|
@@ -316,9 +318,10 @@ def web_data():
|
|
| 316 |
Summary("Non-English Documents"),
|
| 317 |
DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
|
| 318 |
style="""
|
| 319 |
-
background-color: #
|
| 320 |
padding: 15px;
|
| 321 |
border-radius: 12px;
|
|
|
|
| 322 |
""",
|
| 323 |
),
|
| 324 |
|
|
@@ -331,6 +334,7 @@ def web_data():
|
|
| 331 |
background-color: #EAFFF1; /* Light green background */
|
| 332 |
padding: 15px;
|
| 333 |
border-radius: 12px;
|
|
|
|
| 334 |
""",
|
| 335 |
),
|
| 336 |
|
|
@@ -350,9 +354,10 @@ def web_data():
|
|
| 350 |
Summary("24 URL domains with more than 4k matches"),
|
| 351 |
DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
|
| 352 |
style="""
|
| 353 |
-
background-color: #
|
| 354 |
padding: 15px;
|
| 355 |
border-radius: 12px;
|
|
|
|
| 356 |
""",
|
| 357 |
),
|
| 358 |
|
|
@@ -363,9 +368,10 @@ def web_data():
|
|
| 363 |
Summary("6 url domains that are removed from the blocklist"),
|
| 364 |
DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
|
| 365 |
style="""
|
| 366 |
-
background-color: #
|
| 367 |
padding: 15px;
|
| 368 |
border-radius: 12px;
|
|
|
|
| 369 |
""",
|
| 370 |
),
|
| 371 |
|
|
@@ -377,9 +383,10 @@ def web_data():
|
|
| 377 |
"Sample documents whose urls are blocked by the refined url blocklist",
|
| 378 |
),
|
| 379 |
style="""
|
| 380 |
-
background-color: #
|
| 381 |
padding: 15px;
|
| 382 |
border-radius: 12px;
|
|
|
|
| 383 |
""",
|
| 384 |
),
|
| 385 |
|
|
@@ -395,9 +402,10 @@ def web_data():
|
|
| 395 |
"curated url domains that are excluded from our dataset",
|
| 396 |
),
|
| 397 |
style="""
|
| 398 |
-
background-color: #
|
| 399 |
padding: 15px;
|
| 400 |
border-radius: 12px;
|
|
|
|
| 401 |
""",
|
| 402 |
),
|
| 403 |
|
|
@@ -408,6 +416,7 @@ def web_data():
|
|
| 408 |
background-color: #EAFFF1; /* Light green background */
|
| 409 |
padding: 15px;
|
| 410 |
border-radius: 12px;
|
|
|
|
| 411 |
""",
|
| 412 |
),
|
| 413 |
|
|
@@ -438,9 +447,10 @@ def web_data():
|
|
| 438 |
"Sample documents with lines that are removed by the rule of terminal punctuation",
|
| 439 |
),
|
| 440 |
style="""
|
| 441 |
-
background-color: #
|
| 442 |
padding: 15px;
|
| 443 |
border-radius: 12px;
|
|
|
|
| 444 |
""",
|
| 445 |
),
|
| 446 |
|
|
@@ -464,9 +474,10 @@ def web_data():
|
|
| 464 |
"Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
|
| 465 |
),
|
| 466 |
style="""
|
| 467 |
-
background-color: #
|
| 468 |
padding: 15px;
|
| 469 |
border-radius: 12px;
|
|
|
|
| 470 |
""",
|
| 471 |
),
|
| 472 |
H3("2.2 Other Rules from RefinedWeb"),
|
|
@@ -487,9 +498,10 @@ def web_data():
|
|
| 487 |
"Sample documents with lines that are removed by the RefinedWeb rules",
|
| 488 |
),
|
| 489 |
style="""
|
| 490 |
-
background-color: #
|
| 491 |
padding: 15px;
|
| 492 |
border-radius: 12px;
|
|
|
|
| 493 |
""",
|
| 494 |
),
|
| 495 |
H3("2.3 Toxic Lines"),
|
|
@@ -507,9 +519,10 @@ def web_data():
|
|
| 507 |
"Sample documents with toxic lines",
|
| 508 |
),
|
| 509 |
style="""
|
| 510 |
-
background-color: #
|
| 511 |
padding: 15px;
|
| 512 |
border-radius: 12px;
|
|
|
|
| 513 |
""",
|
| 514 |
),
|
| 515 |
|
|
@@ -527,6 +540,7 @@ def web_data():
|
|
| 527 |
background-color: #EAFFF1; /* Light green background */
|
| 528 |
padding: 15px;
|
| 529 |
border-radius: 12px;
|
|
|
|
| 530 |
""",
|
| 531 |
),
|
| 532 |
P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
|
|
@@ -570,6 +584,7 @@ def web_data():
|
|
| 570 |
background-color: #FFFAEA; /* Light yellow background */
|
| 571 |
padding: 15px;
|
| 572 |
border-radius: 12px;
|
|
|
|
| 573 |
""",
|
| 574 |
),
|
| 575 |
Details(
|
|
@@ -609,6 +624,7 @@ def web_data():
|
|
| 609 |
background-color: #FFFAEA; /* Light yellow background */
|
| 610 |
padding: 15px;
|
| 611 |
border-radius: 12px;
|
|
|
|
| 612 |
""",
|
| 613 |
),
|
| 614 |
P("""
|
|
@@ -655,6 +671,7 @@ def web_data():
|
|
| 655 |
background-color: #EAFFF1; /* Light green background */
|
| 656 |
padding: 15px;
|
| 657 |
border-radius: 12px;
|
|
|
|
| 658 |
""",
|
| 659 |
),
|
| 660 |
Details(
|
|
@@ -668,6 +685,7 @@ def web_data():
|
|
| 668 |
background-color: #EAFFF1; /* Light green background */
|
| 669 |
padding: 15px;
|
| 670 |
border-radius: 12px;
|
|
|
|
| 671 |
""",
|
| 672 |
),
|
| 673 |
H3("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
|
|
@@ -696,6 +714,7 @@ def web_data():
|
|
| 696 |
background-color: #FFFAEA; /* Light yellow background */
|
| 697 |
padding: 15px;
|
| 698 |
border-radius: 12px;
|
|
|
|
| 699 |
""",
|
| 700 |
),
|
| 701 |
Details(
|
|
@@ -739,6 +758,7 @@ def web_data():
|
|
| 739 |
background-color: #FFFAEA; /* Light yellow background */
|
| 740 |
padding: 15px;
|
| 741 |
border-radius: 12px;
|
|
|
|
| 742 |
""",
|
| 743 |
),
|
| 744 |
|
|
@@ -767,6 +787,7 @@ def web_data():
|
|
| 767 |
background-color: #FFFAEA; /* Light yellow background */
|
| 768 |
padding: 15px;
|
| 769 |
border-radius: 12px;
|
|
|
|
| 770 |
""",
|
| 771 |
),
|
| 772 |
P("""
|
|
@@ -799,6 +820,7 @@ def web_data():
|
|
| 799 |
background-color: #EAFFF1; /* Light green background */
|
| 800 |
padding: 15px;
|
| 801 |
border-radius: 12px;
|
|
|
|
| 802 |
""",
|
| 803 |
),
|
| 804 |
Details(
|
|
@@ -812,6 +834,7 @@ def web_data():
|
|
| 812 |
background-color: #EAFFF1; /* Light green background */
|
| 813 |
padding: 15px;
|
| 814 |
border-radius: 12px;
|
|
|
|
| 815 |
""",
|
| 816 |
),
|
| 817 |
H3("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
|
|
@@ -843,6 +866,7 @@ def web_data():
|
|
| 843 |
background-color: #FFFAEA; /* Light yellow background */
|
| 844 |
padding: 15px;
|
| 845 |
border-radius: 12px;
|
|
|
|
| 846 |
""",
|
| 847 |
),
|
| 848 |
Details(
|
|
@@ -901,6 +925,7 @@ def web_data():
|
|
| 901 |
background-color: #FFFAEA; /* Light yellow background */
|
| 902 |
padding: 15px;
|
| 903 |
border-radius: 12px;
|
|
|
|
| 904 |
""",
|
| 905 |
),
|
| 906 |
|
|
@@ -931,6 +956,7 @@ def web_data():
|
|
| 931 |
background-color: #FFFAEA; /* Light yellow background */
|
| 932 |
padding: 15px;
|
| 933 |
border-radius: 12px;
|
|
|
|
| 934 |
""",
|
| 935 |
),
|
| 936 |
P("""
|
|
@@ -989,6 +1015,7 @@ def web_data():
|
|
| 989 |
background-color: #EAFFF1; /* Light green background */
|
| 990 |
padding: 15px;
|
| 991 |
border-radius: 12px;
|
|
|
|
| 992 |
""",
|
| 993 |
),
|
| 994 |
Details(
|
|
@@ -1008,6 +1035,7 @@ def web_data():
|
|
| 1008 |
background-color: #EAFFF1; /* Light green background */
|
| 1009 |
padding: 15px;
|
| 1010 |
border-radius: 12px;
|
|
|
|
| 1011 |
""",
|
| 1012 |
),
|
| 1013 |
H5(
|
|
@@ -1024,6 +1052,7 @@ def web_data():
|
|
| 1024 |
background-color: #EAFFF1; /* Light green background */
|
| 1025 |
padding: 15px;
|
| 1026 |
border-radius: 12px;
|
|
|
|
| 1027 |
""",
|
| 1028 |
),
|
| 1029 |
H3("3.2 Line-wise Heuristics"),
|
|
@@ -1055,6 +1084,7 @@ def web_data():
|
|
| 1055 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1056 |
padding: 15px;
|
| 1057 |
border-radius: 12px;
|
|
|
|
| 1058 |
""",
|
| 1059 |
),
|
| 1060 |
Details(
|
|
@@ -1104,6 +1134,7 @@ def web_data():
|
|
| 1104 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1105 |
padding: 15px;
|
| 1106 |
border-radius: 12px;
|
|
|
|
| 1107 |
""",
|
| 1108 |
),
|
| 1109 |
|
|
@@ -1119,6 +1150,7 @@ def web_data():
|
|
| 1119 |
background-color: #EAFFF1; /* Light green background */
|
| 1120 |
padding: 15px;
|
| 1121 |
border-radius: 12px;
|
|
|
|
| 1122 |
""",
|
| 1123 |
),
|
| 1124 |
|
|
@@ -1184,6 +1216,7 @@ def web_data():
|
|
| 1184 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1185 |
padding: 15px;
|
| 1186 |
border-radius: 12px;
|
|
|
|
| 1187 |
""",
|
| 1188 |
),
|
| 1189 |
|
|
@@ -1200,6 +1233,7 @@ def web_data():
|
|
| 1200 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1201 |
padding: 15px;
|
| 1202 |
border-radius: 12px;
|
|
|
|
| 1203 |
""",
|
| 1204 |
),
|
| 1205 |
P("""
|
|
@@ -1249,6 +1283,7 @@ def web_data():
|
|
| 1249 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1250 |
padding: 15px;
|
| 1251 |
border-radius: 12px;
|
|
|
|
| 1252 |
""",
|
| 1253 |
),
|
| 1254 |
P("""
|
|
@@ -1270,6 +1305,7 @@ def web_data():
|
|
| 1270 |
background-color: #EAFFF1; /* Light green background */
|
| 1271 |
padding: 15px;
|
| 1272 |
border-radius: 12px;
|
|
|
|
| 1273 |
""",
|
| 1274 |
),
|
| 1275 |
|
|
@@ -1291,6 +1327,7 @@ def web_data():
|
|
| 1291 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1292 |
padding: 15px;
|
| 1293 |
border-radius: 12px;
|
|
|
|
| 1294 |
""",
|
| 1295 |
),
|
| 1296 |
Details(
|
|
@@ -1322,6 +1359,7 @@ def web_data():
|
|
| 1322 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1323 |
padding: 15px;
|
| 1324 |
border-radius: 12px;
|
|
|
|
| 1325 |
""",
|
| 1326 |
),
|
| 1327 |
|
|
@@ -1337,6 +1375,7 @@ def web_data():
|
|
| 1337 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1338 |
padding: 15px;
|
| 1339 |
border-radius: 12px;
|
|
|
|
| 1340 |
""",
|
| 1341 |
),
|
| 1342 |
Details(
|
|
@@ -1352,6 +1391,7 @@ def web_data():
|
|
| 1352 |
background-color: #EAFFF1; /* Light green background */
|
| 1353 |
padding: 15px;
|
| 1354 |
border-radius: 12px;
|
|
|
|
| 1355 |
""",
|
| 1356 |
),
|
| 1357 |
|
|
@@ -1367,6 +1407,7 @@ def web_data():
|
|
| 1367 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1368 |
padding: 15px;
|
| 1369 |
border-radius: 12px;
|
|
|
|
| 1370 |
""",
|
| 1371 |
),
|
| 1372 |
Details(
|
|
@@ -1396,6 +1437,7 @@ def web_data():
|
|
| 1396 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1397 |
padding: 15px;
|
| 1398 |
border-radius: 12px;
|
|
|
|
| 1399 |
""",
|
| 1400 |
),
|
| 1401 |
Details(
|
|
@@ -1412,6 +1454,7 @@ def web_data():
|
|
| 1412 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1413 |
padding: 15px;
|
| 1414 |
border-radius: 12px;
|
|
|
|
| 1415 |
""",
|
| 1416 |
),
|
| 1417 |
P("""
|
|
@@ -1443,6 +1486,7 @@ def web_data():
|
|
| 1443 |
background-color: #EAFFF1; /* Light green background */
|
| 1444 |
padding: 15px;
|
| 1445 |
border-radius: 12px;
|
|
|
|
| 1446 |
""",
|
| 1447 |
),
|
| 1448 |
H3("3.4 Others"),
|
|
@@ -1455,9 +1499,10 @@ def web_data():
|
|
| 1455 |
Summary("Sample documents containing 'lorem ipsum'"),
|
| 1456 |
DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
|
| 1457 |
style="""
|
| 1458 |
-
background-color: #
|
| 1459 |
padding: 15px;
|
| 1460 |
border-radius: 12px;
|
|
|
|
| 1461 |
""",
|
| 1462 |
),
|
| 1463 |
H2("4. Deduplication"),
|
|
|
|
| 240 |
border: 1px solid #c3e6cb; /* Green border */
|
| 241 |
border-radius: 5px;
|
| 242 |
padding: 15px 15px 0px 15px;
|
| 243 |
+
margin-bottom: 15px;
|
| 244 |
""",
|
| 245 |
),
|
| 246 |
H3("TxT360 CommonCrawl Filtering vs Other Pretraining Datasets"),
|
|
|
|
| 302 |
padding: 15px;
|
| 303 |
# border: 1px solid #949494; /* Grey border */
|
| 304 |
border-radius: 12px;
|
| 305 |
+
margin-bottom: 15px;
|
| 306 |
""", #https://colors.muz.li/palette/d3d3d3/949494/d3d3d3/d3d3d3/949494
|
| 307 |
),
|
| 308 |
#DV2("data/sample_wet.json", "data/sample_warc.json", 3),
|
|
|
|
| 318 |
Summary("Non-English Documents"),
|
| 319 |
DV("data/sample_non_en.json", 3, "Sample documents that are classified as non-English"),
|
| 320 |
style="""
|
| 321 |
+
background-color: #FAEAEA; /* Light pink background */
|
| 322 |
padding: 15px;
|
| 323 |
border-radius: 12px;
|
| 324 |
+
margin-bottom: 15px;
|
| 325 |
""",
|
| 326 |
),
|
| 327 |
|
|
|
|
| 334 |
background-color: #EAFFF1; /* Light green background */
|
| 335 |
padding: 15px;
|
| 336 |
border-radius: 12px;
|
| 337 |
+
margin-bottom: 15px;
|
| 338 |
""",
|
| 339 |
),
|
| 340 |
|
|
|
|
| 354 |
Summary("24 URL domains with more than 4k matches"),
|
| 355 |
DVS(urls_high_matches, "24 URL domains with more than 4k matches"),
|
| 356 |
style="""
|
| 357 |
+
background-color: #FAEAEA; /* Light pink background */
|
| 358 |
padding: 15px;
|
| 359 |
border-radius: 12px;
|
| 360 |
+
margin-bottom: 15px;
|
| 361 |
""",
|
| 362 |
),
|
| 363 |
|
|
|
|
| 368 |
Summary("6 url domains that are removed from the blocklist"),
|
| 369 |
DVS(urls_false_positives, "6 url domains that are removed from the blocklist"),
|
| 370 |
style="""
|
| 371 |
+
background-color: #FAEAEA; /* Light pink background */
|
| 372 |
padding: 15px;
|
| 373 |
border-radius: 12px;
|
| 374 |
+
margin-bottom: 15px;
|
| 375 |
""",
|
| 376 |
),
|
| 377 |
|
|
|
|
| 383 |
"Sample documents whose urls are blocked by the refined url blocklist",
|
| 384 |
),
|
| 385 |
style="""
|
| 386 |
+
background-color: #FAEAEA; /* Light pink background */
|
| 387 |
padding: 15px;
|
| 388 |
border-radius: 12px;
|
| 389 |
+
margin-bottom: 15px;
|
| 390 |
""",
|
| 391 |
),
|
| 392 |
|
|
|
|
| 402 |
"curated url domains that are excluded from our dataset",
|
| 403 |
),
|
| 404 |
style="""
|
| 405 |
+
background-color: #FAEAEA; /* Light pink background */
|
| 406 |
padding: 15px;
|
| 407 |
border-radius: 12px;
|
| 408 |
+
margin-bottom: 15px;
|
| 409 |
""",
|
| 410 |
),
|
| 411 |
|
|
|
|
| 416 |
background-color: #EAFFF1; /* Light green background */
|
| 417 |
padding: 15px;
|
| 418 |
border-radius: 12px;
|
| 419 |
+
margin-bottom: 15px;
|
| 420 |
""",
|
| 421 |
),
|
| 422 |
|
|
|
|
| 447 |
"Sample documents with lines that are removed by the rule of terminal punctuation",
|
| 448 |
),
|
| 449 |
style="""
|
| 450 |
+
background-color: #FAEAEA; /* Light pink background */
|
| 451 |
padding: 15px;
|
| 452 |
border-radius: 12px;
|
| 453 |
+
margin-bottom: 15px;
|
| 454 |
""",
|
| 455 |
),
|
| 456 |
|
|
|
|
| 474 |
"Sample documents that are removed by original C4 javascript rule but are kept after our refinement",
|
| 475 |
),
|
| 476 |
style="""
|
| 477 |
+
background-color: #FAEAEA; /* Light pink background */
|
| 478 |
padding: 15px;
|
| 479 |
border-radius: 12px;
|
| 480 |
+
margin-bottom: 15px;
|
| 481 |
""",
|
| 482 |
),
|
| 483 |
H3("2.2 Other Rules from RefinedWeb"),
|
|
|
|
| 498 |
"Sample documents with lines that are removed by the RefinedWeb rules",
|
| 499 |
),
|
| 500 |
style="""
|
| 501 |
+
background-color: #FAEAEA; /* Light pink background */
|
| 502 |
padding: 15px;
|
| 503 |
border-radius: 12px;
|
| 504 |
+
margin-bottom: 15px;
|
| 505 |
""",
|
| 506 |
),
|
| 507 |
H3("2.3 Toxic Lines"),
|
|
|
|
| 519 |
"Sample documents with toxic lines",
|
| 520 |
),
|
| 521 |
style="""
|
| 522 |
+
background-color: #FAEAEA; /* Light pink background */
|
| 523 |
padding: 15px;
|
| 524 |
border-radius: 12px;
|
| 525 |
+
margin-bottom: 15px;
|
| 526 |
""",
|
| 527 |
),
|
| 528 |
|
|
|
|
| 540 |
background-color: #EAFFF1; /* Light green background */
|
| 541 |
padding: 15px;
|
| 542 |
border-radius: 12px;
|
| 543 |
+
margin-bottom: 15px;
|
| 544 |
""",
|
| 545 |
),
|
| 546 |
P("""Similar to previous sections, we will present sample documents filtered out by the given quality signals.
|
|
|
|
| 584 |
background-color: #FFFAEA; /* Light yellow background */
|
| 585 |
padding: 15px;
|
| 586 |
border-radius: 12px;
|
| 587 |
+
margin-bottom: 15px;
|
| 588 |
""",
|
| 589 |
),
|
| 590 |
Details(
|
|
|
|
| 624 |
background-color: #FFFAEA; /* Light yellow background */
|
| 625 |
padding: 15px;
|
| 626 |
border-radius: 12px;
|
| 627 |
+
margin-bottom: 15px;
|
| 628 |
""",
|
| 629 |
),
|
| 630 |
P("""
|
|
|
|
| 671 |
background-color: #EAFFF1; /* Light green background */
|
| 672 |
padding: 15px;
|
| 673 |
border-radius: 12px;
|
| 674 |
+
margin-bottom: 15px;
|
| 675 |
""",
|
| 676 |
),
|
| 677 |
Details(
|
|
|
|
| 685 |
background-color: #EAFFF1; /* Light green background */
|
| 686 |
padding: 15px;
|
| 687 |
border-radius: 12px;
|
| 688 |
+
margin-bottom: 15px;
|
| 689 |
""",
|
| 690 |
),
|
| 691 |
H3("3.1.2 Fraction of Characters in the Most Common N-grams (n=2,3,4)"),
|
|
|
|
| 714 |
background-color: #FFFAEA; /* Light yellow background */
|
| 715 |
padding: 15px;
|
| 716 |
border-radius: 12px;
|
| 717 |
+
margin-bottom: 15px;
|
| 718 |
""",
|
| 719 |
),
|
| 720 |
Details(
|
|
|
|
| 758 |
background-color: #FFFAEA; /* Light yellow background */
|
| 759 |
padding: 15px;
|
| 760 |
border-radius: 12px;
|
| 761 |
+
margin-bottom: 15px;
|
| 762 |
""",
|
| 763 |
),
|
| 764 |
|
|
|
|
| 787 |
background-color: #FFFAEA; /* Light yellow background */
|
| 788 |
padding: 15px;
|
| 789 |
border-radius: 12px;
|
| 790 |
+
margin-bottom: 15px;
|
| 791 |
""",
|
| 792 |
),
|
| 793 |
P("""
|
|
|
|
| 820 |
background-color: #EAFFF1; /* Light green background */
|
| 821 |
padding: 15px;
|
| 822 |
border-radius: 12px;
|
| 823 |
+
margin-bottom: 15px;
|
| 824 |
""",
|
| 825 |
),
|
| 826 |
Details(
|
|
|
|
| 834 |
background-color: #EAFFF1; /* Light green background */
|
| 835 |
padding: 15px;
|
| 836 |
border-radius: 12px;
|
| 837 |
+
margin-bottom: 15px;
|
| 838 |
""",
|
| 839 |
),
|
| 840 |
H3("3.1.3 Fraction of Characters in Duplicated N-grams (n=5,...,10)"),
|
|
|
|
| 866 |
background-color: #FFFAEA; /* Light yellow background */
|
| 867 |
padding: 15px;
|
| 868 |
border-radius: 12px;
|
| 869 |
+
margin-bottom: 15px;
|
| 870 |
""",
|
| 871 |
),
|
| 872 |
Details(
|
|
|
|
| 925 |
background-color: #FFFAEA; /* Light yellow background */
|
| 926 |
padding: 15px;
|
| 927 |
border-radius: 12px;
|
| 928 |
+
margin-bottom: 15px;
|
| 929 |
""",
|
| 930 |
),
|
| 931 |
|
|
|
|
| 956 |
background-color: #FFFAEA; /* Light yellow background */
|
| 957 |
padding: 15px;
|
| 958 |
border-radius: 12px;
|
| 959 |
+
margin-bottom: 15px;
|
| 960 |
""",
|
| 961 |
),
|
| 962 |
P("""
|
|
|
|
| 1015 |
background-color: #EAFFF1; /* Light green background */
|
| 1016 |
padding: 15px;
|
| 1017 |
border-radius: 12px;
|
| 1018 |
+
margin-bottom: 15px;
|
| 1019 |
""",
|
| 1020 |
),
|
| 1021 |
Details(
|
|
|
|
| 1035 |
background-color: #EAFFF1; /* Light green background */
|
| 1036 |
padding: 15px;
|
| 1037 |
border-radius: 12px;
|
| 1038 |
+
margin-bottom: 15px;
|
| 1039 |
""",
|
| 1040 |
),
|
| 1041 |
H5(
|
|
|
|
| 1052 |
background-color: #EAFFF1; /* Light green background */
|
| 1053 |
padding: 15px;
|
| 1054 |
border-radius: 12px;
|
| 1055 |
+
margin-bottom: 15px;
|
| 1056 |
""",
|
| 1057 |
),
|
| 1058 |
H3("3.2 Line-wise Heuristics"),
|
|
|
|
| 1084 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1085 |
padding: 15px;
|
| 1086 |
border-radius: 12px;
|
| 1087 |
+
margin-bottom: 15px;
|
| 1088 |
""",
|
| 1089 |
),
|
| 1090 |
Details(
|
|
|
|
| 1134 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1135 |
padding: 15px;
|
| 1136 |
border-radius: 12px;
|
| 1137 |
+
margin-bottom: 15px;
|
| 1138 |
""",
|
| 1139 |
),
|
| 1140 |
|
|
|
|
| 1150 |
background-color: #EAFFF1; /* Light green background */
|
| 1151 |
padding: 15px;
|
| 1152 |
border-radius: 12px;
|
| 1153 |
+
margin-bottom: 15px;
|
| 1154 |
""",
|
| 1155 |
),
|
| 1156 |
|
|
|
|
| 1216 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1217 |
padding: 15px;
|
| 1218 |
border-radius: 12px;
|
| 1219 |
+
margin-bottom: 15px;
|
| 1220 |
""",
|
| 1221 |
),
|
| 1222 |
|
|
|
|
| 1233 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1234 |
padding: 15px;
|
| 1235 |
border-radius: 12px;
|
| 1236 |
+
margin-bottom: 15px;
|
| 1237 |
""",
|
| 1238 |
),
|
| 1239 |
P("""
|
|
|
|
| 1283 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1284 |
padding: 15px;
|
| 1285 |
border-radius: 12px;
|
| 1286 |
+
margin-bottom: 15px;
|
| 1287 |
""",
|
| 1288 |
),
|
| 1289 |
P("""
|
|
|
|
| 1305 |
background-color: #EAFFF1; /* Light green background */
|
| 1306 |
padding: 15px;
|
| 1307 |
border-radius: 12px;
|
| 1308 |
+
margin-bottom: 15px;
|
| 1309 |
""",
|
| 1310 |
),
|
| 1311 |
|
|
|
|
| 1327 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1328 |
padding: 15px;
|
| 1329 |
border-radius: 12px;
|
| 1330 |
+
margin-bottom: 15px;
|
| 1331 |
""",
|
| 1332 |
),
|
| 1333 |
Details(
|
|
|
|
| 1359 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1360 |
padding: 15px;
|
| 1361 |
border-radius: 12px;
|
| 1362 |
+
margin-bottom: 15px;
|
| 1363 |
""",
|
| 1364 |
),
|
| 1365 |
|
|
|
|
| 1375 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1376 |
padding: 15px;
|
| 1377 |
border-radius: 12px;
|
| 1378 |
+
margin-bottom: 15px;
|
| 1379 |
""",
|
| 1380 |
),
|
| 1381 |
Details(
|
|
|
|
| 1391 |
background-color: #EAFFF1; /* Light green background */
|
| 1392 |
padding: 15px;
|
| 1393 |
border-radius: 12px;
|
| 1394 |
+
margin-bottom: 15px;
|
| 1395 |
""",
|
| 1396 |
),
|
| 1397 |
|
|
|
|
| 1407 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1408 |
padding: 15px;
|
| 1409 |
border-radius: 12px;
|
| 1410 |
+
margin-bottom: 15px;
|
| 1411 |
""",
|
| 1412 |
),
|
| 1413 |
Details(
|
|
|
|
| 1437 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1438 |
padding: 15px;
|
| 1439 |
border-radius: 12px;
|
| 1440 |
+
margin-bottom: 15px;
|
| 1441 |
""",
|
| 1442 |
),
|
| 1443 |
Details(
|
|
|
|
| 1454 |
background-color: #FFFAEA; /* Light yellow background */
|
| 1455 |
padding: 15px;
|
| 1456 |
border-radius: 12px;
|
| 1457 |
+
margin-bottom: 15px;
|
| 1458 |
""",
|
| 1459 |
),
|
| 1460 |
P("""
|
|
|
|
| 1486 |
background-color: #EAFFF1; /* Light green background */
|
| 1487 |
padding: 15px;
|
| 1488 |
border-radius: 12px;
|
| 1489 |
+
margin-bottom: 15px;
|
| 1490 |
""",
|
| 1491 |
),
|
| 1492 |
H3("3.4 Others"),
|
|
|
|
| 1499 |
Summary("Sample documents containing 'lorem ipsum'"),
|
| 1500 |
DV("data/lorem_ipsum.json", 0, "Sample documents containing 'lorem ipsum'"),
|
| 1501 |
style="""
|
| 1502 |
+
background-color: #FAEAEA; /* Light pink background */
|
| 1503 |
padding: 15px;
|
| 1504 |
border-radius: 12px;
|
| 1505 |
+
margin-bottom: 15px;
|
| 1506 |
""",
|
| 1507 |
),
|
| 1508 |
H2("4. Deduplication"),
|