Commit 4a437aa · fix bf example
Parent(s): 9f2a4f7

common.py CHANGED
@@ -1,6 +1,13 @@
 from fasthtml.common import *
 from fasthtml.components import *
-from fasthtml.components import
+from fasthtml.components import (
+    D_title,
+    D_article,
+    D_front_matter,
+    D_contents,
+    D_byline,
+    D_cite,
+)
 from fh_plotly import plotly2fasthtml
 import pandas as pd
 import json
@@ -46,48 +53,56 @@ def dup_cluster_graph():
     return fig


-[42 removed lines: the previous version of the Bloom filter example, garbled in this capture and not recoverable]
+def dedup_pairs_bands():
+    return pd.DataFrame(
+        {
+            "Bloom Filter": [
+                "BF 0",
+                "",
+                "",
+                "",
+                "BF 1",
+                "",
+                "BF 8",
+            ],
+            "Band 0": [
+                "(A,B)",
+                "(C,D)",
+                "(E,K)",
+                "(B,K)",
+                "...",
+                "...",
+                "...",
+            ],
+            "Band 1": [
+                "(A,B)",
+                "(C,D)",
+                "(F,K)",
+                "(B,K)",
+                "...",
+                "...",
+                "...",
+            ],
+            "....": [
+                "...",
+                "...",
+                "...",
+                "...",
+                "...",
+                "...",
+                "...",
+            ],
+            "Band 8": [
+                "(A,B)",
+                "(C,D)",
+                "(D,E)",
+                "(E,K)",
+                "(B,K)",
+                "...",
+                "...",
+            ],
+        }
+    ).to_html(index=False, border=0)


 def dup_docs_count_graph():
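Note on the helper added above: dedup_pairs_bands() renders the band/pair illustration to raw HTML with pandas, and a later hunk embeds it with NotStr so FastHTML does not escape the markup. A minimal sketch of that pattern, with a toy DataFrame that is illustrative rather than taken from the commit:

from fasthtml.common import Div, NotStr
import pandas as pd

# Render a small table to raw HTML, as dedup_pairs_bands() does.
html = pd.DataFrame({"Band 0": ["(A,B)", "(C,D)"]}).to_html(index=False, border=0)

Div(html)          # renders the <table> markup escaped, as literal text
Div(NotStr(html))  # NotStr marks the string as already-safe HTML, so the table displays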
@@ -298,7 +313,9 @@ global_div = Div(
         "To illustrate the need for deduplication, below is the distribution of near-duplicate clusters, organized into buckets of 100. The first bucket contains clusters with sizes ranging from 2 to 100, as found in the Common Crawl dataset. Some clusters even reach up to a million documents."
     ),
     plotly2fasthtml(dup_cluster_graph()),
-    P(
+    P(
+        "The example below is from one such cluster. Here most of the text is repeated with just specifics changed."
+    ),
     Img(src="images/100k.png", style="max-width: 100%;"),
     P(
         "We started deduplication with 61.8 TB of filtered and compressed documents. The initial dataset had roughly 48.83 billion documents. First, we performed exact deduplication using a Bloom filter with a capacity of 1 billion and a false positive rate of 0.001. This reduced the documents from 48.83 billion to 40.21 billion, removing about 17% as exact duplicates. This step used constant memory for the Bloom filter and lessened the workload for subsequent near-deduplication."
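The paragraph in the hunk above describes the exact-deduplication pass: a Bloom filter with a capacity of 1 billion and a false positive rate of 0.001, applied to document hashes. A from-scratch sketch of the idea follows; it uses the standard sizing formulas and is a minimal illustration, not the pipeline's actual implementation, so the class and demo parameters below are assumptions.

import hashlib
import math

class BloomFilter:
    # Minimal, illustrative Bloom filter (the production one will differ).
    def __init__(self, capacity, error_rate):
        # Standard sizing: m bits and k hash functions for the target rate.
        self.m = math.ceil(-capacity * math.log(error_rate) / math.log(2) ** 2)
        self.k = max(1, round(self.m / capacity * math.log(2)))
        self.bits = bytearray((self.m + 7) // 8)

    def _positions(self, item):
        # Derive k bit positions from salted SHA-256 digests.
        for i in range(self.k):
            h = hashlib.sha256(f"{i}:{item}".encode()).digest()
            yield int.from_bytes(h[:8], "big") % self.m

    def add(self, item):
        # Set the item's bits; report True if all bits were already set,
        # i.e. the item was (probably) seen before.
        seen = True
        for pos in self._positions(item):
            byte, bit = divmod(pos, 8)
            if not (self.bits[byte] >> bit) & 1:
                seen = False
                self.bits[byte] |= 1 << bit
        return seen

# Toy run; the pass described above used capacity=1_000_000_000, error_rate=0.001.
bf = BloomFilter(capacity=1000, error_rate=0.001)
docs = ["same text", "other text", "same text"]
unique = [d for d in docs if not bf.add(hashlib.sha256(d.encode()).hexdigest())]
# unique == ["same text", "other text"]: the exact duplicate is dropped.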
@@ -344,7 +361,7 @@ global_div = Div(
     P(
         "There is a high chance that duplicates from different bands will have the same pairs in the same horizontal partition. Performing the Bloom filter step reduces the number of pairs by nearly ninefold."
     ),
-
+    Div(NotStr(dedup_pairs_bands()), style="margin: 40px;"),
     P(
         "The resulting unique pairs are then used to identify clusters of near-duplicates by finding connected components in a graph, where the vertices represent documents and the edges represent matches."
     ),
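On the ninefold reduction mentioned in that hunk: the new table shows the same candidate pair, such as (A,B), surfacing in several bands, and the Bloom filter pass keeps only the first sighting of each pair. A small sketch of that step; the pair data is illustrative, and a plain set stands in for the Bloom filter, which the pipeline uses to keep memory constant:

# Candidate pairs emitted per band; the same pair often appears in many bands.
bands = {
    "Band 0": [("A", "B"), ("C", "D"), ("E", "K"), ("B", "K")],
    "Band 1": [("A", "B"), ("C", "D"), ("F", "K"), ("B", "K")],
    "Band 8": [("A", "B"), ("C", "D"), ("D", "E"), ("E", "K"), ("B", "K")],
}

seen = set()  # stand-in for the Bloom filter of the previous step
unique_pairs = []
for band_pairs in bands.values():
    for a, b in band_pairs:
        key = (min(a, b), max(a, b))  # normalize order so (B,A) matches (A,B)
        if key not in seen:
            seen.add(key)
            unique_pairs.append(key)

# The 13 emitted pairs collapse to 6 unique ones; at Common Crawl scale this
# same effect is the near-ninefold reduction described in the text.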
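The final paragraph of the diff turns the unique pairs into clusters by finding connected components, with documents as vertices and matching pairs as edges. A minimal union-find sketch of that step; the pair data is again illustrative:

from collections import defaultdict

def find(parent, x):
    # Walk to the root of x's component, compressing the path as we go.
    while parent.setdefault(x, x) != x:
        parent[x] = parent[parent[x]]
        x = parent[x]
    return x

def connected_components(pairs):
    parent = {}
    for a, b in pairs:  # union: merge the two documents' components
        parent[find(parent, a)] = find(parent, b)
    clusters = defaultdict(set)
    for doc in list(parent):  # group every document under its root
        clusters[find(parent, doc)].add(doc)
    return list(clusters.values())

# Each unique pair from the Bloom filter step is an edge between documents.
pairs = [("A", "B"), ("B", "K"), ("C", "D"), ("E", "F")]
print(connected_components(pairs))
# Three clusters: {'A', 'B', 'K'}, {'C', 'D'}, {'E', 'F'}; each set is one
# near-duplicate cluster.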