Commit 4a437aa · fix bf example
Parent(s): 9f2a4f7

common.py CHANGED
@@ -1,6 +1,13 @@
 from fasthtml.common import *
 from fasthtml.components import *
-from fasthtml.components import
+from fasthtml.components import (
+    D_title,
+    D_article,
+    D_front_matter,
+    D_contents,
+    D_byline,
+    D_cite,
+)
 from fh_plotly import plotly2fasthtml
 import pandas as pd
 import json
@@ -46,48 +53,56 @@ def dup_cluster_graph():
     return fig


-[42 removed lines: the previous version of the Bloom filter example, garbled in this capture and not recoverable]
+def dedup_pairs_bands():
+    return pd.DataFrame(
+        {
+            "Bloom Filter": [
+                "BF 0",
+                "",
+                "",
+                "",
+                "BF 1",
+                "",
+                "BF 8",
+            ],
+            "Band 0": [
+                "(A,B)",
+                "(C,D)",
+                "(E,K)",
+                "(B,K)",
+                "...",
+                "...",
+                "...",
+            ],
+            "Band 1": [
+                "(A,B)",
+                "(C,D)",
+                "(F,K)",
+                "(B,K)",
+                "...",
+                "...",
+                "...",
+            ],
+            "....": [
+                "...",
+                "...",
+                "...",
+                "...",
+                "...",
+                "...",
+                "...",
+            ],
+            "Band 8": [
+                "(A,B)",
+                "(C,D)",
+                "(D,E)",
+                "(E,K)",
+                "(B,K)",
+                "...",
+                "...",
+            ],
+        }
+    ).to_html(index=False, border=0)


 def dup_docs_count_graph():
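Note on the helper added above: dedup_pairs_bands() renders the band/pair illustration to raw HTML with pandas, and a later hunk embeds it with NotStr so FastHTML does not escape the markup. A minimal sketch of that pattern, with a toy DataFrame that is illustrative rather than taken from the commit:

from fasthtml.common import Div, NotStr
import pandas as pd

# Render a small table to raw HTML, as dedup_pairs_bands() does.
html = pd.DataFrame({"Band 0": ["(A,B)", "(C,D)"]}).to_html(index=False, border=0)

Div(html)          # renders the <table> markup escaped, as literal text
Div(NotStr(html))  # NotStr marks the string as already-safe HTML, so the table displays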
@@ -298,7 +313,9 @@ global_div = Div(
         "To illustrate the need for deduplication, below is the distribution of near-duplicate clusters, organized into buckets of 100. The first bucket contains clusters with sizes ranging from 2 to 100, as found in the Common Crawl dataset. Some clusters even reach up to a million documents."
     ),
     plotly2fasthtml(dup_cluster_graph()),
-    P(
+    P(
+        "The example below is from one such cluster. Here most of the text is repeated with just specifics changed."
+    ),
     Img(src="images/100k.png", style="max-width: 100%;"),
     P(
         "We started deduplication with 61.8 TB of filtered and compressed documents. The initial dataset had roughly 48.83 billion documents. First, we performed exact deduplication using a Bloom filter with a capacity of 1 billion and a false positive rate of 0.001. This reduced the documents from 48.83 billion to 40.21 billion, removing about 17% as exact duplicates. This step used constant memory for the Bloom filter and lessened the workload for subsequent near-deduplication."
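The paragraph in the hunk above describes the exact-deduplication pass: a Bloom filter with a capacity of 1 billion and a false positive rate of 0.001, applied to document hashes. A from-scratch sketch of the idea follows; it uses the standard sizing formulas and is a minimal illustration, not the pipeline's actual implementation, so the class and demo parameters below are assumptions.

import hashlib
import math

class BloomFilter:
    # Minimal, illustrative Bloom filter (the production one will differ).
    def __init__(self, capacity, error_rate):
        # Standard sizing: m bits and k hash functions for the target rate.
        self.m = math.ceil(-capacity * math.log(error_rate) / math.log(2) ** 2)
        self.k = max(1, round(self.m / capacity * math.log(2)))
        self.bits = bytearray((self.m + 7) // 8)

    def _positions(self, item):
        # Derive k bit positions from salted SHA-256 digests.
        for i in range(self.k):
            h = hashlib.sha256(f"{i}:{item}".encode()).digest()
            yield int.from_bytes(h[:8], "big") % self.m

    def add(self, item):
        # Set the item's bits; report True if all bits were already set,
        # i.e. the item was (probably) seen before.
        seen = True
        for pos in self._positions(item):
            byte, bit = divmod(pos, 8)
            if not (self.bits[byte] >> bit) & 1:
                seen = False
                self.bits[byte] |= 1 << bit
        return seen

# Toy run; the pass described above used capacity=1_000_000_000, error_rate=0.001.
bf = BloomFilter(capacity=1000, error_rate=0.001)
docs = ["same text", "other text", "same text"]
unique = [d for d in docs if not bf.add(hashlib.sha256(d.encode()).hexdigest())]
# unique == ["same text", "other text"]: the exact duplicate is dropped.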
@@ -344,7 +361,7 @@ global_div = Div(
     P(
         "There is a high chance that duplicates from different bands will have the same pairs in the same horizontal partition. Performing the Bloom filter step reduces the number of pairs by nearly ninefold."
     ),
-
+    Div(NotStr(dedup_pairs_bands()), style="margin: 40px;"),
     P(
         "The resulting unique pairs are then used to identify clusters of near-duplicates by finding connected components in a graph, where the vertices represent documents and the edges represent matches."
     ),
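On the ninefold reduction mentioned in that hunk: the new table shows the same candidate pair, such as (A,B), surfacing in several bands, and the Bloom filter pass keeps only the first sighting of each pair. A small sketch of that step; the pair data is illustrative, and a plain set stands in for the Bloom filter, which the pipeline uses to keep memory constant:

# Candidate pairs emitted per band; the same pair often appears in many bands.
bands = {
    "Band 0": [("A", "B"), ("C", "D"), ("E", "K"), ("B", "K")],
    "Band 1": [("A", "B"), ("C", "D"), ("F", "K"), ("B", "K")],
    "Band 8": [("A", "B"), ("C", "D"), ("D", "E"), ("E", "K"), ("B", "K")],
}

seen = set()  # stand-in for the Bloom filter of the previous step
unique_pairs = []
for band_pairs in bands.values():
    for a, b in band_pairs:
        key = (min(a, b), max(a, b))  # normalize order so (B,A) matches (A,B)
        if key not in seen:
            seen.add(key)
            unique_pairs.append(key)

# The 13 emitted pairs collapse to 6 unique ones; at Common Crawl scale this
# same effect is the near-ninefold reduction described in the text.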
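The final paragraph of the diff turns the unique pairs into clusters by finding connected components, with documents as vertices and matching pairs as edges. A minimal union-find sketch of that step; the pair data is again illustrative:

from collections import defaultdict

def find(parent, x):
    # Walk to the root of x's component, compressing the path as we go.
    while parent.setdefault(x, x) != x:
        parent[x] = parent[parent[x]]
        x = parent[x]
    return x

def connected_components(pairs):
    parent = {}
    for a, b in pairs:  # union: merge the two documents' components
        parent[find(parent, a)] = find(parent, b)
    clusters = defaultdict(set)
    for doc in list(parent):  # group every document under its root
        clusters[find(parent, doc)].add(doc)
    return list(clusters.values())

# Each unique pair from the Bloom filter step is an edge between documents.
pairs = [("A", "B"), ("B", "K"), ("C", "D"), ("E", "F")]
print(connected_components(pairs))
# Three clusters: {'A', 'B', 'K'}, {'C', 'D'}, {'E', 'F'}; each set is one
# near-duplicate cluster.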