File size: 3,549 Bytes
728801d
 
 
 
 
6e5511f
728801d
6e5511f
 
 
 
 
 
 
 
 
728801d
 
 
6e5511f
 
 
 
 
 
 
 
 
 
 
 
 
728801d
6e5511f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
728801d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
<!doctype html>
<html lang="en">
	<head>
		<meta charset="utf-8" />
		<meta name="viewport" content="width=device-width, initial-scale=1" />
		<title>xet-repo-dedupe</title>
		<link rel="stylesheet" href="style.css" />
		<!--
			Vega, Vega-Lite and Vega-Embed are loaded synchronously on purpose:
			the inline script at the end of the body calls vegaEmbed() as soon as
			it is parsed, so these tags must not be marked defer or async unless
			that inline script is changed to wait for them.
		-->
		<script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
		<script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
		<script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
		<style>
			/* Container the chart is embedded into: full width, content centered. */
			#vis {
				width: 100%;
				text-align: center;
			}
		</style>
	</head>
	<body>
		<div class="card">
			<h1>Visualizing Repo-level Dedupe</h1>
			<p>This visualization demonstrates the amount of <a target="_blank" rel="noopener noreferrer" href="https://huggingface.co/blog/from-files-to-chunks">chunk-level dedupe</a> across all public repos.</p>
			<p>"Dedupe factor" is defined as the number of re-uses of a given "xorb". A "xorb" is a collection of content-defined chunks, typically around 1,000 chunks comprising up to 64 MB of total data.</p>
			<!-- The list is a sibling of the paragraph: a ul element may not be nested inside a p element. -->
			<p>Interactions:</p>
			<ul>
				<li>
					<!-- The color named here must match the highlight color used by the chart spec. -->
					Hover to select a xorb, and highlight the same xorb in all other repos in <strong><span style="color: orange">orange</span></strong>.
				</li>
				<li>
					Click to select a row (repo), and fade out all repos that don't contain any overlapping data. Double-click to clear selection.
				</li>
			</ul>
		</div>
    <!-- Mount point for the chart; filled in by vegaEmbed() below. -->
    <div id="vis"></div>
    <script>
      // Vega-Lite spec: one thin row of rects per repo, one rect per xorb,
      // colored by dedupe factor. Hovering highlights a xorb everywhere;
      // clicking a repo fades every xorb that the clicked repo does not contain.
      var vlSpec = {
        "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
        // Each row (repo) gets its own x scale instead of sharing one.
        "resolve": {"scale": {"x": "independent"}},
        // Size of a single row's view, not of the whole chart.
        "width": 600,
        "height": 12,
        "params": [
          {
            // Hover selection keyed on xorb_id.
            "name": "highlight",
            "select": {"type": "point", "fields": ["xorb_id"], "on": "pointerover"}
          },
          {
            // Repo selection. Toggling is disabled so that a new selection
            // replaces the previous one instead of adding to it.
            "name": "select",
            "select": {"type": "point", "fields": ["repo"], "toggle": false}
          },
          {
            // Values of the derived "repo_xorb_selected" field for every record
            // (null for records outside the selected repo).
            // NOTE(review): 'source_0' is presumably the name the Vega-Lite
            // compiler gives the main dataset; confirm if the data/transform
            // pipeline of this spec is changed.
            "name": "xorbs_selected",
            "expr": "pluck(data('source_0'), 'repo_xorb_selected')"
          },
          // True when xorbs_selected holds at least one non-null value
          // (extent() skips null values when computing [min, max]).
          {"name": "any_xorbs_selected", "expr": "extent(xorbs_selected)[0] != null"}
        ],
        "transform": [
          {
            // 1-based position of this record's repo in the current repo
            // selection, or 0 when there is no selection or no match.
            "calculate": "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
            "as": "repo_selected"
          },
          {
            // The record's xorb_id when its repo is selected, otherwise null.
            "calculate": "if(datum.repo_selected > 0, datum.xorb_id, null)",
            "as": "repo_xorb_selected"
          }
        ],
        "data": {
          "url": "xorbs.json"
        },
        "mark": "rect",
        "encoding": {
          "x": {
            "field": "xorb_id",
            "axis": null,
            "stack": "normalize"
          },
          "color": {
            // The hovered xorb is drawn in orange wherever it appears;
            // everything else is colored by dedupe_factor on a fixed 0-10 domain.
            "condition": [
              {"test": "datum.xorb_id == highlight.xorb_id", "value": "orange"}
            ],
            "field": "dedupe_factor",
            "type": "quantitative",
            "scale": {"domain": [0, 10]}
          },
          "opacity": {
            // Once a repo is selected, fade xorbs that the selected repo
            // does not contain.
            "condition": [
              {
                "test": "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
                "value": 0.2
              }
            ]
          },
          "tooltip": {"field": "dedupe_factor"},
          "row": {
            // One facet row per repo, ordered by dedupe_factor, highest first.
            "field": "repo",
            "spacing": 1,
            "header": {"labelAngle": 0, "labelAlign": "left"},
            "sort": {"field": "dedupe_factor", "order": "descending"}
          }
        }
      };
      // vegaEmbed() returns a promise; report failures (for example when
      // xorbs.json cannot be fetched) instead of leaving an unhandled rejection
      // and a silently empty page.
      vegaEmbed('#vis', vlSpec).catch(function (err) {
        console.error(err);
        document.getElementById('vis').textContent = 'Failed to load the visualization.';
      });
    </script>
	</body>
</html>