Spaces:
Running
Running
File size: 5,293 Bytes
3f63dc8 688cbf7 3f63dc8 6e5511f 3f63dc8 dbd2f74 3f63dc8 3b0ec08 3f63dc8 688cbf7 3f63dc8 3b0ec08 1288276 688cbf7 1288276 3b0ec08 3f63dc8 688cbf7 3f63dc8 6e5511f dbd2f74 6e5511f dbd2f74 3f63dc8 dbd2f74 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 688cbf7 6e5511f 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 72714de 3f63dc8 6e5511f 3f63dc8 6e5511f 3f63dc8 728801d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>XLM-R Dedupe</title>
<link rel="stylesheet" href="style.css" />
<script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
<script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
<script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
</head>
<body>
<div class="container">
<div class="header">
<h1>Visualizing XLM-RoBERTa Finetune Dedupe</h1>
<p>
This heatmap shows deduplication across a family of fine-tuned models based on <a href="https://huggingface.co/papers/1911.02116" target="_blank">XLM-RoBERTa large</a>, a multilingual transformer introduced in 2019 and trained on 100 languages. Each row represents a model repository (which often contains multiple formats; e.g., Safetensors, Keras, PyTorch) derived from the original research. Repository data is chunked into blocks of up to 64MB in Xet's storage layer, and this heatmap visualizes those blocks across models.
</p>
<p>
The base model is <a href="https://huggingface.co/FacebookAI/xlm-roberta-large" target="_blank"><code>xlm-roberta-large</code></a>, while the others are fine-tuned for specific languages on the CoNLL NER datasets (Dutch, Spanish, English, German). Darker blue regions highlight content shared across models—the more overlap, the more efficient storage and transfer become. This level of deduplication leads to faster uploads, quicker iterations, and less friction when scaling experimentation.
</p>
<p>
XLM-RoBERTa large currently has <a href="https://huggingface.co/models?other=base_model:finetune:FacebookAI/xlm-roberta-large" target="_blank">396 fine-tunes on the Hub</a>. The fine-tunes from the original CoNLL research deduplicate at ~17%, representing substantial time savings for builders repeatedly pushing new checkpoints and variants.
</p>
To explore the visualization:
<ul>
<li>
<strong>Hover</strong> over a block in a repository to
highlight it and see where else it appears in other repos.
</li>
<li>
<strong>Click</strong> any block to see all other repositories
that share blocks with that repo.
</li>
<li>
<strong>Double-click</strong> anywhere on any repo to reset and
continue exploring.
</li>
</ul>
</div>
<div class="heatmap-container">
<div id="vis"></div>
</div>
</div>
<script>
// Vega-Lite specification for the dedupe heatmap rendered into #vis.
// Records are loaded from xorbs.json; each record becomes one rect, faceted
// into rows by `repo`, positioned by `xorb_id`, and colored by `dedupe_factor`.
const vlSpec = {
  $schema: "https://vega.github.io/schema/vega-lite/v5.json",
  // Every row facet resolves its own x scale instead of sharing one.
  resolve: { scale: { x: "independent" } },
  width: 800,
  height: 25,
  params: [
    {
      // Hover selection keyed on xorb_id; drives the orange highlight in the
      // color encoding below.
      name: "highlight",
      select: { type: "point", fields: ["xorb_id"], on: "pointerover" },
    },
    {
      // Click selection keyed on repo. Toggling is switched off with a real
      // boolean (the spec previously passed the string expression "false").
      name: "select",
      select: { type: "point", fields: ["repo"], toggle: false },
    },
    {
      // One entry per record: the record's repo_xorb_selected value (see the
      // transforms below). NOTE(review): 'source_0' is presumably the name
      // Vega-Lite gives the compiled data source — confirm if `data` changes.
      name: "xorbs_selected",
      expr: "pluck(data('source_0'), 'repo_xorb_selected')",
    },
    {
      // Flag used by the opacity rule; compares the lower bound of
      // xorbs_selected's extent against null.
      name: "any_xorbs_selected",
      expr: "extent(xorbs_selected)[0] != null",
    },
  ],
  transform: [
    {
      // 1-based position of this record's repo inside select.repo, or 0 when
      // select.repo is null or the repo is not found (indexof gives -1).
      calculate:
        "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
      as: "repo_selected",
    },
    {
      // Carries the xorb_id only for records whose repo is selected.
      calculate: "if(datum.repo_selected > 0, datum.xorb_id, null)",
      as: "repo_xorb_selected",
    },
    {
      // Overwrites repo with its last '/'-separated segment.
      calculate:
        "split(datum.repo, '/')[length(split(datum.repo, '/')) - 1]",
      as: "repo",
    },
  ],
  data: {
    url: "xorbs.json",
  },
  mark: "rect",
  encoding: {
    x: {
      field: "xorb_id",
      axis: null,
      sort: { field: "dedupe_factor", order: "descending" },
      stack: "normalize",
    },
    color: {
      // Loose `==` is kept exactly as written. NOTE(review): confirm the
      // runtime shape of highlight.xorb_id before tightening this comparison.
      condition: [
        { test: "datum.xorb_id == highlight.xorb_id", value: "orange" },
      ],
      field: "dedupe_factor",
      type: "quantitative",
      scale: { scheme: "blues", domain: [0, 5] },
    },
    opacity: {
      // Dim rects whose xorb_id is not in xorbs_selected, but only while
      // any_xorbs_selected is true.
      condition: [
        {
          test: "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
          value: 0.2,
        },
      ],
    },
    tooltip: [
      { field: "repo", type: "nominal", title: "File" },
      { field: "xorb_id", type: "nominal", title: "Block Hash" },
      {
        field: "dedupe_factor",
        type: "quantitative",
        title: "Dedupe Factor",
      },
    ],
    row: {
      field: "repo",
      title: "",
      spacing: 1,
      header: { labelAngle: 0, labelAlign: "left", labelFontSize: 14 },
      sort: { field: "repo", order: "ascending" },
    },
  },
};

// vegaEmbed returns a promise; report failures instead of leaving the
// rejection unhandled so a broken chart is visible in the console.
vegaEmbed("#vis", vlSpec).catch((error) => {
  console.error("Failed to render the dedupe heatmap:", error);
});
</script>
</body>
</html>
|