|
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>xet-repo-dedupe</title>
  <link rel="stylesheet" href="style.css">
  <!--
    Vega runtime, Vega-Lite compiler and the vega-embed helper, in dependency order.
    These are intentionally loaded without defer/async: the inline script at the end
    of the body calls vegaEmbed() as soon as it is parsed, so the libraries must
    already be defined by then.
  -->
  <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
  <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
  <style>
    /* Let the chart container span the page and center the embedded chart. */
    #vis {
      width: 100%;
      text-align: center;
    }
  </style>
</head>
|
<body> |
|
<div class="card">
  <h1>Visualizing Repo-level Dedupe</h1>
  <p>This visualization demonstrates the amount of <a target="_blank" rel="noopener noreferrer" href="https://huggingface.co/blog/from-files-to-chunks">chunk-level dedupe</a> across all public repos.</p>
  <p>"Dedupe factor" is defined as the number of re-uses of a given "xorb". A "xorb" is a collection of content-defined chunks, typically around 1,000 chunks comprising up to 64 MB of total data.</p>
  <!--
    The list is a sibling of its lead-in paragraph: a <ul> cannot be a child of <p>
    (the parser closes the paragraph early and a stray end tag yields an empty one).
    The highlight color named below matches the "orange" value used by the chart spec.
  -->
  <p>Interactions:</p>
  <ul>
    <li>Hover to select a xorb, and highlight the same xorb in all other repos in <strong><span style="color: orange">orange</span></strong>.</li>
    <li>Click to select a row (repo), and fade out all repos that don't contain any overlapping data. Double-click to clear selection.</li>
  </ul>
</div>
|
<div id="vis"></div>
<script>
  // Vega-Lite spec: one thin row per repo (faceted by `repo`), one rect per xorb,
  // colored by `dedupe_factor`. Rendered into #vis by vega-embed below.
  var vlSpec = {
    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
    // Each repo row gets its own x scale instead of sharing one across rows.
    "resolve": {"scale": {"x": "independent"}},
    "width": 600,
    "height": 12,
    "params": [
      {
        // Hover selection keyed on xorb_id; drives the highlight color condition.
        "name": "highlight",
        "select": {"type": "point", "fields": ["xorb_id"], "on": "pointerover"}
      },
      {
        // Click selection keyed on repo. Toggling is disabled with a real boolean
        // (the original used the string "false", which is treated as an expression).
        "name": "select",
        "select": {"type": "point", "fields": ["repo"], "toggle": false}
      },
      {
        // The `repo_xorb_selected` value of every row in the dataset.
        // NOTE(review): 'source_0' is presumably the dataset name Vega-Lite generates
        // for the compiled data source; confirm it stays stable if the spec changes.
        "name": "xorbs_selected",
        "expr": "pluck(data('source_0'), 'repo_xorb_selected')"
      },
      // True when the minimum of xorbs_selected is non-null, i.e. some row has a
      // non-null repo_xorb_selected.
      {"name": "any_xorbs_selected", "expr": "extent(xorbs_selected)[0] != null"}
    ],
    "transform": [
      {
        // 1-based position of this row's repo within the selected repos;
        // 0 when nothing is selected or the repo is not in the selection.
        "calculate": "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
        "as": "repo_selected"
      },
      {
        // The row's xorb_id when its repo is selected, otherwise null.
        "calculate": "if(datum.repo_selected > 0, datum.xorb_id, null)",
        "as": "repo_xorb_selected"
      }
    ],
    "data": {
      "url": "xorbs.json"
    },
    "mark": "rect",
    "encoding": {
      "x": {
        "field": "xorb_id",
        "axis": null,
        "stack": "normalize"
      },
      "color": {
        "condition": [
          // NOTE(review): loose `==` against the selection value; this looks like it
          // relies on type coercion of highlight.xorb_id, so keep it as `==`.
          {"test": "datum.xorb_id == highlight.xorb_id", "value": "orange"}
        ],
        "field": "dedupe_factor",
        "type": "quantitative",
        "scale": {"domain": [0, 10]}
      },
      "opacity": {
        "condition": [
          {
            // Fade xorbs that do not occur in the selected repo(s).
            "test": "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
            "value": 0.2
          }
        ]
      },
      "tooltip": {"field": "dedupe_factor"},
      "row": {
        "field": "repo",
        "spacing": 1,
        "header": {"labelAngle": 0, "labelAlign": "left"},
        "sort": {"field": "dedupe_factor", "order": "descending"}
      }
    }
  };

  // vegaEmbed returns a promise; without a rejection handler a failed data load or
  // spec compile error leaves an empty page with no explanation.
  vegaEmbed('#vis', vlSpec).catch(function (err) {
    console.error(err);
    document.getElementById('vis').textContent = 'Failed to load the visualization.';
  });
</script>
|
</body> |
|
</html> |
|
|