<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Repo-Level Dedupe Visualization</title>
    <link rel="stylesheet" href="style.css" />
    <!--
      Vega, Vega-Lite and vega-embed from the jsDelivr CDN, pinned to major
      versions only. They are loaded synchronously (no defer/async) because
      the inline script at the end of the body calls vegaEmbed() as soon as
      it is parsed.
    -->
    <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
  </head>
|
<body>
|
<div class="container">
  <!-- Introductory copy: what the heatmap shows and how to interact with it. -->
  <div class="header">
    <h1>Visualizing Repo-Level Dedupe</h1>
    <p>
      This visualization demonstrates block-level deduplication across all
      models in
      <a href="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF" target="_blank" rel="noopener noreferrer">bartowski/gemma-2-9b-it-GGUF</a>.
    </p>
    <p>
      Each row represents a file in the repository grouped into blocks of up
      to 64MB. The color of each block represents the deduplication ratio
      for the block, which is a function of how often the chunks in the
      block are shared between files. The darker the color, the more
      frequently content is shared, the better the overall upload and
      download times for a given file! The deduplication savings here take a
      191GB repo and cut it down to 97GB, helping to shave a few hours off
      the upload time.
    </p>
    <p>
      You can read more about chunks, blocks, and the nitty gritty details
      of how we make this all work in our accompanying
      <a href="https://huggingface.co/blog/from-chunks-to-blocks" target="_blank" rel="noopener noreferrer">blog post</a>.
    </p>
    <p>To explore the visualization:</p>
    <ul>
      <li>
        <strong>Hover</strong> over a block in an individual file to
        highlight it and see where else it appears in the repository.
      </li>
      <li>
        <strong>Click</strong> any block in a file to see all other files
        that share blocks.
      </li>
      <li>
        <strong>Double-click</strong> anywhere on any file to reset and
        continue exploring.
      </li>
    </ul>
  </div>

  <!-- Mount point for the chart: the inline script embeds into #vis. -->
  <div class="heatmap-container">
    <div id="vis"></div>
  </div>
</div>
|
<script>
  // Vega-Lite spec for the dedupe heatmap: one facet row per file ("repo"),
  // one rect per block ("xorb_id"), colored by "dedupe_factor".
  var vlSpec = {
    $schema: "https://vega.github.io/schema/vega-lite/v5.json",
    // Each facet row gets its own x scale.
    resolve: { scale: { x: "independent" } },
    width: 800,
    height: 25,
    params: [
      {
        // Hover selection keyed on the block id; drives the orange highlight.
        name: "highlight",
        select: { type: "point", fields: ["xorb_id"], on: "pointerover" },
      },
      {
        // Selection keyed on the file; no "on" is given, so Vega-Lite's
        // default selection events apply.
        // NOTE(review): toggle is the *string* "false" (treated as an
        // expression), not the boolean. Left untouched on purpose; confirm
        // the compiled selection behaves the same before changing it.
        name: "select",
        select: { type: "point", fields: ["repo"], toggle: "false" },
      },
      {
        // Array of the "repo_xorb_selected" value of every row in the
        // dataset named 'source_0'.
        // NOTE(review): 'source_0' is presumably the name Vega-Lite gives the
        // compiled data source; verify if the data/transform layout changes.
        name: "xorbs_selected",
        expr: "pluck(data('source_0'), 'repo_xorb_selected')",
      },
      {
        // Flag read by the opacity rule: compares the first element of
        // extent(xorbs_selected) against null.
        name: "any_xorbs_selected",
        expr: "extent(xorbs_selected)[0] != null",
      },
    ],
    transform: [
      {
        // 1-based position of this row's file in the current "select"
        // selection; 0 when there is no selection or the file is not in it.
        calculate:
          "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
        as: "repo_selected",
      },
      {
        // Block id if the row's file is selected, otherwise null.
        calculate: "if(datum.repo_selected > 0, datum.xorb_id, null)",
        as: "repo_xorb_selected",
      },
      {
        // Overwrite "repo" with its last '/'-separated segment. This runs
        // after the two calculations above, so keep the order as is.
        calculate:
          "split(datum.repo, '/')[length(split(datum.repo, '/')) - 1]",
        as: "repo",
      },
    ],
    data: {
      url: "xorbs.json",
    },
    mark: "rect",
    encoding: {
      x: {
        field: "xorb_id",
        axis: null,
        // Highest dedupe factor first.
        sort: { field: "dedupe_factor", order: "descending" },
        stack: "normalize",
      },
      color: {
        // The hovered block is painted orange; everything else uses the
        // "blues" scheme over a fixed [0, 10] domain.
        condition: [
          { test: "datum.xorb_id == highlight.xorb_id", value: "orange" },
        ],
        field: "dedupe_factor",
        type: "quantitative",
        scale: { scheme: "blues", domain: [0, 10] },
      },
      opacity: {
        // While something is selected, fade blocks whose id is not among
        // the selected files' block ids.
        condition: [
          {
            test: "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
            value: 0.2,
          },
        ],
      },
      tooltip: [
        { field: "repo", type: "nominal", title: "File" },
        { field: "xorb_id", type: "nominal", title: "Block Hash" },
        {
          field: "dedupe_factor",
          type: "quantitative",
          title: "Dedupe Factor",
        },
      ],
      row: {
        // One facet row per file, sorted by name.
        field: "repo",
        title: "",
        spacing: 1,
        header: { labelAngle: 0, labelAlign: "left", labelFontSize: 14 },
        sort: { field: "repo", order: "ascending" },
      },
    },
  };

  // vegaEmbed returns a promise; log load/compile failures (e.g. xorbs.json
  // missing) instead of leaving the rejection unhandled.
  vegaEmbed("#vis", vlSpec).catch(console.error);
</script>
|
</body>

</html>