jsulz's picture
jsulz HF Staff
updating viz, space layout, and data
3f63dc8
raw
history blame
4.61 kB
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Repo-Level Dedupe Visualization</title>
<link rel="stylesheet" href="style.css" />
<script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
<script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
<script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
</head>
<body>
<div class="container">
<div class="header">
<h1>Visualizing Repo-Level Dedupe</h1>
<p>
This visualization demonstrates block-level deduplication across all
models in
<a href="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF"
>bartowski/gemma-2-9b-it-GGUF</a
>.
</p>
<p>
Each row represents a file in the repository, grouped into blocks of up
to 64 MB. The color of each block represents the dedupe factor for the
block, which is a function of how many chunks in the block are present
across all files in the repository. The darker the color, the more of the
block's content is shared. You can read more about chunks, blocks, and how
the dedupe factor is calculated
<a href="#">in this blog post</a>.
</p>
<p>To explore the visualization:</p>
<ul>
<li>
<strong>Hover</strong> over a block in an individual file to
highlight it and see where else it appears in the repository.
</li>
<li>
<strong>Click</strong> any block in a file to see all other files
that share blocks with it.
</li>
<li>
<strong>Double-click</strong> anywhere on a file to reset and
continue exploring.
</li>
</ul>
</div>
<div class="heatmap-container">
<div id="vis"></div>
</div>
</div>
<script>
// Vega-Lite spec for the dedupe heatmap: one facet row per file ("repo"),
// one rect per block ("xorb_id"), colored by the block's dedupe factor.
var vlSpec = {
  $schema: "https://vega.github.io/schema/vega-lite/v5.json",
  // Each facet row gets its own x scale instead of one shared across rows.
  resolve: { scale: { x: "independent" } },
  width: 800,
  height: 25,
  params: [
    {
      // Hover selection keyed on the block id; drives the orange highlight
      // in the color encoding below.
      name: "highlight",
      select: { type: "point", fields: ["xorb_id"], on: "pointerover" },
    },
    {
      // Click selection keyed on the file. `toggle` is the boolean false
      // (not the string "false", which is only accepted because strings are
      // evaluated as expressions): toggling is disabled, so the selection
      // only ever holds the most recent interaction.
      name: "select",
      select: { type: "point", fields: ["repo"], toggle: false },
    },
    {
      // Every row's `repo_xorb_selected` value (block id or null).
      // NOTE(review): 'source_0' is presumably the dataset name Vega-Lite
      // generates for this spec -- confirm if the data pipeline changes.
      name: "xorbs_selected",
      expr: "pluck(data('source_0'), 'repo_xorb_selected')",
    },
    {
      // True once `xorbs_selected` contains at least one usable value,
      // i.e. the minimum of its extent is not null/undefined.
      name: "any_xorbs_selected",
      expr: "extent(xorbs_selected)[0] != null",
    },
  ],
  transform: [
    {
      // 1-based position of this row's repo within the `select` selection;
      // 0 when nothing is selected or this repo is not in it.
      calculate:
        "(select.repo != null ? indexof(select.repo, datum.repo) : -1) + 1",
      as: "repo_selected",
    },
    {
      // Block id when the row belongs to a selected file, otherwise null.
      calculate: "if(datum.repo_selected > 0, datum.xorb_id, null)",
      as: "repo_xorb_selected",
    },
    {
      // Overwrite `repo` with its last '/'-separated segment. This runs
      // after the two calculations above, so those read the original value.
      calculate:
        "split(datum.repo, '/')[length(split(datum.repo, '/')) - 1]",
      as: "repo",
    },
  ],
  data: {
    url: "xorbs.json",
  },
  mark: "rect",
  encoding: {
    x: {
      field: "xorb_id",
      axis: null,
      sort: { field: "dedupe_factor", order: "descending" },
      // NOTE(review): presumably normalized so every file's blocks span the
      // full row width -- confirm against the rendered chart.
      stack: "normalize",
    },
    color: {
      // The hovered block is drawn orange wherever it appears; every other
      // block uses the "blues" scheme over a fixed [0, 10] domain.
      condition: [
        { test: "datum.xorb_id == highlight.xorb_id", value: "orange" },
      ],
      field: "dedupe_factor",
      type: "quantitative",
      scale: { scheme: "blues", domain: [0, 10] },
    },
    opacity: {
      // While a file is selected, fade blocks that are not among its blocks.
      condition: [
        {
          test: "any_xorbs_selected && indexof(xorbs_selected, datum.xorb_id) == -1",
          value: 0.2,
        },
      ],
    },
    tooltip: [
      { field: "repo", type: "nominal", title: "File" },
      { field: "xorb_id", type: "nominal", title: "Block Hash" },
      {
        field: "dedupe_factor",
        type: "quantitative",
        title: "Dedupe Factor",
      },
    ],
    row: {
      field: "repo",
      title: "",
      spacing: 1,
      header: { labelAngle: 0, labelAlign: "left", labelFontSize: 14 },
      sort: { field: "dedupe_factor", order: "descending" },
    },
  },
};

// vegaEmbed returns a promise; log failures instead of leaving an unhandled
// rejection and a silently empty chart.
vegaEmbed("#vis", vlSpec).catch(console.error);
</script>
</body>
</html>