Spaces:
Running
Running
ping98k
committed on
Commit
·
935873d
1
Parent(s):
aaffa94
Enhance cluster naming and K-Means functionality; implement random selection for cluster names, improve centroid initialization, and adjust UMAP parameters for better projection accuracy.
Browse files
- cluster_naming.js +14 -1
- clustering.js +58 -22
- main.js +1 -1
cluster_naming.js
CHANGED
@@ -5,7 +5,20 @@ const tokenizer = await AutoTokenizer.from_pretrained("onnx-community/Qwen3-0.6B
|
|
5 |
const model = await AutoModelForCausalLM.from_pretrained("onnx-community/Qwen3-0.6B-ONNX", { device: "webgpu", dtype: "q4f16" });
|
6 |
|
7 |
export async function nameCluster(lines) {
|
8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
const messages = [
|
10 |
{ role: "system", content: prompt_cluster },
|
11 |
{ role: "user", content: `Input:\n${joined}\nOutput:` }
|
|
|
5 |
const model = await AutoModelForCausalLM.from_pretrained("onnx-community/Qwen3-0.6B-ONNX", { device: "webgpu", dtype: "q4f16" });
|
6 |
|
7 |
export async function nameCluster(lines) {
|
8 |
+
// If more than 5 lines, randomly pick 5
|
9 |
+
let selected = lines;
|
10 |
+
if (lines.length > 5) {
|
11 |
+
selected = [];
|
12 |
+
const used = new Set();
|
13 |
+
while (selected.length < 5) {
|
14 |
+
const idx = Math.floor(Math.random() * lines.length);
|
15 |
+
if (!used.has(idx)) {
|
16 |
+
used.add(idx);
|
17 |
+
selected.push(lines[idx]);
|
18 |
+
}
|
19 |
+
}
|
20 |
+
}
|
21 |
+
const joined = selected.join("\n");
|
22 |
const messages = [
|
23 |
{ role: "system", content: prompt_cluster },
|
24 |
{ role: "user", content: `Input:\n${joined}\nOutput:` }
|
clustering.js
CHANGED
@@ -1,37 +1,69 @@
|
|
1 |
import { UMAP } from "https://cdn.jsdelivr.net/npm/umap-js@1.4.0/+esm";
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
export function kmeans(embeddings, k, maxIter = 100) {
|
4 |
const n = embeddings.length;
|
|
|
|
|
5 |
const dim = embeddings[0].length;
|
6 |
-
let centroids =
|
7 |
-
|
8 |
|
9 |
const reseed = () => {
|
10 |
-
let
|
|
|
11 |
for (let i = 0; i < n; ++i) {
|
12 |
-
let
|
13 |
for (let c = 0; c < k; ++c) {
|
14 |
let dist = 0;
|
15 |
-
for (let d = 0; d < dim; ++d)
|
16 |
-
|
17 |
-
|
|
|
|
|
18 |
}
|
19 |
-
if (
|
20 |
-
|
21 |
-
|
22 |
}
|
23 |
}
|
24 |
-
return embeddings[
|
25 |
};
|
26 |
|
27 |
for (let iter = 0; iter < maxIter; ++iter) {
|
28 |
-
let
|
29 |
for (let i = 0; i < n; ++i) {
|
30 |
-
let best = 0
|
|
|
31 |
for (let c = 0; c < k; ++c) {
|
32 |
let dist = 0;
|
33 |
-
for (let d = 0; d < dim; ++d)
|
34 |
-
|
|
|
|
|
35 |
if (dist < bestDist) {
|
36 |
bestDist = dist;
|
37 |
best = c;
|
@@ -39,11 +71,11 @@ export function kmeans(embeddings, k, maxIter = 100) {
|
|
39 |
}
|
40 |
if (labels[i] !== best) {
|
41 |
labels[i] = best;
|
42 |
-
|
43 |
}
|
44 |
}
|
|
|
45 |
centroids = Array.from({ length: k }, () => new Array(dim).fill(0));
|
46 |
-
const counts = new Array(k).fill(0);
|
47 |
for (let i = 0; i < n; ++i) {
|
48 |
counts[labels[i]]++;
|
49 |
for (let d = 0; d < dim; ++d)
|
@@ -53,16 +85,20 @@ export function kmeans(embeddings, k, maxIter = 100) {
|
|
53 |
if (counts[c] === 0) {
|
54 |
centroids[c] = reseed();
|
55 |
} else {
|
56 |
-
|
57 |
-
|
58 |
}
|
59 |
}
|
60 |
-
if (!
|
61 |
}
|
62 |
-
return { labels, centroids };
|
63 |
}
|
64 |
|
65 |
export function runUMAP(embeddings, nNeighbors = 15) {
|
66 |
-
const umap = new UMAP({
|
|
|
|
|
|
|
|
|
67 |
return umap.fit(embeddings);
|
68 |
}
|
|
|
1 |
import { UMAP } from "https://cdn.jsdelivr.net/npm/umap-js@1.4.0/+esm";
|
2 |
|
3 |
+
/**
 * Pick initial centroids for k-means using k-means++ seeding.
 *
 * The first centroid is a uniformly random point. Each later centroid is
 * drawn with probability proportional to the squared distance from a point
 * to its nearest already-chosen centroid.
 *
 * @param {ArrayLike<ArrayLike<number>>} embeddings - Points to seed from; each
 *   row must support `.length`, indexing and `.slice()`.
 * @param {number} k - Number of centroids requested.
 * @returns {Array<ArrayLike<number>>} Copies of the chosen rows. Empty when
 *   `embeddings` is empty; otherwise at least one centroid is returned.
 */
function kmeansPlusPlusInit(embeddings, k) {
  const n = embeddings.length;
  // Nothing to seed from; without this guard embeddings[0] would be undefined.
  if (n === 0) return [];
  const dim = embeddings[0].length;
  const centroids = [embeddings[Math.floor(Math.random() * n)].slice()];
  // d2[i] is the squared distance from point i to its nearest chosen centroid.
  const d2 = new Float64Array(n);
  for (let c = 1; c < k; ++c) {
    // Only the most recently added centroid can lower a point's nearest distance.
    const latest = centroids[c - 1];
    let total = 0;
    for (let i = 0; i < n; ++i) {
      const point = embeddings[i];
      let dist = 0;
      for (let d = 0; d < dim; ++d) {
        const diff = point[d] - latest[d];
        dist += diff * diff;
      }
      if (c === 1 || dist < d2[i]) d2[i] = dist;
      total += d2[i];
    }
    // Roulette-wheel selection over the positive weights only. Zero-weight
    // points coincide with an existing centroid and must never be drawn, and
    // the walk is bounded by n so rounding error in the running subtraction
    // cannot step past the end of the array.
    let r = Math.random() * total;
    let idx = -1;
    for (let i = 0; i < n; ++i) {
      if (!(d2[i] > 0)) continue;
      idx = i; // remember the last eligible point as a rounding fallback
      if (r < d2[i]) break;
      r -= d2[i];
    }
    // Every point already coincides with a centroid: any pick is a duplicate.
    if (idx === -1) idx = 0;
    centroids.push(embeddings[idx].slice());
  }
  return centroids;
}
|
26 |
+
|
27 |
export function kmeans(embeddings, k, maxIter = 100) {
|
28 |
const n = embeddings.length;
|
29 |
+
if (n === 0) return { labels: [], centroids: [] };
|
30 |
+
k = Math.max(2, Math.min(k, n));
|
31 |
const dim = embeddings[0].length;
|
32 |
+
let centroids = kmeansPlusPlusInit(embeddings, k);
|
33 |
+
const labels = new Uint32Array(n);
|
34 |
|
35 |
const reseed = () => {
|
36 |
+
let farIdx = 0;
|
37 |
+
let farDist = -1;
|
38 |
for (let i = 0; i < n; ++i) {
|
39 |
+
let min = Infinity;
|
40 |
for (let c = 0; c < k; ++c) {
|
41 |
let dist = 0;
|
42 |
+
for (let d = 0; d < dim; ++d) {
|
43 |
+
const diff = embeddings[i][d] - centroids[c][d];
|
44 |
+
dist += diff * diff;
|
45 |
+
}
|
46 |
+
if (dist < min) min = dist;
|
47 |
}
|
48 |
+
if (min > farDist) {
|
49 |
+
farDist = min;
|
50 |
+
farIdx = i;
|
51 |
}
|
52 |
}
|
53 |
+
return embeddings[farIdx].slice();
|
54 |
};
|
55 |
|
56 |
for (let iter = 0; iter < maxIter; ++iter) {
|
57 |
+
let moved = false;
|
58 |
for (let i = 0; i < n; ++i) {
|
59 |
+
let best = 0;
|
60 |
+
let bestDist = Infinity;
|
61 |
for (let c = 0; c < k; ++c) {
|
62 |
let dist = 0;
|
63 |
+
for (let d = 0; d < dim; ++d) {
|
64 |
+
const diff = embeddings[i][d] - centroids[c][d];
|
65 |
+
dist += diff * diff;
|
66 |
+
}
|
67 |
if (dist < bestDist) {
|
68 |
bestDist = dist;
|
69 |
best = c;
|
|
|
71 |
}
|
72 |
if (labels[i] !== best) {
|
73 |
labels[i] = best;
|
74 |
+
moved = true;
|
75 |
}
|
76 |
}
|
77 |
+
const counts = new Uint32Array(k);
|
78 |
centroids = Array.from({ length: k }, () => new Array(dim).fill(0));
|
|
|
79 |
for (let i = 0; i < n; ++i) {
|
80 |
counts[labels[i]]++;
|
81 |
for (let d = 0; d < dim; ++d)
|
|
|
85 |
if (counts[c] === 0) {
|
86 |
centroids[c] = reseed();
|
87 |
} else {
|
88 |
+
const inv = 1 / counts[c];
|
89 |
+
for (let d = 0; d < dim; ++d) centroids[c][d] *= inv;
|
90 |
}
|
91 |
}
|
92 |
+
if (!moved) break;
|
93 |
}
|
94 |
+
return { labels: Array.from(labels), centroids };
|
95 |
}
|
96 |
|
97 |
/**
 * Project embeddings to two dimensions with UMAP.
 *
 * @param {Array<Array<number>>} embeddings - Input vectors, one per row.
 * @param {number} [nNeighbors=15] - Requested neighbourhood size; it is
 *   capped at `embeddings.length - 1` and floored at 1 before use.
 * @returns {*} Whatever `UMAP#fit` returns for the given embeddings.
 */
export function runUMAP(embeddings, nNeighbors = 15) {
  // Keep the neighbourhood size within [1, number of points - 1].
  const upperBound = embeddings.length - 1;
  const clampedNeighbors = Math.max(1, Math.min(nNeighbors, upperBound));
  const options = {
    nComponents: 2,
    nNeighbors: clampedNeighbors,
    minDist: 0.1
  };
  return new UMAP(options).fit(embeddings);
}
|
main.js
CHANGED
@@ -62,7 +62,7 @@ document.getElementById("kmeans-btn").onclick = async () => {
|
|
62 |
// UMAP projection
|
63 |
const { UMAP } = await import('https://cdn.jsdelivr.net/npm/umap-js@1.4.0/+esm');
|
64 |
const nNeighbors = Math.max(1, Math.min(lines.length - 1, 15));
|
65 |
-
const umap = new UMAP({ nComponents: 2, nNeighbors, minDist: 0.
|
66 |
const proj = umap.fit(embeddings);
|
67 |
// Group lines by cluster
|
68 |
const clustered = Array.from({ length: k }, () => []);
|
|
|
62 |
// UMAP projection
|
63 |
const { UMAP } = await import('https://cdn.jsdelivr.net/npm/umap-js@1.4.0/+esm');
|
64 |
const nNeighbors = Math.max(1, Math.min(lines.length - 1, 15));
|
65 |
+
const umap = new UMAP({ nComponents: 2, nNeighbors, minDist: 0.2, metric: "cosine" });
|
66 |
const proj = umap.fit(embeddings);
|
67 |
// Group lines by cluster
|
68 |
const clustered = Array.from({ length: k }, () => []);
|