ping98k committed on
Commit
935873d
·
1 Parent(s): aaffa94

Enhance cluster naming and K-Means functionality; implement random selection for cluster names, improve centroid initialization, and adjust UMAP parameters for better projection accuracy.

Browse files
Files changed (3) hide show
  1. cluster_naming.js +14 -1
  2. clustering.js +58 -22
  3. main.js +1 -1
cluster_naming.js CHANGED
@@ -5,7 +5,20 @@ const tokenizer = await AutoTokenizer.from_pretrained("onnx-community/Qwen3-0.6B
5
  const model = await AutoModelForCausalLM.from_pretrained("onnx-community/Qwen3-0.6B-ONNX", { device: "webgpu", dtype: "q4f16" });
6
 
7
  export async function nameCluster(lines) {
8
- const joined = lines.join("\n");
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  const messages = [
10
  { role: "system", content: prompt_cluster },
11
  { role: "user", content: `Input:\n${joined}\nOutput:` }
 
5
  const model = await AutoModelForCausalLM.from_pretrained("onnx-community/Qwen3-0.6B-ONNX", { device: "webgpu", dtype: "q4f16" });
6
 
7
  export async function nameCluster(lines) {
8
+ // If more than 5 lines, randomly pick 5
9
+ let selected = lines;
10
+ if (lines.length > 5) {
11
+ selected = [];
12
+ const used = new Set();
13
+ while (selected.length < 5) {
14
+ const idx = Math.floor(Math.random() * lines.length);
15
+ if (!used.has(idx)) {
16
+ used.add(idx);
17
+ selected.push(lines[idx]);
18
+ }
19
+ }
20
+ }
21
+ const joined = selected.join("\n");
22
  const messages = [
23
  { role: "system", content: prompt_cluster },
24
  { role: "user", content: `Input:\n${joined}\nOutput:` }
clustering.js CHANGED
@@ -1,37 +1,69 @@
1
  import { UMAP } from "https://cdn.jsdelivr.net/npm/umap-js@1.4.0/+esm";
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  export function kmeans(embeddings, k, maxIter = 100) {
4
  const n = embeddings.length;
 
 
5
  const dim = embeddings[0].length;
6
- let centroids = Array.from({ length: k }, () => embeddings[Math.floor(Math.random() * n)].slice());
7
- let labels = new Array(n).fill(0);
8
 
9
  const reseed = () => {
10
- let bestIdx = 0, bestDist = -1;
 
11
  for (let i = 0; i < n; ++i) {
12
- let minDist = Infinity;
13
  for (let c = 0; c < k; ++c) {
14
  let dist = 0;
15
- for (let d = 0; d < dim; ++d)
16
- dist += (embeddings[i][d] - centroids[c][d]) ** 2;
17
- if (dist < minDist) minDist = dist;
 
 
18
  }
19
- if (minDist > bestDist) {
20
- bestDist = minDist;
21
- bestIdx = i;
22
  }
23
  }
24
- return embeddings[bestIdx].slice();
25
  };
26
 
27
  for (let iter = 0; iter < maxIter; ++iter) {
28
- let changed = false;
29
  for (let i = 0; i < n; ++i) {
30
- let best = 0, bestDist = Infinity;
 
31
  for (let c = 0; c < k; ++c) {
32
  let dist = 0;
33
- for (let d = 0; d < dim; ++d)
34
- dist += (embeddings[i][d] - centroids[c][d]) ** 2;
 
 
35
  if (dist < bestDist) {
36
  bestDist = dist;
37
  best = c;
@@ -39,11 +71,11 @@ export function kmeans(embeddings, k, maxIter = 100) {
39
  }
40
  if (labels[i] !== best) {
41
  labels[i] = best;
42
- changed = true;
43
  }
44
  }
 
45
  centroids = Array.from({ length: k }, () => new Array(dim).fill(0));
46
- const counts = new Array(k).fill(0);
47
  for (let i = 0; i < n; ++i) {
48
  counts[labels[i]]++;
49
  for (let d = 0; d < dim; ++d)
@@ -53,16 +85,20 @@ export function kmeans(embeddings, k, maxIter = 100) {
53
  if (counts[c] === 0) {
54
  centroids[c] = reseed();
55
  } else {
56
- for (let d = 0; d < dim; ++d)
57
- centroids[c][d] /= counts[c];
58
  }
59
  }
60
- if (!changed) break;
61
  }
62
- return { labels, centroids };
63
  }
64
 
65
  export function runUMAP(embeddings, nNeighbors = 15) {
66
- const umap = new UMAP({ nComponents: 2, nNeighbors, minDist: 0.1 });
 
 
 
 
67
  return umap.fit(embeddings);
68
  }
 
1
  import { UMAP } from "https://cdn.jsdelivr.net/npm/umap-js@1.4.0/+esm";
2
 
3
/**
 * Choose initial centroids with k-means++ seeding.
 *
 * The first centroid is a uniformly random input point. Each following
 * centroid is drawn with probability proportional to the squared Euclidean
 * distance from a point to its nearest already-chosen centroid.
 *
 * @param {number[][]} embeddings - Input vectors; the dimensionality is read
 *   from the first vector.
 * @param {number} k - Number of centroids requested.
 * @returns {number[][]} Copies of the selected input vectors. Empty when
 *   `embeddings` is empty or `k < 1`.
 */
function kmeansPlusPlusInit(embeddings, k) {
  const n = embeddings.length;
  // Nothing to seed from (or nothing requested): avoid reading embeddings[0].
  if (n === 0 || k < 1) return [];
  const dim = embeddings[0].length;
  const centroids = [embeddings[Math.floor(Math.random() * n)].slice()];
  // d2[i] holds the squared distance from point i to its nearest centroid so far.
  const d2 = new Float64Array(n);
  for (let c = 1; c < k; ++c) {
    // Only the most recently added centroid can lower an existing minimum.
    const latest = centroids[c - 1];
    let total = 0;
    for (let i = 0; i < n; ++i) {
      let dist = 0;
      for (let d = 0; d < dim; ++d) {
        const diff = embeddings[i][d] - latest[d];
        dist += diff * diff;
      }
      if (c === 1 || dist < d2[i]) d2[i] = dist;
      total += d2[i];
    }

    // Roulette-wheel selection over the positive weights only.
    let r = Math.random() * total;
    let idx = -1;
    for (let i = 0; i < n; ++i) {
      const weight = d2[i];
      // Zero-weight points coincide with a chosen centroid; never re-pick them.
      if (!(weight > 0)) continue;
      // Remember the last eligible index so rounding drift in `r` cannot
      // walk past the end of the array.
      idx = i;
      if (r < weight) break;
      r -= weight;
    }
    // Every point already coincides with a centroid: any point is as good as another.
    if (idx === -1) idx = Math.floor(Math.random() * n);
    centroids.push(embeddings[idx].slice());
  }
  return centroids;
}
26
+
27
  export function kmeans(embeddings, k, maxIter = 100) {
28
  const n = embeddings.length;
29
+ if (n === 0) return { labels: [], centroids: [] };
30
+ k = Math.max(2, Math.min(k, n));
31
  const dim = embeddings[0].length;
32
+ let centroids = kmeansPlusPlusInit(embeddings, k);
33
+ const labels = new Uint32Array(n);
34
 
35
  const reseed = () => {
36
+ let farIdx = 0;
37
+ let farDist = -1;
38
  for (let i = 0; i < n; ++i) {
39
+ let min = Infinity;
40
  for (let c = 0; c < k; ++c) {
41
  let dist = 0;
42
+ for (let d = 0; d < dim; ++d) {
43
+ const diff = embeddings[i][d] - centroids[c][d];
44
+ dist += diff * diff;
45
+ }
46
+ if (dist < min) min = dist;
47
  }
48
+ if (min > farDist) {
49
+ farDist = min;
50
+ farIdx = i;
51
  }
52
  }
53
+ return embeddings[farIdx].slice();
54
  };
55
 
56
  for (let iter = 0; iter < maxIter; ++iter) {
57
+ let moved = false;
58
  for (let i = 0; i < n; ++i) {
59
+ let best = 0;
60
+ let bestDist = Infinity;
61
  for (let c = 0; c < k; ++c) {
62
  let dist = 0;
63
+ for (let d = 0; d < dim; ++d) {
64
+ const diff = embeddings[i][d] - centroids[c][d];
65
+ dist += diff * diff;
66
+ }
67
  if (dist < bestDist) {
68
  bestDist = dist;
69
  best = c;
 
71
  }
72
  if (labels[i] !== best) {
73
  labels[i] = best;
74
+ moved = true;
75
  }
76
  }
77
+ const counts = new Uint32Array(k);
78
  centroids = Array.from({ length: k }, () => new Array(dim).fill(0));
 
79
  for (let i = 0; i < n; ++i) {
80
  counts[labels[i]]++;
81
  for (let d = 0; d < dim; ++d)
 
85
  if (counts[c] === 0) {
86
  centroids[c] = reseed();
87
  } else {
88
+ const inv = 1 / counts[c];
89
+ for (let d = 0; d < dim; ++d) centroids[c][d] *= inv;
90
  }
91
  }
92
+ if (!moved) break;
93
  }
94
+ return { labels: Array.from(labels), centroids };
95
  }
96
 
97
  export function runUMAP(embeddings, nNeighbors = 15) {
98
+ const umap = new UMAP({
99
+ nComponents: 2,
100
+ nNeighbors: Math.max(1, Math.min(nNeighbors, embeddings.length - 1)),
101
+ minDist: 0.1
102
+ });
103
  return umap.fit(embeddings);
104
  }
main.js CHANGED
@@ -62,7 +62,7 @@ document.getElementById("kmeans-btn").onclick = async () => {
62
  // UMAP projection
63
  const { UMAP } = await import('https://cdn.jsdelivr.net/npm/umap-js@1.4.0/+esm');
64
  const nNeighbors = Math.max(1, Math.min(lines.length - 1, 15));
65
- const umap = new UMAP({ nComponents: 2, nNeighbors, minDist: 0.5, metric: "cosine" });
66
  const proj = umap.fit(embeddings);
67
  // Group lines by cluster
68
  const clustered = Array.from({ length: k }, () => []);
 
62
  // UMAP projection
63
  const { UMAP } = await import('https://cdn.jsdelivr.net/npm/umap-js@1.4.0/+esm');
64
  const nNeighbors = Math.max(1, Math.min(lines.length - 1, 15));
65
+ const umap = new UMAP({ nComponents: 2, nNeighbors, minDist: 0.2, metric: "cosine" });
66
  const proj = umap.fit(embeddings);
67
  // Group lines by cluster
68
  const clustered = Array.from({ length: k }, () => []);