// Spaces:
// Running
// Running
// ping98k
// Refactor K-Means clustering implementation; modularize embedding and clustering logic, enhance heatmap and scatter plot functions, and improve cluster naming process.
// 12c4198
import { pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.6.0';

/**
 * Feature-extraction pipeline used by the embedding helpers in this module.
 *
 * It is created once with a top-level `await`, so importing this module does
 * not complete until `pipeline(...)` has resolved. The pipeline is requested
 * with the "webgpu" device and the "q4f16" dtype.
 */
const embed = await pipeline(
  "feature-extraction",
  "onnx-community/Qwen3-Embedding-0.6B-ONNX",
  { device: "webgpu", dtype: "q4f16" },
);
/**
 * Computes one vector per group by embedding each of the group's lines and
 * taking the element-wise mean of the resulting line vectors.
 *
 * Each group is split on newlines; lines are trimmed, and blank lines and
 * lines starting with `##` are dropped. Every remaining line is wrapped as
 * `Instruct: <task>\nQuery:<line>` and passed to `embed` with
 * `pooling: "mean"` and `normalize: true`. The averaged vector is not
 * re-normalized.
 *
 * NOTE(review): the Qwen3-Embedding model card examples appear to use
 * last-token pooling — confirm that mean pooling is intended here.
 *
 * @param {string[]} groups - Newline-separated text blocks, one per group.
 * @param {string} task - Instruction text inserted after `Instruct: `.
 * @returns {Promise<Float32Array[]>} One averaged vector per group, in input order.
 * @throws {Error} If any group has no lines left after filtering.
 */
export async function getGroupEmbeddings(groups, task) {
  // Parse and validate every group before touching the model, so an unusable
  // group is reported by index instead of surfacing as an opaque failure
  // after earlier groups have already been embedded.
  const linesPerGroup = groups.map((group, index) => {
    // Remove blank lines and lines starting with ##
    const lines = group
      .split(/\n/)
      .map((line) => line.trim())
      .filter((line) => line && !line.startsWith('##'));
    if (lines.length === 0) {
      throw new Error(`Group at index ${index} has no lines to embed`);
    }
    return lines;
  });

  const groupEmbeddings = [];
  for (const lines of linesPerGroup) {
    const prompts = lines.map((s) => `Instruct: ${task}\nQuery:${s}`);
    const out = await embed(prompts, { pooling: "mean", normalize: true });

    let rows;
    if (typeof out.tolist === 'function') {
      rows = out.tolist();
    } else if (typeof out.data[0] === 'number') {
      // Flat buffer: one equal-length row per prompt, stored back to back.
      const width = out.data.length / prompts.length;
      rows = prompts.map((_, r) => out.data.slice(r * width, (r + 1) * width));
    } else {
      // Already a list of rows.
      rows = out.data;
    }

    const dim = rows[0].length;
    const avg = new Float32Array(dim);
    for (const row of rows) {
      for (let i = 0; i < dim; i++) avg[i] += row[i];
    }
    for (let i = 0; i < dim; i++) avg[i] /= rows.length;
    groupEmbeddings.push(avg);
  }
  return groupEmbeddings;
}
/**
 * Embeds each line separately and returns one vector per line.
 *
 * Every line is wrapped as `Instruct: <task>\nQuery:<line>` and passed to
 * `embed` with `pooling: "mean"` and `normalize: true`.
 *
 * NOTE(review): the Qwen3-Embedding model card examples appear to use
 * last-token pooling — confirm that mean pooling is intended here.
 *
 * @param {string[]} lines - Texts to embed; used as-is (no trimming or filtering).
 * @param {string} task - Instruction text inserted after `Instruct: `.
 * @returns {Promise<number[][]>} One vector per input line, in input order;
 *   an empty array when `lines` is empty.
 */
export async function getLineEmbeddings(lines, task) {
  // Nothing to embed: skip the model call instead of handing it an empty batch.
  if (lines.length === 0) return [];

  const prompts = lines.map((s) => `Instruct: ${task}\nQuery:${s}`);
  const out = await embed(prompts, { pooling: "mean", normalize: true });
  if (typeof out.tolist === 'function') return out.tolist();

  const { data } = out;
  // Already a list of rows: hand it back unchanged.
  if (typeof data[0] !== 'number') return data;

  // Flat buffer: split it into one equal-length row per prompt so the result
  // has the same nested shape as the `tolist()` path.
  const width = data.length / prompts.length;
  return prompts.map((_, r) => Array.from(data.slice(r * width, (r + 1) * width)));
}