/*
 * Embedding-Playground / cluster_naming.js
 * ping98k
 * Enhance cluster naming and K-Means functionality; implement random selection for cluster names, improve centroid initialization, and adjust UMAP parameters for better projection accuracy.
 * 935873d
 */
import { AutoTokenizer, AutoModelForCausalLM } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.6.0';
import { prompt_cluster } from "./prompt_cluster.js";
// Tokenizer and causal LM are both loaded from the same checkpoint id.
const MODEL_ID = "onnx-community/Qwen3-0.6B-ONNX";

// Neither load uses the other's result, so start both at once instead of
// waiting for the tokenizer before the model load even begins.
// The model is requested on the "webgpu" device with "q4f16" weights.
const [tokenizer, model] = await Promise.all([
  AutoTokenizer.from_pretrained(MODEL_ID),
  AutoModelForCausalLM.from_pretrained(MODEL_ID, { device: "webgpu", dtype: "q4f16" }),
]);
/**
 * Returns at most `limit` entries of `lines`, chosen uniformly at random
 * without replacement. When `lines` already fits, it is returned as-is.
 *
 * Uses a partial Fisher–Yates shuffle on a copy, so the caller's array is
 * never mutated and exactly `limit` random draws are made (no retry loop).
 *
 * @param {Array} lines - Candidate lines.
 * @param {number} limit - Maximum number of lines to keep.
 * @returns {Array} The original array, or a new array of `limit` entries.
 */
function pickRandomLines(lines, limit) {
  if (lines.length <= limit) {
    return lines;
  }
  const pool = [...lines];
  for (let i = 0; i < limit; i++) {
    const j = i + Math.floor(Math.random() * (pool.length - i));
    [pool[i], pool[j]] = [pool[j], pool[i]];
  }
  return pool.slice(0, limit);
}

/**
 * Pulls the generated name out of decoded chat text that still carries the
 * chat-template markers (decoded with `skip_special_tokens: false`).
 *
 * @param {string} decoded - Decoded output text.
 * @returns {string} The trimmed text of the final assistant reply.
 */
function extractGeneratedName(decoded) {
  const ASSISTANT_TAG = "<|im_start|>assistant";
  const THINK_TAG = "</think>";
  const END_TAG = "<|im_end|>";

  let name = decoded;

  // The decoded text may still contain the prompt turns. Keep only what
  // follows the last assistant marker so that, when no "</think>" is present,
  // the END_TAG cut below cannot return the system prompt instead of the reply.
  const assistantAt = name.lastIndexOf(ASSISTANT_TAG);
  if (assistantAt !== -1) {
    name = name.slice(assistantAt + ASSISTANT_TAG.length);
  }

  // Drop everything up to and including the (possibly empty) think block.
  const thinkAt = name.lastIndexOf(THINK_TAG);
  if (thinkAt !== -1) {
    name = name.slice(thinkAt + THINK_TAG.length);
  }

  // Stop at the end-of-turn marker if the model emitted one.
  const endAt = name.indexOf(END_TAG);
  if (endAt !== -1) {
    name = name.slice(0, endAt);
  }

  return name.trim();
}

/**
 * Asks the language model for a name describing a cluster of lines.
 *
 * If more than 5 lines are given, 5 of them are picked at random; the chosen
 * lines are joined with newlines and sent as the user message, with
 * `prompt_cluster` as the system message. Generation samples at temperature
 * 0.6 with up to 1024 new tokens and thinking disabled in the chat template.
 *
 * @param {Array} lines - Lines belonging to the cluster (presumably strings;
 *   they are joined with "\n").
 * @returns {Promise<string>} The model's reply with chat markers removed.
 */
export async function nameCluster(lines) {
  // If more than 5 lines, randomly pick 5
  const selected = pickRandomLines(lines, 5);
  const joined = selected.join("\n");

  const messages = [
    { role: "system", content: prompt_cluster },
    { role: "user", content: `Input:\n${joined}\nOutput:` },
  ];
  const inputs = tokenizer.apply_chat_template(messages, {
    add_generation_prompt: true,
    return_dict: true,
    enable_thinking: false,
  });
  const outputTokens = await model.generate({
    ...inputs,
    max_new_tokens: 1024,
    do_sample: true,
    temperature: 0.6,
  });

  // Special tokens are kept on purpose: extractGeneratedName uses them to
  // locate the assistant reply.
  const decoded = tokenizer.decode(outputTokens[0], { skip_special_tokens: false });
  return extractGeneratedName(decoded);
}