From bc1a793042b5290fd5dbd1f879eee3f98b90b394 Mon Sep 17 00:00:00 2001 From: Brian Joseph Petro Date: Sun, 22 Dec 2024 10:54:04 -0500 Subject: [PATCH] - Introduced the SmartClusters module, including core classes for managing clusters and their members. - Added a new `source.js` adapter implementing a k-centers clustering approach, enhancing clustering capabilities. - Refactored `SmartCluster` and `SmartClusters` classes to improve member management and clustering logic. - Enhanced the rendering of clusters in the UI with updated HTML structure and improved member display. - Added integration tests to validate clustering functionality and member reassignment upon deletion. - Created a test content generation script to facilitate integration testing with various markdown files. --- smart-clusters/.gitignore | 1 + smart-clusters/adapters/_adapter.js | 72 +++-- .../adapters/data/ajson_multi_file.js | 0 smart-clusters/adapters/source.js | 268 ++++++++++++++++++ smart-clusters/components/cluster.js | 33 +-- smart-clusters/index.js | 5 + smart-clusters/package.json | 42 +++ smart-clusters/smart_cluster.js | 137 ++++----- smart-clusters/smart_clusters.js | 248 ++++------------ smart-clusters/test/source_clusters.test.js | 209 ++++++++++++++ smart-clusters/test/test_content.js | 133 +++++++++ smart-clusters/utils/shuffle_array.js | 20 ++ .../adapters/ajson_multi_file.js | 2 +- smart-embed-model/package.json | 5 +- 14 files changed, 846 insertions(+), 329 deletions(-) create mode 100644 smart-clusters/.gitignore create mode 100644 smart-clusters/adapters/data/ajson_multi_file.js create mode 100644 smart-clusters/adapters/source.js create mode 100644 smart-clusters/index.js create mode 100644 smart-clusters/package.json create mode 100644 smart-clusters/test/source_clusters.test.js create mode 100644 smart-clusters/test/test_content.js create mode 100644 smart-clusters/utils/shuffle_array.js diff --git a/smart-clusters/.gitignore b/smart-clusters/.gitignore new file mode 
100644 index 00000000..b512c09d --- /dev/null +++ b/smart-clusters/.gitignore @@ -0,0 +1 @@ +node_modules \ No newline at end of file diff --git a/smart-clusters/adapters/_adapter.js b/smart-clusters/adapters/_adapter.js index 57ee16ac..3d5fbdc7 100644 --- a/smart-clusters/adapters/_adapter.js +++ b/smart-clusters/adapters/_adapter.js @@ -1,55 +1,47 @@ /** - * @class SmartClustersDataAdapter - * @classdesc Provides load/save operations for SmartCluster entities. - * This is a placeholder to demonstrate how to integrate data persistence. - * In practice, implement reading/writing to file or DB here. + * @file _adapter.js + * @description Base adapter classes for building clusters from sources (or other items). */ -export class SmartClustersDataAdapter { - constructor(collection) { - this.collection = collection; - } - - get env() { return this.collection.env; } +/** + * @class ClusterCollectionAdapter + * @classdesc + * Interface for a collection-level adapter that can build clusters from environment items. + */ +export class ClusterCollectionAdapter { /** - * Loads all cluster data from a persistent store (JSON file, DB, etc.). - * In this placeholder, we assume data is stored in memory (or a single JSON file). - * @returns {Promise} + * @constructor + * @param {Object} collection - The cluster collection instance. */ - async load_all() { - // TODO: Load from `smart_env.json` or a cluster-specific file if desired. - // For now, assume empty or existing data in this.env. - const stored_data = this.env._clusters_data || []; - for (const cluster_data of stored_data) { - const cluster = new this.collection.item_type(this.env, cluster_data); - this.collection.set(cluster); - } + constructor(collection) { + this.collection = collection; } /** - * Saves all clusters to a persistent store. - * @returns {Promise} + * @async + * Build clusters. (No-op by default.) 
*/ - async save_all() { - // Gather all clusters and store them: - const clusters_data = Object.values(this.collection.items).map(item => item.data); - // Placeholder: store in memory. In production, write to a file or DB. - this.env._clusters_data = clusters_data; + async build_groups() { + throw new Error("Not implemented. Override in subclass."); } +} +/** + * @class ClusterItemAdapter + * @classdesc + * Interface for item-level logic if needed for cluster items. + */ +export class ClusterItemAdapter { /** - * Save a single cluster. - * @param {SmartCluster} cluster - * @returns {Promise} + * @constructor + * @param {Object} item - The cluster item instance. */ - async save(cluster) { - // Update single cluster data in memory. - const idx = (this.env._clusters_data || []).findIndex(c => c.key === cluster.key); - if (idx === -1) { - this.env._clusters_data = this.env._clusters_data || []; - this.env._clusters_data.push(cluster.data); - } else { - this.env._clusters_data[idx] = cluster.data; - } + constructor(item) { + this.item = item; } } + +export default { + collection: ClusterCollectionAdapter, + item: ClusterItemAdapter +}; diff --git a/smart-clusters/adapters/data/ajson_multi_file.js b/smart-clusters/adapters/data/ajson_multi_file.js new file mode 100644 index 00000000..e69de29b diff --git a/smart-clusters/adapters/source.js b/smart-clusters/adapters/source.js new file mode 100644 index 00000000..5d7fac86 --- /dev/null +++ b/smart-clusters/adapters/source.js @@ -0,0 +1,268 @@ +/** + * @file source.js (Alternative “k-centers” style) + * @description + * An alternative clustering adapter that exclusively uses a “k-centers” approach, + * aiming to minimize the maximum distance (or equivalently, maximize the minimum similarity). 
+ */ + +import { ClusterCollectionAdapter, ClusterItemAdapter } from "./_adapter.js"; +import { cos_sim } from "smart-entities/cos_sim.js"; // or from your local cos_sim +import { shuffle_array } from "../utils/shuffle_array.js"; + +/** + * @class SourceClustersAdapterKCenters + * @extends ClusterCollectionAdapter + * @description + * Builds clusters by scanning `env.smart_sources` for items with a `.vec`, + * using a k-centers approach. The cluster "center" is always the actual + * member in that cluster that minimizes maximum distance to all other members + * in the cluster (i.e. `nearest_member`). + */ +export class SourceClustersAdapter extends ClusterCollectionAdapter { + + /** + * Primary entrypoint: build the clusters from the `smart_sources`. + * + * REQUIRED USER SETTINGS (in cluster `settings_config`): + * - `clusters_ct` + * - `max_iterations` + * + * Optional: You could read from additional fields if desired, but here we only + * use `clusters_ct` (the number of clusters) and `max_iterations`. + */ + async build_groups() { + console.log("build_groups"); + // 1. Grab user config + const { + clusters_ct = 5, + max_iterations = 10, + } = this.collection.settings ?? {}; // or this.collection.settings_config + + // 2. Filter out any sources that lack a vector + const sources = this.collection.env.smart_sources.filter(s => s?.vec); + + if (sources.length === 0) { + console.warn("No sources with vectors found; skipping cluster build."); + return; + } + + // 3. CLEAR existing clusters (or you can mark them deleted) + this._clear_existing_clusters(); + + // 4. PICK initial cluster centers (k-centers style): + // pick 1 random, then repeatedly pick the source that is furthest from all chosen centers + const centers = this._choose_initial_k_centers(sources, clusters_ct); + + // 5. 
Create cluster items for each center + const clusterItems = await Promise.all(centers.map(async (centerSource, i) => { + return await this.collection.create_or_update({ + key: centerSource.key, + center_source_key: centerSource.key, + name: `Cluster #${i + 1}`, + members: [], + number_of_members: 0, + clustering_timestamp: Date.now(), + }); + })); + + // 6. Refine clusters for up to max_iterations + for (let iter = 0; iter < max_iterations; iter++) { + let changed = false; + + // 6a. Assign every source to the nearest center + // We’ll track membership in a scratch map: clusterKey => arrayOfSourceKeys + const newMembershipMap = {}; + clusterItems.forEach(ci => { + newMembershipMap[ci.key] = []; + }); + + for (const src of sources) { + // find cluster whose center yields the highest cos_sim + let bestCluster = null; + let bestSim = -Infinity; + + for (const ci of clusterItems) { + const centerVec = this._get_center_vec(ci); + if (!centerVec) continue; + const sim = cos_sim(src.vec, centerVec); + if (sim > bestSim) { + bestSim = sim; + bestCluster = ci; + } + } + + if (bestCluster) { + newMembershipMap[bestCluster.key].push(src.key); + } + } + + // 6b. For each cluster, pick the "nearest_member" that + // minimizes the maximum distance to all other members + // (or equivalently, maximizes min-sim). + for (const ci of clusterItems) { + const newMembers = newMembershipMap[ci.key] || []; + ci.data.members = newMembers; // store membership + if (newMembers.length === 0) continue; + + // pick the new center by "nearest_member" logic + const newCenterKey = this._find_nearest_member(ci, newMembers); + if (newCenterKey && newCenterKey !== ci.data.center_source_key) { + ci.data.key = newCenterKey; + ci.data.center_source_key = newCenterKey; + changed = true; + } + } + + if (!changed) { + // no cluster center changed => stable + break; + } + } + + console.log("clusterItems", clusterItems.map(ci => ci.key)); + // 7. 
Finalize cluster data + clusterItems.forEach(ci => { + ci.data.number_of_members = ci.data.members?.length ?? 0; + ci.data.clustering_timestamp = Date.now(); + this.collection.set(ci); + // Mark them for saving + ci.queue_save(); + }); + console.log(Object.values(this.collection.items).length); + } + + /** + * Private helper: Choose K centers using a standard k-center approach: + * - pick 1 center at random + * - pick each subsequent center by finding the source that is furthest from any existing center + */ + _choose_initial_k_centers(sources, k) { + if (k >= sources.length) return sources.slice(0, k); + + const pickedCenters = []; + // pick the first random + const shuffled = shuffle_array([...sources]); + pickedCenters.push(shuffled[0]); + + // pick the rest + while (pickedCenters.length < k) { + let bestCandidate = null; + let bestDist = -Infinity; + + // for each source, compute distance to its nearest picked center + // we want the one that is furthest from *all* picked centers + for (const s of sources) { + if (pickedCenters.includes(s)) continue; + // find the highest sim among the already-chosen centers + let nearestSim = -Infinity; + for (const c of pickedCenters) { + const sim = cos_sim(s.vec, c.vec); + if (sim > nearestSim) { + nearestSim = sim; + } + } + // distance ~ 1 - sim, or we can just track sim + // we want to maximize the distance => minimize the sim + if (nearestSim < bestDist || bestDist < 0) { + // we are looking for the source with the minimal "nearestSim" + // so we actually want the smallest nearestSim + } + // Actually simpler: track `lowestSimSoFar` and pick the source whose `lowestSimSoFar` is smallest + if (bestCandidate === null) { + bestCandidate = s; + bestDist = nearestSim; + } else if (nearestSim < bestDist) { + bestCandidate = s; + bestDist = nearestSim; + } + } + + if (bestCandidate) { + pickedCenters.push(bestCandidate); + } else { + // if none found, means all are accounted for + break; + } + } + + return pickedCenters; + } + 
+ /** + * Private helper: Clear existing clusters by removing items from the cluster collection. + */ + _clear_existing_clusters() { + const cluster_keys = Object.keys(this.collection.items); + cluster_keys.forEach(k => { + this.collection.delete_item(k); + }); + } + + /** + * Private helper: Return the cluster's current center vector. + * + * @param {SmartCluster} cluster + * @returns {number[] | null} + */ + _get_center_vec(cluster) { + const centerSource = this.collection.env.smart_sources.get(cluster.data.center_source_key); + return centerSource?.vec || null; + } + + /** + * Private helper: Among `memberKeys`, pick the key that yields the smallest maximum distance + * (largest min-sim) to the other members in that cluster. + * + * @param {SmartCluster} cluster + * @param {string[]} memberKeys + * @returns {string|null} chosen center source key + */ + _find_nearest_member(cluster, memberKeys) { + // if only 1 member, that must be center + if (memberKeys.length === 1) return memberKeys[0]; + + let bestKey = cluster.data.center_source_key ?? null; + let bestScore = -Infinity; // track the best "score" + + // convert keys to source objects + const sources = memberKeys + .map(k => this.collection.env.smart_sources.get(k)) + .filter(s => s?.vec); + + // for each candidate, measure the minimum cos_sim with others, or the average + // "k-center" typically uses “minimize the maximum distance”, i.e. 
+ // we measure the “worst-case similarity” from candidate to all others + for (const candidate of sources) { + let worstSim = Infinity; + for (const other of sources) { + if (other.key === candidate.key) continue; + const sim = cos_sim(candidate.vec, other.vec); + if (sim < worstSim) { + worstSim = sim; + } + } + // we want to maximize worstSim + if (worstSim > bestScore) { + bestScore = worstSim; + bestKey = candidate.key; + } + } + return bestKey; + } +} + +/** + * @class SourceClusterAdapterKCenters + * @extends ClusterItemAdapter + * @description + * If needed, override any per-cluster item logic. Typically we rely on `SmartCluster` + * for "delete => reassign" etc. + */ +export class SourceClusterAdapter extends ClusterItemAdapter { + // no additional logic needed for a minimal example +} + +export default { + collection: SourceClustersAdapter, + item: SourceClusterAdapter +}; \ No newline at end of file diff --git a/smart-clusters/components/cluster.js b/smart-clusters/components/cluster.js index e48d9fd5..949a4bc9 100644 --- a/smart-clusters/components/cluster.js +++ b/smart-clusters/components/cluster.js @@ -1,20 +1,21 @@ export async function render(scope, opts = {}) { - const html = `
-

Clusters

-

View and manage your clusters below:

-
-
`; + const html = ` +
+

Smart Clusters

+

These are your automatically generated clusters based on source vectors.

+
    +
    + `; const frag = this.create_doc_fragment(html); - const list = frag.querySelector('.sg-cluster-list'); - // Render each cluster - for (const cluster of Object.values(scope.items)) { - const div = document.createElement('div'); - div.className = 'sg-cluster-item'; - div.innerHTML = `

    ${cluster.data.name || cluster.key}

    -

    Members: ${cluster.data.member_keys.length}

    `; - list.appendChild(div); - } - + const ul = frag.querySelector('.sc-cluster-list'); + Object.values(scope.items).forEach(cluster => { + const li = document.createElement('li'); + li.innerHTML = ` + ${cluster.name} + (Members: ${cluster.data.members?.length ?? 0}) + `; + ul.appendChild(li); + }); return frag; -} +} \ No newline at end of file diff --git a/smart-clusters/index.js b/smart-clusters/index.js new file mode 100644 index 00000000..a69ba543 --- /dev/null +++ b/smart-clusters/index.js @@ -0,0 +1,5 @@ +import { SmartClusters } from "./smart_clusters.js"; +import { SmartCluster } from "./smart_cluster.js"; +import source_cluster_adapter from "./adapters/source.js"; + +export { SmartClusters, SmartCluster, source_cluster_adapter }; \ No newline at end of file diff --git a/smart-clusters/package.json b/smart-clusters/package.json new file mode 100644 index 00000000..f493fde8 --- /dev/null +++ b/smart-clusters/package.json @@ -0,0 +1,42 @@ +{ + "name": "smart-clusters", + "author": "Brian Joseph Petro (🌴 Brian)", + "license": "MIT", + "version": "0.0.1", + "type": "module", + "description": "Smart Clusters", + "main": "index.js", + "scripts": { + "test": "npx ava --verbose" + }, + "keywords": [ + "embeddings", + "clusters" + ], + "repository": { + "type": "git", + "url": "brianpetro/jsbrains" + }, + "bugs": { + "url": "https://github.com/brianpetro/jsbrains/issues" + }, + "homepage": "https://jsbrains.org", + "dependencies": { + "smart-groups": "file:../smart-groups" + }, + "devDependencies": { + "@huggingface/transformers": "^3.2.1", + "ava": "^6.0.1", + "smart-blocks": "file:../smart-blocks", + "smart-embed-model": "file:../smart-embed-model", + "smart-environment": "file:../smart-environment", + "smart-fs": "file:../smart-fs", + "smart-settings": "file:../smart-settings", + "smart-sources": "file:../smart-sources" + }, + "ava": { + "files": [ + "test/**/*.test.js" + ] + } +} diff --git a/smart-clusters/smart_cluster.js b/smart-clusters/smart_cluster.js 
index a0daa0d6..10cece68 100644 --- a/smart-clusters/smart_cluster.js +++ b/smart-clusters/smart_cluster.js @@ -1,99 +1,88 @@ -import { SmartEntity } from "smart-entities"; +import { SmartGroup } from "smart-groups"; +import { cos_sim } from "../smart-entities/cos_sim.js"; -export class SmartCluster extends SmartEntity { +export class SmartCluster extends SmartGroup { static get defaults() { return { data: { - key: null, - member_keys: [], - centroid_vec: null, - last_clustered_at: 0, - size: 0, + center_source_key: null, + members: [], name: '', - config: {} - }, + number_of_members: 0, + clustering_timestamp: 0, + } }; } - constructor(env, data = {}) { - super(env, data); + get key() { + return this.center_source.key; } /** - * Recalculate centroid of this cluster using either mean or median of members' embeddings. + * cluster.center_vec is a getter returning cluster.center_source.vec + * @returns {number[]|null} */ - recalculate_centroid() { - if (!this.member_keys?.length) { - this.data.centroid_vec = null; - return; - } - - const member_vectors = this.member_keys - .map(key => this.get_item_vector(key)) - .filter(vec => vec && Array.isArray(vec)); - - if (!member_vectors.length) { - this.data.centroid_vec = null; - return; - } - - const vec_length = member_vectors[0].length; - const all_values = member_vectors.map(v => v.slice()); - - if (this.data.config.centroid_type === 'median') { - // median centroid - const median_vec = []; - for (let i = 0; i < vec_length; i++) { - const vals = all_values.map(vec => vec[i]).sort((a, b) => a - b); - const mid = Math.floor(vals.length / 2); - median_vec[i] = vals.length % 2 === 0 ? 
(vals[mid - 1] + vals[mid]) / 2 : vals[mid]; - } - this.data.centroid_vec = median_vec; - } else { - // mean centroid (default) - const sum_vec = new Array(vec_length).fill(0); - for (const vec of all_values) { - for (let i = 0; i < vec_length; i++) { - sum_vec[i] += vec[i]; - } - } - const mean_vec = sum_vec.map(val => val / all_values.length); - this.data.centroid_vec = mean_vec; - } + get center_vec() { + return this.center_source?.vec || null; } /** - * Retrieves vector for a given item key. - * Currently assumes items are from smart_sources or smart_blocks. - * Extend as needed for other collections. - * @param {string} item_key + * cluster.center_source is a getter returning the source instance + * from env.smart_sources.get(cluster.data.center_source_key) */ - get_item_vector(item_key) { - const source = this.env.smart_sources.get(item_key) || this.env.smart_blocks.get(item_key); - return source?.vec || null; + get center_source() { + if(!this.data.center_source_key) return null; + return this.env.smart_sources.get(this.data.center_source_key); } /** - * Generate a name for the cluster from its members. - * Simple heuristic: take top few member names and join them. + * Dynamically generate a cluster name from top members or use data.name if present. + * Example: "Cluster: (Note1, Note2, ...)" */ - generate_name() { - const items = this.member_keys.map(key => this.env.smart_sources.get(key) || this.env.smart_blocks.get(key)) - .filter(item => item); - - const names = items.map(it => it.name || it.key); - // Simple name: first 2-3 item names joined - this.data.name = names.slice(0, 3).join(", ") + (names.length > 3 ? "..." : ""); + get name() { + if(this.data.name) return this.data.name; + const membersList = (this.data.members || []) + .slice(0, 3) + .map(k => this.env.smart_sources.get(k)?.file_name || k) + .join(", "); + return `Cluster (${membersList}${this.data.members?.length>3 ? "..." 
: ""})`; } - - async save() { - await this.env.smart_clusters.data_adapter.save(this); + set name(val) { + this.data.name = val; } - get member_keys() { return this.data.member_keys; } + async delete() { + // 1) Reassign members + const allClusters = Object.values(this.collection.items) + .filter(c => c.key !== this.key); + + if (allClusters.length) { + this.data.members.forEach(mKey => { + const source = this.env.smart_sources.get(mKey); + if (!source?.vec) return; + + // find the best new cluster + let best = { cluster: null, sim: -Infinity }; + allClusters.forEach(cluster => { + const cvec = cluster.center_vec; + if (!cvec) return; + const sim = cos_sim(source.vec, cvec); + if (sim > best.sim) best = { cluster, sim }; + }); + if (best.cluster) { + best.cluster.data.members.push(mKey); + best.cluster.queue_save(); + } + }); + } else { + console.warn("No other clusters exist; members are un-clustered."); + } + + // 2) Actually remove from the collection in-memory: + this.collection.delete_item(this.key); - set member_keys(keys) { - this.data.member_keys = keys; - this.data.size = keys.length; + // If you also want to mark it `deleted` for AJSON logs: + this.deleted = true; + this.queue_save(); } -} +} \ No newline at end of file diff --git a/smart-clusters/smart_clusters.js b/smart-clusters/smart_clusters.js index 753679e7..29136dff 100644 --- a/smart-clusters/smart_clusters.js +++ b/smart-clusters/smart_clusters.js @@ -1,205 +1,61 @@ -import { SmartEntities } from "smart-entities"; -import { SmartCluster } from "./smart_cluster.js"; -import { SmartClustersDataAdapter } from "./adapters/_adapter.js"; - -export class SmartClusters extends SmartEntities { - static get defaults() { - return { - config: { - clusters_ct: 5, - max_iterations: 10, - centroid_type: 'mean', // 'mean' or 'median' - }, - }; - } - - constructor(env, opts = {}) { - super(env, opts); - this.merge_defaults(); - this.data_adapter = new SmartClustersDataAdapter(this); - } - - merge_defaults() { - 
let current_class = this.constructor; - while (current_class) { - const default_val = current_class.defaults || {}; - for (let key in default_val) { - if (typeof default_val[key] === 'object') { - this[key] = { ...default_val[key], ...this[key] }; - } else { - if (this[key] === undefined) this[key] = default_val[key]; - } - } - current_class = Object.getPrototypeOf(current_class); - } - } - - get item_type() { return SmartCluster; } - get collection_key() { return 'smart_clusters'; } - - async init() { - await super.init(); - await this.data_adapter.load_all(); - } +import { SmartGroups } from "smart-groups"; + +/** + * @class SmartClusters + * @extends SmartGroups + * @classdesc + * Manages a collection of `SmartCluster` items. Provides a `build_groups()` that calls the cluster adapter. + */ +export class SmartClusters extends SmartGroups { + // E.g., store them under "clusters" folder or "multi" if desired + get data_dir() { return 'clusters'; } /** - * Runs k-means clustering on items (e.g. from `smart_sources`) - * @param {Array} items - an array of items (sources/blocks) to cluster. Each should have .vec embedding. + * Primary method that triggers the clustering adapter to create or update clusters. 
*/ - async generate_clusters(items) { - if (!items || !items.length) { - console.log("No items to cluster."); - return; - } - - const { clusters_ct, max_iterations, centroid_type } = this.config; - - // Extract vectors and keys - const vectors = items.map(it => it.vec).filter(v => v); - const keys = items.map(it => it.key); - - if (!vectors.length) { - console.log("No vectors found, cannot cluster."); - return; - } - - // Initialize cluster centroids by picking random items - const initial_indices = []; - while (initial_indices.length < clusters_ct && initial_indices.length < vectors.length) { - const rand_idx = Math.floor(Math.random() * vectors.length); - if (!initial_indices.includes(rand_idx)) initial_indices.push(rand_idx); - } - let centroids = initial_indices.map(i => vectors[i].slice()); - - let assignments = new Array(vectors.length).fill(-1); - let changed = true; - let iteration = 0; - - while (changed && iteration < max_iterations) { - changed = false; - // Assign each vector to nearest centroid - for (let i = 0; i < vectors.length; i++) { - const vec = vectors[i]; - const nearest_c = this.nearest_centroid(vec, centroids); - if (assignments[i] !== nearest_c) { - assignments[i] = nearest_c; - changed = true; - } - } - - // Recalculate centroids - const new_centroids = []; - for (let c = 0; c < clusters_ct; c++) { - const cluster_vectors = vectors.filter((v, i) => assignments[i] === c); - if (!cluster_vectors.length) { - // If empty cluster, randomly reinitialize centroid - const rand_idx = Math.floor(Math.random() * vectors.length); - new_centroids.push(vectors[rand_idx].slice()); - continue; - } - - if (centroid_type === 'median') { - new_centroids.push(this.median_vector(cluster_vectors)); - } else { - // mean centroid - new_centroids.push(this.mean_vector(cluster_vectors)); - } - } - - centroids = new_centroids; - iteration++; - } - - // Create/Update clusters - const now = Date.now(); - // Clear old clusters? 
- this.clear(); - - for (let c = 0; c < clusters_ct; c++) { - const member_indices = assignments - .map((cl, i) => cl === c ? i : -1) - .filter(i => i >= 0); - - const member_keys = member_indices.map(i => keys[i]); - const cluster_data = { - key: `cluster_${c}_${now}`, - member_keys, - centroid_vec: centroids[c], - last_clustered_at: now, - size: member_keys.length, - config: { - centroid_type - } - }; - const cluster = new SmartCluster(this.env, cluster_data); - cluster.generate_name(); - this.set(cluster); - } - - await this.data_adapter.save_all(); - } - - mean_vector(vectors) { - const vec_length = vectors[0].length; - const sum = new Array(vec_length).fill(0); - for (const v of vectors) { - for (let i = 0; i < vec_length; i++) { - sum[i] += v[i]; - } - } - return sum.map(val => val / vectors.length); - } - - median_vector(vectors) { - const vec_length = vectors[0].length; - const median_vec = []; - for (let i = 0; i < vec_length; i++) { - const vals = vectors.map(v => v[i]).sort((a, b) => a - b); - const mid = Math.floor(vals.length / 2); - median_vec[i] = vals.length % 2 === 0 ? (vals[mid - 1] + vals[mid]) / 2 : vals[mid]; - } - return median_vec; + async build_groups() { + await this.cluster_adapter.build_groups(); + await this.process_save_queue(); } - nearest_centroid(vec, centroids) { - let nearest = -1; - let best_dist = Infinity; - for (let c = 0; c < centroids.length; c++) { - const d = this.euclidean_distance(vec, centroids[c]); - if (d < best_dist) { - best_dist = d; - nearest = c; - } - } - return nearest; - } - - euclidean_distance(a, b) { - let sum = 0; - for (let i = 0; i < a.length; i++) { - const diff = a[i] - b[i]; - sum += diff * diff; + /** + * Return the cluster adapter, lazily created from the class in `opts.group_adapter.collection` and cached; throws if no adapter class is configured. + */ + get cluster_adapter() { + if(!this._cluster_adapter) { + const adapter_class = this.opts?.group_adapter?.collection; + if(!adapter_class) throw new Error("No cluster adapter class provided.
Configure `opts.group_adapter` in SmartClusters constructor."); + this._cluster_adapter = new adapter_class(this); } - return Math.sqrt(sum); + return this._cluster_adapter; } - async process_cluster_queue() { - console.log("No cluster queue logic implemented yet."); - } - - get notices() { return this.env.smart_connections_plugin?.notices || this.env.main?.notices; } - - async process_embed_queue() { - console.log("smart_clusters: no embed queue processing implemented."); - } - - async render_clusters(container, opts = {}) { - if (container) container.innerHTML = 'Loading clusters...'; - const frag = await this.env.render_component('clusters', this, opts); - if (container) { - container.innerHTML = ''; - container.appendChild(frag); - } - return frag; + /** + * Example settings config that controls how many clusters, max iterations, etc. + */ + get settings_config() { + const base = super.settings_config || {}; + return { + ...base, + 'clusters_ct': { + name: "Number of Clusters", + type: "number", + default: 5, + description: "How many clusters to form.", + }, + 'max_iterations': { + name: "Max Iterations", + type: "number", + default: 10, + description: "Maximum number of refinement iterations." + }, + 'centroid_type': { + name: "Centroid Type", + type: "select", + options: ["mean", "median"], + default: "mean", + description: "Choose mean or median approach for computing cluster center." 
+ }, + }; } -} +} \ No newline at end of file diff --git a/smart-clusters/test/source_clusters.test.js b/smart-clusters/test/source_clusters.test.js new file mode 100644 index 00000000..059644d4 --- /dev/null +++ b/smart-clusters/test/source_clusters.test.js @@ -0,0 +1,209 @@ +import test from 'ava'; +import path from 'path'; +import fs from 'fs'; +import { execSync } from 'child_process'; + +import { SmartEnv } from 'smart-environment/smart_env.js'; +import { NodeFsSmartFsAdapter } from 'smart-fs/adapters/node_fs.js'; +import { SmartFs } from 'smart-fs/smart_fs.js'; +import { SmartSettings } from 'smart-settings/smart_settings.js'; + +import { SmartSources, SmartSource } from 'smart-sources'; +import { MarkdownSourceContentAdapter } from 'smart-sources/adapters/markdown_source.js'; +import { SmartBlocks, SmartBlock } from 'smart-blocks'; +import { MarkdownBlockContentAdapter } from 'smart-blocks/adapters/markdown_block.js'; + +import source_ajson_data_adapter from 'smart-sources/adapters/data/ajson_multi_file.js'; +import block_ajson_data_adapter from 'smart-blocks/adapters/data/ajson_multi_file.js'; +import group_ajson_data_adapter from 'smart-groups/adapters/data/ajson_multi_file.js'; + +import { SmartClusters, SmartCluster, source_cluster_adapter } from '../index.js'; +import { SmartEmbedModel } from 'smart-embed-model'; +import { SmartEmbedTransformersAdapter } from 'smart-embed-model/adapters/transformers.js'; + +/** + * Creates an environment pointing to `test/test-content` with SmartClusters. + * If that folder does not exist, we run `test_content.js` to generate it. 
+ */ +async function create_integration_env() { + const testContentDir = path.join(process.cwd(), 'test', 'test-content'); + + // If the folder doesn't exist, create it by running test_content.js + if (!fs.existsSync(testContentDir)) { + if(fs.existsSync('test/test_content.js')) { + execSync('node test/test_content.js'); + await new Promise(resolve => setTimeout(resolve, 1000)); + } else { + throw new Error( + `Missing test_content.js script. ` + + `Please provide one in test/test-content or run your existing script to create test data.` + ); + } + } + + // Now create a fresh environment: + const env = await SmartEnv.create( + { + load_settings: () => ({}), + save_settings: () => {}, + get settings() { return {}; }, + }, + { + env_path: testContentDir, + modules: { + // minimal or real FS + smart_fs: { class: SmartFs, adapter: NodeFsSmartFsAdapter }, + // basic settings + smart_settings: { class: SmartSettings }, + // optional embed model + smart_embed_model: { + class: SmartEmbedModel, + adapters: { + transformers: SmartEmbedTransformersAdapter, + }, + }, + }, + collections: { + // The main source collection + smart_sources: { + class: SmartSources, + data_adapter: source_ajson_data_adapter, + source_adapters: { + md: MarkdownSourceContentAdapter, + } + }, + // blocks + smart_blocks: { + class: SmartBlocks, + data_adapter: block_ajson_data_adapter, + block_adapters: { + md: MarkdownBlockContentAdapter, + } + }, + // clusters + smart_clusters: { + class: SmartClusters, + data_adapter: group_ajson_data_adapter, + group_adapter: source_cluster_adapter + }, + }, + item_types: { + SmartSource, + SmartBlock, + SmartCluster, + }, + default_settings: { + smart_sources: { + data_dir: 'multi', + embed_model: { + adapter: 'transformers', + // Provide a placeholder or a smaller HF model if desired + transformers: { + legacy_transformers: false, + model_key: 'TaylorAI/bge-micro-v2', + gpu_batch_size: 2 + }, + }, + min_chars: 10, + }, + // cluster settings + smart_clusters: { + 
data_dir: 'clusters' + } + } + } + ); + + return env; +} + +test.before(async (t) => { + // 1. Create environment + t.context.env = await create_integration_env(); + + // 2. Initialize items in sources + await t.context.env.smart_sources.init_items(); + await t.context.env.smart_sources.process_load_queue(); + + // 3. Import actual markdown from disk => parse blocks => embed + await t.context.env.smart_sources.process_source_import_queue(); + + // 4. Save any newly created items + await t.context.env.smart_sources.process_save_queue(); +}); + +test.after(async (t) => { + // Cleanup if you wish, or keep the data for debugging + fs.rmdirSync(path.join(process.cwd(), 'test', 'test-content'), { recursive: true }); +}); + +test.serial("Integration: Sources loaded with embeddings, able to cluster", async (t) => { + const { env } = t.context; + const sources = env.smart_sources; + const clusters = env.smart_clusters; + + // confirm we have some sources in memory + const allSources = Object.values(sources.items); + t.true(allSources.length > 0, 'Should have multiple sources from test-content'); + + // In a real scenario, they'd each have .vec after embedding + const sourcesWithVec = allSources.filter((src) => src.vec); + t.true(sourcesWithVec.length > 0, 'Some sources have .vec (embedding).'); + + // Now cluster them + clusters.settings.clusters_ct = 3; // e.g., 3 clusters + clusters.settings.max_iterations = 5; + await clusters.build_groups(); + + const clusterItems = Object.values(clusters.items); + t.true(clusterItems.length > 0, 'Should produce some cluster items'); + + // Check membership + let totalAssigned = 0; + for (const cluster of clusterItems) { + const { members, number_of_members } = cluster.data; + totalAssigned += members?.length ?? 
0; + t.is(members?.length, number_of_members, 'members array length matches number_of_members field'); + } + t.is(totalAssigned, sourcesWithVec.length, 'All vectorized sources assigned to some cluster'); + + t.pass('Successfully built clusters from embedded sources.'); +}); + +test.serial("Integration: Deleting a cluster reassigns its members to remaining clusters", async (t) => { + const { env } = t.context; + const clusters = env.smart_clusters; + + // pick the first cluster + const existingClusters = Object.values(clusters.items); + t.true(existingClusters.length >= 2, 'Need at least 2 clusters for reassignment test'); + const clusterToDelete = existingClusters[0]; + const membersBefore = clusterToDelete.data.members || []; + + // delete it + await clusterToDelete.delete(); + + // verify it's removed from the collection + t.falsy(clusters.get(clusterToDelete.key), 'Deleted cluster item no longer in collection'); + + // verify its members are found in some other cluster's .members + const remainingClusters = Object.values(clusters.items); + for (const mKey of membersBefore) { + const wasReassigned = remainingClusters.some((c) => c.data.members?.includes(mKey)); + t.true(wasReassigned, `Member ${mKey} got reassigned to another cluster`); + } +}); + +test.serial("Integration: Each cluster's center_source is among its .members", async (t) => { + const { env } = t.context; + const clusters = env.smart_clusters; + + for (const c of Object.values(clusters.items)) { + t.truthy(c.data.center_source_key, 'Should define a center_source_key'); + t.true( + c.data.members.includes(c.data.center_source_key), + 'The center source must be in the cluster members array.' 
+ ); + } + t.pass("Center source is indeed part of that cluster's membership."); +}); diff --git a/smart-clusters/test/test_content.js b/smart-clusters/test/test_content.js new file mode 100644 index 00000000..341b3cc1 --- /dev/null +++ b/smart-clusters/test/test_content.js @@ -0,0 +1,133 @@ +#!/usr/bin/env node + +/** + * @file test_content.js + * @description + * Node script to create a handful of Markdown files in `smart-clusters/test/test-content/` for integration testing. + * The purpose of each file in cluster-related tests is described in comments within this script. + * + * Usage: + * 1. `cd` into the `smart-clusters/` package directory (output is written relative to the current working directory). + * 2. Run: `node test/test_content.js` + * + * This script will: + * - Ensure the `test/test-content/` directory exists under the current working directory. + * - Write multiple markdown files with test content. + * - Each file is intended to produce different similarity/dissimilarity outcomes. + */ + +import fs from 'fs'; +import path from 'path'; + +// Directory in which we create these test .md files +const baseDir = path.join(process.cwd(), 'test/test-content'); + +// Ensure the directory exists +if (!fs.existsSync(baseDir)) { + fs.mkdirSync(baseDir, { recursive: true }); +} + +// Files to create: +// 1. sample_1.md +// Purpose: A short, simple markdown file with minimal text. Should embed as a small vector. +// Expectation: Clusters that rely on minimal content might group it with other short minimal files. +// +// 2. similar_a.md +// Purpose: Contains text similar to similar_b.md. We expect these two files to appear in the same cluster. +// Expectation: High cosine similarity between this and similar_b.md. +// +// 3. similar_b.md +// Purpose: Contains text similar to similar_a.md. Also expected to cluster together with similar_a.md. +// Expectation: High cosine similarity with similar_a.md. +// +// 4.
unique_a.md +// Purpose: Has content that differs significantly from other files, to observe how it may form a singleton cluster. +// Expectation: Low similarity to other items, possibly ends up alone. +// +// 5. random_1.md +// Purpose: Demonstrates random text that doesn't match others strongly. +// Expectation: Possibly forms a cluster with random_2 if content overlaps, or sits alone. +// +// 6. random_2.md +// Purpose: Another random text file to test random or partial similarity to random_1.md. +// Expectation: Might cluster with random_1 if there's enough textual overlap, or remain somewhat separate. +// +// Additional files can be added here as needed. + +const filesData = [ + { + name: 'sample_1.md', + content: `# Sample 1 + +This is a very short source file used for testing Smart Clusters. + +It doesn't share content with other test files, but is short enough that any embedding might be minimal. + +- Key testing aspect: minimal content, checks how clustering deals with near-empty or low-token items. +` + }, + { + name: 'similar_a.md', + content: `# Similar A + +Hello world. This text is used to check similarity with Similar B. + +It contains phrases like "cluster analysis", "cosine similarity", and "embedding vectors". +We hope that the repeated references to embedding topics will align it with Similar B's content. + +- Key testing aspect: high textual overlap with "similar_b.md". +` + }, + { + name: 'similar_b.md', + content: `# Similar B + +Hello world. This text is used to check similarity with Similar A. + +It also mentions "embedding vectors", "cluster analysis", and "cosine similarity". +In principle, it should appear in the same cluster as Similar A due to repeated keywords. + +- Key testing aspect: high textual overlap with "similar_a.md". +` + }, + { + name: 'unique_a.md', + content: `# Unique A + +Quantum entanglement defies local realism and has profound implications for the foundation of quantum mechanics. 
+ +It doesn't reference any typical "embedding" or "cosine" verbiage. So it should remain quite distinct. + +- Key testing aspect: drastically different content than the rest. +` + }, + { + name: 'random_1.md', + content: `# Random 1 + +Jlore ipwxa kdla zptfy ucbls oflo mfrow. Dqnjnx clfri zpil xioqd. +Random text with no direct connection to the other test files. + +- Key testing aspect: mostly nonsense; if random_2 shares partial nonsense, they might cluster. +` + }, + { + name: 'random_2.md', + content: `# Random 2 + +Udmxa jzpsw okalm dxnwy gfrty zptfy srandom xioqd zgy shq. +Some partial overlap with random_1, but still mostly gibberish. + +- Key testing aspect: possibly a partial textual overlap with random_1. +` + } +]; + +// Write each file to the test-content folder +filesData.forEach(fileObj => { + const filePath = path.join(baseDir, fileObj.name); + fs.writeFileSync(filePath, fileObj.content, 'utf8'); + console.log(`Created: ${fileObj.name}`); +}); + +console.log(`\nTest markdown files have been created in: ${baseDir}\n`); \ No newline at end of file diff --git a/smart-clusters/utils/shuffle_array.js b/smart-clusters/utils/shuffle_array.js new file mode 100644 index 00000000..cf6750b0 --- /dev/null +++ b/smart-clusters/utils/shuffle_array.js @@ -0,0 +1,20 @@ +/** + * @file shuffle_array.js + * @description Provides a function to shuffle an array in-place using the Fisher-Yates (Knuth) algorithm. + */ + +/** + * Shuffles an array in-place using the Fisher-Yates (Knuth) algorithm. + * + * @param {any[]} arr - The array to shuffle. + * @returns {any[]} The same array, shuffled in-place. 
+ */ +export function shuffle_array(arr) { + for (let i = arr.length - 1; i > 0; i--) { + const swap_index = Math.floor(Math.random() * (i + 1)); + const temp = arr[i]; + arr[i] = arr[swap_index]; + arr[swap_index] = temp; + } + return arr; +} \ No newline at end of file diff --git a/smart-collections/adapters/ajson_multi_file.js b/smart-collections/adapters/ajson_multi_file.js index f327cc6a..a2dd1f73 100644 --- a/smart-collections/adapters/ajson_multi_file.js +++ b/smart-collections/adapters/ajson_multi_file.js @@ -194,7 +194,7 @@ export class AjsonMultiFileItemDataAdapter extends FileItemDataAdapter { else await this.fs.remove(this.data_path); } } catch (e) { - console.warn("Error loading item (queueing import)", this.item.key, this.data_path, e); + // console.warn("Error loading item (queueing import)", this.item.key, this.data_path, e); this.item.queue_import(); } } diff --git a/smart-embed-model/package.json b/smart-embed-model/package.json index 790d5bbb..576cf04b 100644 --- a/smart-embed-model/package.json +++ b/smart-embed-model/package.json @@ -26,6 +26,7 @@ }, "homepage": "https://jsbrains.org", "devDependencies": { + "@huggingface/transformers": "^3.2.1", "ava": "^6.0.1", "dotenv": "^16.3.1", "esbuild": "^0.23.1", @@ -33,7 +34,7 @@ }, "dependencies": { "js-tiktoken": "^1.0.11", - "smart-model": "../smart-model", - "smart-http-request": "../smart-http-request" + "smart-http-request": "../smart-http-request", + "smart-model": "../smart-model" } }