-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Introduced the SmartClusters module, including core classes for ma…
…naging clusters and their members. - Added a new `source.js` adapter implementing a k-centers clustering approach, enhancing clustering capabilities. - Refactored `SmartCluster` and `SmartClusters` classes to improve member management and clustering logic. - Enhanced the rendering of clusters in the UI with updated HTML structure and improved member display. - Added integration tests to validate clustering functionality and member reassignment upon deletion. - Created a test content generation script to facilitate integration testing with various markdown files.
- Loading branch information
Brian Joseph Petro
committed
Dec 22, 2024
1 parent
424f069
commit bc1a793
Showing
14 changed files
with
846 additions
and
329 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
node_modules |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,55 +1,47 @@ | ||
/** | ||
* @class SmartClustersDataAdapter | ||
* @classdesc Provides load/save operations for SmartCluster entities. | ||
* This is a placeholder to demonstrate how to integrate data persistence. | ||
* In practice, implement reading/writing to file or DB here. | ||
* @file _adapter.js | ||
* @description Base adapter classes for building clusters from sources (or other items). | ||
*/ | ||
export class SmartClustersDataAdapter { | ||
constructor(collection) { | ||
this.collection = collection; | ||
} | ||
|
||
get env() { return this.collection.env; } | ||
|
||
/** | ||
* @class ClusterCollectionAdapter | ||
* @classdesc | ||
* Interface for a collection-level adapter that can build clusters from environment items. | ||
*/ | ||
export class ClusterCollectionAdapter { | ||
/** | ||
* Loads all cluster data from a persistent store (JSON file, DB, etc.). | ||
* In this placeholder, we assume data is stored in memory (or a single JSON file). | ||
* @returns {Promise<void>} | ||
* @constructor | ||
* @param {Object} collection - The cluster collection instance. | ||
*/ | ||
async load_all() { | ||
// TODO: Load from `smart_env.json` or a cluster-specific file if desired. | ||
// For now, assume empty or existing data in this.env. | ||
const stored_data = this.env._clusters_data || []; | ||
for (const cluster_data of stored_data) { | ||
const cluster = new this.collection.item_type(this.env, cluster_data); | ||
this.collection.set(cluster); | ||
} | ||
constructor(collection) { | ||
this.collection = collection; | ||
} | ||
|
||
/** | ||
* Saves all clusters to a persistent store. | ||
* @returns {Promise<void>} | ||
* @async | ||
* Build clusters. Throws by default; subclasses must override. | |
*/ | ||
async save_all() { | ||
// Gather all clusters and store them: | ||
const clusters_data = Object.values(this.collection.items).map(item => item.data); | ||
// Placeholder: store in memory. In production, write to a file or DB. | ||
this.env._clusters_data = clusters_data; | ||
async build_groups() { | ||
throw new Error("Not implemented. Override in subclass."); | ||
} | ||
} | ||
|
||
/** | ||
* @class ClusterItemAdapter | ||
* @classdesc | ||
* Interface for item-level logic if needed for cluster items. | ||
*/ | ||
export class ClusterItemAdapter { | ||
/** | ||
* Save a single cluster. | ||
* @param {SmartCluster} cluster | ||
* @returns {Promise<void>} | ||
* @constructor | ||
* @param {Object} item - The cluster item instance. | ||
*/ | ||
async save(cluster) { | ||
// Update single cluster data in memory. | ||
const idx = (this.env._clusters_data || []).findIndex(c => c.key === cluster.key); | ||
if (idx === -1) { | ||
this.env._clusters_data = this.env._clusters_data || []; | ||
this.env._clusters_data.push(cluster.data); | ||
} else { | ||
this.env._clusters_data[idx] = cluster.data; | ||
} | ||
constructor(item) { | ||
this.item = item; | ||
} | ||
} | ||
|
||
export default { | ||
collection: ClusterCollectionAdapter, | ||
item: ClusterItemAdapter | ||
}; |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,268 @@ | ||
/** | ||
* @file source.js (Alternative “k-centers” style) | ||
* @description | ||
* An alternative clustering adapter that exclusively uses a “k-centers” approach, | ||
* aiming to minimize the maximum distance (or equivalently, maximize the minimum similarity). | ||
*/ | ||
|
||
import { ClusterCollectionAdapter, ClusterItemAdapter } from "./_adapter.js"; | ||
import { cos_sim } from "smart-entities/cos_sim.js"; // or from your local cos_sim | ||
import { shuffle_array } from "../utils/shuffle_array.js"; | ||
|
||
/**
 * @class SourceClustersAdapter
 * @extends ClusterCollectionAdapter
 * @description
 * Builds clusters from the entries of `env.smart_sources` that carry a `.vec`,
 * using a k-centers style approach. A cluster's "center" is always one of its
 * own members: the member whose worst-case cosine similarity to the other
 * members is highest (see `_find_nearest_member`).
 */
export class SourceClustersAdapter extends ClusterCollectionAdapter {

  /**
   * Primary entrypoint: build the clusters from `env.smart_sources`.
   *
   * Settings read from `this.collection.settings` (both optional):
   *  - `clusters_ct`    number of clusters to create (default 5)
   *  - `max_iterations` upper bound on refinement passes (default 10)
   *
   * Existing clusters are removed before new ones are created. When no source
   * has a vector, a warning is logged and nothing is changed.
   *
   * @async
   * @returns {Promise<void>}
   */
  async build_groups() {
    console.log("build_groups");
    // 1. Grab user config
    const {
      clusters_ct = 5,
      max_iterations = 10,
    } = this.collection.settings ?? {};

    // 2. Keep only sources that have a vector
    const sources = this.collection.env.smart_sources.filter(s => s?.vec);

    if (sources.length === 0) {
      console.warn("No sources with vectors found; skipping cluster build.");
      return;
    }

    // 3. Clear existing clusters
    this._clear_existing_clusters();

    // 4. Pick initial centers: first one from a shuffle, then repeatedly the
    //    source that is least similar to every center chosen so far
    const centers = this._choose_initial_k_centers(sources, clusters_ct);

    // 5. Create one cluster item per center
    const clusterItems = await Promise.all(centers.map((centerSource, i) =>
      this.collection.create_or_update({
        key: centerSource.key,
        center_source_key: centerSource.key,
        name: `Cluster #${i + 1}`,
        members: [],
        number_of_members: 0,
        clustering_timestamp: Date.now(),
      })
    ));

    // 6. Refine clusters for up to max_iterations passes
    for (let iter = 0; iter < max_iterations; iter++) {
      let changed = false;

      // Center vectors do not change during the assignment step, so resolve
      // each one once per pass instead of once per (source, cluster) pair.
      const centerVecs = new Map(
        clusterItems.map(ci => [ci, this._get_center_vec(ci)])
      );

      // 6a. Assign every source to the center with the highest cos_sim.
      //     Scratch membership: cluster item => array of source keys.
      const membership = new Map(clusterItems.map(ci => [ci, []]));

      for (const src of sources) {
        let bestCluster = null;
        let bestSim = -Infinity;

        for (const ci of clusterItems) {
          const centerVec = centerVecs.get(ci);
          if (!centerVec) continue; // center has no vector: cannot attract members
          const sim = cos_sim(src.vec, centerVec);
          if (sim > bestSim) {
            bestSim = sim;
            bestCluster = ci;
          }
        }

        if (bestCluster) {
          membership.get(bestCluster).push(src.key);
        }
      }

      // 6b. For each cluster, store the membership and re-pick the center as
      //     the member that maximizes its minimum similarity to the others.
      for (const ci of clusterItems) {
        const newMembers = membership.get(ci) ?? [];
        ci.data.members = newMembers;
        if (newMembers.length === 0) continue;

        const newCenterKey = this._find_nearest_member(ci, newMembers);
        if (newCenterKey && newCenterKey !== ci.data.center_source_key) {
          // NOTE(review): the cluster key follows the center key. Whether the
          // collection still holds an entry under the previous key after this
          // reassignment depends on `collection.set` / `create_or_update`,
          // which are not visible here; confirm no stale entries remain.
          ci.data.key = newCenterKey;
          ci.data.center_source_key = newCenterKey;
          changed = true;
        }
      }

      if (!changed) {
        // no cluster center changed => stable
        break;
      }
    }

    console.log("clusterItems", clusterItems.map(ci => ci.key));
    // 7. Finalize cluster data and queue each cluster for saving
    clusterItems.forEach(ci => {
      ci.data.number_of_members = ci.data.members?.length ?? 0;
      ci.data.clustering_timestamp = Date.now();
      this.collection.set(ci);
      ci.queue_save();
    });
    console.log(Object.values(this.collection.items).length);
  }

  /**
   * Private helper: choose up to `k` initial centers (farthest-first):
   *  - the first center is element 0 of a shuffled copy of `sources`
   *  - each following center is the source whose highest similarity to any
   *    already-picked center is lowest (i.e. the one furthest from all of them)
   *
   * @param {Object[]} sources - Sources, each with a `.vec`.
   * @param {number} k - Number of centers wanted.
   * @returns {Object[]} The chosen center sources. Empty when `k <= 0`; the
   *   first `k` sources (i.e. all of them) when `k >= sources.length`.
   */
  _choose_initial_k_centers(sources, k) {
    // A non-positive count means "no clusters"; previously this still
    // returned one center.
    if (k <= 0) return [];
    if (k >= sources.length) return sources.slice(0, k);

    const shuffled = shuffle_array([...sources]);
    const pickedCenters = [shuffled[0]];
    const pickedSet = new Set(pickedCenters); // O(1) "already picked?" checks

    while (pickedCenters.length < k) {
      let bestCandidate = null;
      let lowestNearestSim = Infinity;

      for (const s of sources) {
        if (pickedSet.has(s)) continue;

        // Similarity to the closest already-picked center
        let nearestSim = -Infinity;
        for (const c of pickedCenters) {
          const sim = cos_sim(s.vec, c.vec);
          if (sim > nearestSim) {
            nearestSim = sim;
          }
        }

        // Maximizing distance == minimizing that similarity. Strict `<` keeps
        // the first source encountered on ties.
        if (bestCandidate === null || nearestSim < lowestNearestSim) {
          bestCandidate = s;
          lowestNearestSim = nearestSim;
        }
      }

      if (!bestCandidate) {
        // every source is already a center
        break;
      }
      pickedCenters.push(bestCandidate);
      pickedSet.add(bestCandidate);
    }

    return pickedCenters;
  }

  /**
   * Private helper: clear existing clusters by calling `delete_item` for every
   * key currently in `collection.items`.
   */
  _clear_existing_clusters() {
    // Snapshot the keys first so deletion cannot disturb the iteration.
    for (const key of Object.keys(this.collection.items)) {
      this.collection.delete_item(key);
    }
  }

  /**
   * Private helper: return the cluster's current center vector.
   *
   * @param {SmartCluster} cluster
   * @returns {number[] | null} The `.vec` of the source referenced by
   *   `cluster.data.center_source_key`, or null when it is missing.
   */
  _get_center_vec(cluster) {
    const centerSource = this.collection.env.smart_sources.get(cluster.data.center_source_key);
    return centerSource?.vec || null;
  }

  /**
   * Private helper: among `memberKeys`, pick the key whose source has the
   * largest minimum cos_sim (smallest maximum distance) to the other members.
   * Members that cannot be resolved or have no `.vec` are ignored.
   *
   * @param {SmartCluster} cluster
   * @param {string[]} memberKeys
   * @returns {string|null} Chosen center source key. Falls back to the
   *   cluster's current `center_source_key` (or null) when no member has a
   *   vector.
   */
  _find_nearest_member(cluster, memberKeys) {
    // if only 1 member, that must be the center
    if (memberKeys.length === 1) return memberKeys[0];

    let bestKey = cluster.data.center_source_key ?? null;
    let bestScore = -Infinity;

    // convert keys to source objects
    const sources = memberKeys
      .map(k => this.collection.env.smart_sources.get(k))
      .filter(s => s?.vec);

    for (const candidate of sources) {
      // worst-case (lowest) similarity from this candidate to any other member
      let worstSim = Infinity;
      for (const other of sources) {
        if (other.key === candidate.key) continue;
        const sim = cos_sim(candidate.vec, other.vec);
        if (sim < worstSim) {
          worstSim = sim;
        }
      }
      // keep the candidate with the best worst-case; first one wins ties
      if (worstSim > bestScore) {
        bestScore = worstSim;
        bestKey = candidate.key;
      }
    }
    return bestKey;
  }
}
|
||
/**
 * @class SourceClusterAdapter
 * @extends ClusterItemAdapter
 * @description
 * Item-level adapter for source-based clusters. It adds nothing to
 * `ClusterItemAdapter`; override here if per-cluster item logic is needed.
 * Typically `SmartCluster` is relied on for "delete => reassign" etc.
 */
export class SourceClusterAdapter extends ClusterItemAdapter {
  // no additional logic needed for a minimal example
}
|
||
// Default export: the collection-level and item-level adapters of this module.
const source_cluster_adapters = {
  collection: SourceClustersAdapter,
  item: SourceClusterAdapter,
};

export default source_cluster_adapters;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,21 @@ | ||
export async function render(scope, opts = {}) { | ||
const html = `<div class="sg-clusters-view"> | ||
<h2>Clusters</h2> | ||
<p>View and manage your clusters below:</p> | ||
<div class="sg-cluster-list"></div> | ||
</div>`; | ||
const html = ` | ||
<div class="sc-clusters-view"> | ||
<h2>Smart Clusters</h2> | ||
<p>These are your automatically generated clusters based on source vectors.</p> | ||
<ul class="sc-cluster-list"></ul> | ||
</div> | ||
`; | ||
const frag = this.create_doc_fragment(html); | ||
|
||
const list = frag.querySelector('.sg-cluster-list'); | ||
// Render each cluster | ||
for (const cluster of Object.values(scope.items)) { | ||
const div = document.createElement('div'); | ||
div.className = 'sg-cluster-item'; | ||
div.innerHTML = `<h3>${cluster.data.name || cluster.key}</h3> | ||
<p>Members: ${cluster.data.member_keys.length}</p>`; | ||
list.appendChild(div); | ||
} | ||
|
||
const ul = frag.querySelector('.sc-cluster-list'); | ||
Object.values(scope.items).forEach(cluster => { | ||
const li = document.createElement('li'); | ||
li.innerHTML = ` | ||
<strong>${cluster.name}</strong> | ||
<small>(Members: ${cluster.data.members?.length ?? 0})</small> | ||
`; | ||
ul.appendChild(li); | ||
}); | ||
return frag; | ||
} | ||
} |
Oops, something went wrong.