Skip to content

Commit

Permalink
- Removed obsolete getters from SmartCluster to streamline the class.
Browse files Browse the repository at this point in the history
- Enhanced SourceClustersAdapter to include orphaned cluster logic, allowing for better handling of sources that do not fit into any cluster.
- Introduced new settings for minimum similarity threshold and refined cluster assignment process.
- Improved iteration logic to track cluster membership and similarity metrics, ensuring more accurate clustering results.
- Updated logging for better clarity on cluster assignments and processing steps.
  • Loading branch information
Brian Joseph Petro committed Dec 22, 2024
1 parent 3dee534 commit f242a68
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 79 deletions.
193 changes: 134 additions & 59 deletions smart-clusters/adapters/source.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,102 +18,194 @@ import { shuffle_array } from "../utils/shuffle_array.js";
*/
export class SourceClustersAdapter extends ClusterCollectionAdapter {
/**
* Build clusters of sources. The user’s cluster settings are read from `this.collection.settings`.
* Build clusters of sources, with orphaned cluster logic:
* 1) Setup config
* 2) Collect vectorized sources
* 3) Clear existing clusters
* 4) Pick initial centers
* 5) Iterative refinement (k iterations)
* - track cluster minSim, derive next iteration threshold
* 6) Orphan assignment
*/
async build_groups() {
// 1) Config from settings
const {
max_iterations = 10,
max_cluster_size_percent = 0.3,
clusters_ct = 5,
// new settings:
min_similarity_threshold_mode = 'lowest',
orphan_cluster_key = 'orphaned',
} = this.collection.settings ?? {};
// 2) Collect all vectorized sources

const sources = this.collection.env.smart_sources.filter(s => s?.vec);
let clusters_ct = 5;
if(!this.collection.settings?.clusters_ct) {
clusters_ct = Math.max(clusters_ct, Math.floor(sources.length/100));
}
console.log(`[SourceClustersAdapter] Building ${clusters_ct} clusters from ${sources.length} sources.`);
if (!sources.length) {
console.warn("No vectorized sources found; skipping cluster build.");
console.warn("No vectorized sources found; skipping cluster build for SourceClustersAdapter.");
return;
}
const max_cluster_size = Math.max(
1,
Math.floor(max_cluster_size_percent * sources.length)
);
const max_cluster_size = Math.max(1, Math.floor(max_cluster_size_percent * sources.length));

// 3) Remove existing clusters from memory
// 1) Remove existing clusters
this._clear_existing_clusters();

// 4) Pick initial cluster centers
// 2) Build initial cluster items
const centers = this._choose_initial_k_centers(sources, clusters_ct);

// 5) Create cluster items with stable IDs: "cluster_1", "cluster_2", etc.
const clusterItems = centers.map((src, i) => {
const stableKey = `cluster_${i + 1}`;
const item = this.collection.create_or_update({
key: stableKey, // stable ID
key: stableKey,
center_source_key: src.key,
members: [],
number_of_members: 0,
iteration_min_sim: null, // track the cluster's min similarity
clustering_timestamp: Date.now(),
});
return item;
});

// 6) Iterate & refine
// 2b) Create the orphan cluster item
let orphanCluster = this.collection.get(orphan_cluster_key);
if (!orphanCluster) {
orphanCluster = this.collection.create_or_update({
key: orphan_cluster_key, // stable ID
center_source_key: null,
members: [],
number_of_members: 0,
iteration_min_sim: null,
clustering_timestamp: Date.now(),
});
}

// 3) Iterations with tracking minSim
let changed = true;
let globalThreshold = 0; // we update each iteration if using 'lowest' or 'median' logic
for (let iter = 0; iter < max_iterations && changed; iter++) {
changed = false;

// Build new membership arrays keyed by stable cluster ID
const membershipMap = Object.fromEntries(
clusterItems.map(ci => [ci.key, []])
);
// Build membership buckets
const membershipMap = Object.fromEntries(clusterItems.map(ci => [ci.key, []]));

// Assign each source
// For each source, pick cluster with highest sim, subject to max_cluster_size
// We'll keep track of the bestSim to each cluster center
for (const src of sources) {
const { bestClusterKey } = this._pick_best_cluster(
const { bestClusterKey, bestSim } = this._pick_best_cluster(
src, clusterItems, membershipMap, max_cluster_size
);

membershipMap[bestClusterKey].push(src.key);
}

// Recompute center
// Recompute center + track iterationMinSim
let anyCenterChanged = false;
for (const ci of clusterItems) {
const newMembers = membershipMap[ci.key];
ci.data.members = newMembers;
if (!newMembers.length) continue;
if (!newMembers.length) {
ci.data.members = [];
ci.data.iteration_min_sim = null;
continue;
}

// find best center among newMembers
const newCenterKey = this._find_nearest_member(newMembers);
if (newCenterKey && newCenterKey !== ci.data.center_source_key) {
ci.data.center_source_key = newCenterKey;
changed = true;
anyCenterChanged = true;
}
// also compute iteration_min_sim for this cluster
const cvec = this._get_center_vec(ci);
if (cvec) {
let clusterMinSim = 1.0;
for (const mk of newMembers) {
const s = this.collection.env.smart_sources.get(mk);
if (!s?.vec) continue;
const sim = cos_sim(s.vec, cvec);
if (sim < clusterMinSim) clusterMinSim = sim;
}
ci.data.iteration_min_sim = clusterMinSim;
}
// finalize members
ci.data.members = newMembers;
}

// pick global threshold from cluster min-sims (lowest, median, average, etc.)
globalThreshold = this._pick_global_threshold(clusterItems, min_similarity_threshold_mode);

if (anyCenterChanged) changed = true;
}

// 7) Finalize membership
// 4) Orphan assignment pass
// If a source is below the globalThreshold for all clusters, it goes to orphan cluster
orphanCluster.data.members = []; // reset
for (const ci of clusterItems) {
// filter out items that don't meet threshold
const keep = [];
const orphaned = [];
const cvec = this._get_center_vec(ci);
for (const mk of ci.data.members) {
const s = this.collection.env.smart_sources.get(mk);
if (!s?.vec || !cvec) {
orphaned.push(mk);
continue;
}
const sim = cos_sim(s.vec, cvec);
if (sim < globalThreshold) {
// This item is an orphan
orphaned.push(mk);
} else {
keep.push(mk);
}
}
ci.data.members = keep;
orphanCluster.data.members.push(...orphaned);
}

// finalize
for (const ci of clusterItems) {
ci.data.number_of_members = ci.data.members.length;
ci.data.clustering_timestamp = Date.now();
ci.queue_save();
}
orphanCluster.data.number_of_members = orphanCluster.data.members.length;
orphanCluster.data.clustering_timestamp = Date.now();
orphanCluster.data.center_source_key = orphanCluster.data.members[0];
orphanCluster.queue_save();

// Debug check: total assigned must match
const totalAssigned = clusterItems.reduce((sum, ci) => sum + ci.data.members.length, 0);
console.log(
`[SourceClustersAdapter] Assigned ${totalAssigned} sources among ${clusterItems.length} clusters. ` +
`We started with ${sources.length} vectorized sources.`
`[SourceClustersAdapter] assigned ${sources.length} sources among ${clusterItems.length} clusters + 1 orphan cluster.`
);
}

/**
* For each source, pick the cluster that has the highest cos_sim with that cluster’s center
* as long as it’s not “full” (under max_cluster_size).
* If all are full, pick whichever cluster has the fewest members.
* Picks a global threshold from cluster minSim.
* If min_similarity_threshold_mode = 'lowest', pick the max of [all cluster iteration_min_sim].
* If 'median', pick the median. Adjust as you see fit.
*/
_pick_global_threshold(clusterItems, mode) {
const minSims = clusterItems
.map(ci => ci.data.iteration_min_sim)
.filter(v => typeof v === 'number');

if (!minSims.length) return 0;

switch (mode) {
case 'lowest': {
// largest among the cluster-minSims
// Example: if cluster1 minSim=0.6, cluster2=0.4, cluster3=0.55 => pick 0.6
// meaning items must be >= 0.6 to remain in that cluster
return Math.max(...minSims);
}
case 'median': {
// median approach
const sorted = [...minSims].sort((a,b)=>a-b);
const mid = Math.floor(sorted.length/2);
if (sorted.length %2) return sorted[mid];
return (sorted[mid-1] + sorted[mid])/2;
}
default:
// fallback
return 0;
}
}

_pick_best_cluster(src, clusterItems, membershipMap, maxSize) {
let bestClusterKey = null;
let bestSim = -Infinity;
Expand All @@ -126,51 +218,40 @@ export class SourceClustersAdapter extends ClusterCollectionAdapter {
if (!centerVec) continue;

const sim = cos_sim(src.vec, centerVec);

const currCount = membershipMap[ci.key].length;

if (currCount < maxSize) {
// normal assignment
if (sim > bestSim) {
bestSim = sim;
bestClusterKey = ci.key;
}
}
// track the cluster with smallest membership for fallback
// fallback cluster is whichever has smallest membership, if all are full
if (currCount < fallbackCount) {
fallbackCount = currCount;
fallbackKey = ci.key;
}
}

// If bestClusterKey is still null, fallback
if (!bestClusterKey && fallbackKey) {
bestClusterKey = fallbackKey;
}

if (!bestClusterKey) {
// Should not happen if we have at least one cluster
console.warn(`No cluster assigned for source ${src.key}?`);
bestClusterKey = clusterItems[0].key;
}
return { bestClusterKey };
return { bestClusterKey, bestSim };
}

// Remove them from memory so they won't linger
_clear_existing_clusters() {
for (const key of Object.keys(this.collection.items)) {
this.collection.delete_item(key);
}
}

// k-center “plus plus” approach: pick the first at random, then pick each subsequent
// with minimal similarity to existing picks
_choose_initial_k_centers(sources, k) {
if (k >= sources.length) return sources.slice(0, k);

const picked = [];
const shuffled = shuffle_array([...sources]);
picked.push(shuffled[0]);

while (picked.length < k) {
let bestCandidate = null;
let bestDist = Infinity;
Expand All @@ -181,7 +262,7 @@ export class SourceClustersAdapter extends ClusterCollectionAdapter {
const sim = cos_sim(s.vec, c.vec);
if (sim > nearestSim) nearestSim = sim;
}
// pick the candidate with the lowest bestSim” (i.e. far from all chosen)
// choose candidate with lowest bestSim => far from existing centers
if (nearestSim < bestDist) {
bestDist = nearestSim;
bestCandidate = s;
Expand All @@ -193,19 +274,13 @@ export class SourceClustersAdapter extends ClusterCollectionAdapter {
return picked;
}

// Return the center’s vector from the cluster’s stored center_source_key
_get_center_vec(ci) {
const centerSrc = this.collection.env.smart_sources.get(ci.data.center_source_key);
return centerSrc?.vec || null;
}

/**
* Among the cluster’s newMembers, pick the item with largest “worstSim” to all others.
* This ensures the chosen center is the “most central” in a k-center sense.
*/
_find_nearest_member(memberKeys) {
if (memberKeys.length === 1) return memberKeys[0];

const sources = memberKeys
.map(k => this.collection.env.smart_sources.get(k))
.filter(s => s?.vec);
Expand All @@ -229,7 +304,7 @@ export class SourceClustersAdapter extends ClusterCollectionAdapter {
}

/**
 * Item-level adapter for source clusters.
 * Declares no members of its own; everything is inherited from ClusterItemAdapter.
 */
export class SourceClusterAdapter extends ClusterItemAdapter {
// no changes needed for item-level
}

export default {
Expand Down
20 changes: 0 additions & 20 deletions smart-clusters/smart_cluster.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@ export class SmartCluster extends SmartGroup {
};
}

get key() {
return this.center_source.key;
}

/**
* cluster.center_vec is a getter returning cluster.center_source.vec
* @returns {number[]|null}
Expand All @@ -37,22 +33,6 @@ export class SmartCluster extends SmartGroup {
return this.env.smart_sources.get(this.data.center_source_key);
}

/**
* Dynamically generate a cluster name from top members or use data.name if present.
* Example: "Cluster: (Note1, Note2, ...)"
*/
get name() {
if(this.data.name) return this.data.name;
const membersList = (this.data.members || [])
.slice(0, 3)
.map(k => this.env.smart_sources.get(k)?.file_name || k)
.join(", ");
return `Cluster (${membersList}${this.data.members?.length>3 ? "..." : ""})`;
}
set name(val) {
this.data.name = val;
}

async delete() {
// 1) Reassign members
const allClusters = Object.values(this.collection.items)
Expand Down

0 comments on commit f242a68

Please sign in to comment.