Skip to content

Commit

Permalink
- Updated SmartClusters and SmartCluster classes to improve member management and clustering logic.
Browse files Browse the repository at this point in the history

- Introduced `max_cluster_size_percent` setting to limit cluster sizes based on a percentage of total sources.
- Added new vector getters in `SmartCluster` for better access to cluster vectors.
- Refactored `SourceClustersAdapter` to implement stable cluster IDs and improved cluster assignment logic.
- Removed obsolete `ajson_multi_file.js` and `cluster.js` components to streamline the codebase.
- Updated tests to reflect changes in adapter imports and cluster settings.
  • Loading branch information
Brian Joseph Petro committed Dec 22, 2024
1 parent 6d33c6c commit 3dee534
Show file tree
Hide file tree
Showing 9 changed files with 188 additions and 211 deletions.
Empty file.
320 changes: 145 additions & 175 deletions smart-clusters/adapters/source.js

Large diffs are not rendered by default.

21 changes: 0 additions & 21 deletions smart-clusters/components/cluster.js

This file was deleted.

4 changes: 2 additions & 2 deletions smart-clusters/index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { SmartClusters } from "./smart_clusters.js";
import { SmartCluster } from "./smart_cluster.js";
import source_cluster_adapter from "./adapters/source.js";
import source_cluster_group_adapter from "./adapters/source.js";

export { SmartClusters, SmartCluster, source_cluster_adapter };
export { SmartClusters, SmartCluster, source_cluster_group_adapter };
2 changes: 2 additions & 0 deletions smart-clusters/smart_cluster.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ export class SmartCluster extends SmartGroup {
/**
 * Vector of the cluster's center source.
 * @returns {*} `this.center_source.vec`, or `null` when there is no center
 * source or its `vec` is falsy.
 */
get center_vec() {
return this.center_source?.vec || null;
}
/** Alias: returns the same value as `center_vec`. */
get vec() { return this.center_vec; }
/** Alias: returns the same value as `center_vec`. */
get group_vec() { return this.center_vec; }

/**
* cluster.center_source is a getter returning the source instance
Expand Down
6 changes: 6 additions & 0 deletions smart-clusters/smart_clusters.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,12 @@ export class SmartClusters extends SmartGroups {
default: "mean",
description: "Choose mean or median approach for computing cluster center."
},
'max_cluster_size_percent': {
name: "Max Cluster Size (%)",
type: "number",
default: 0.3, // 30%
description: "Each cluster can only hold up to this fraction of total sources."
},
};
}
}
6 changes: 4 additions & 2 deletions smart-clusters/test/source_clusters.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import source_ajson_data_adapter from 'smart-sources/adapters/data/ajson_multi_f
import block_ajson_data_adapter from 'smart-blocks/adapters/data/ajson_multi_file.js';
import group_ajson_data_adapter from 'smart-groups/adapters/data/ajson_multi_file.js';

import { SmartClusters, SmartCluster, source_cluster_adapter } from '../index.js';
import { SmartClusters, SmartCluster, source_cluster_group_adapter } from '../index.js';
import { SmartEmbedModel } from 'smart-embed-model';
import { SmartEmbedTransformersAdapter } from 'smart-embed-model/adapters/transformers.js';

Expand Down Expand Up @@ -84,7 +84,7 @@ async function create_integration_env() {
smart_clusters: {
class: SmartClusters,
data_adapter: group_ajson_data_adapter,
group_adapter: source_cluster_adapter
group_adapter: source_cluster_group_adapter
},
},
item_types: {
Expand Down Expand Up @@ -154,6 +154,7 @@ test.serial("Integration: Sources loaded with embeddings, able to cluster", asyn
// Now cluster them
clusters.settings.clusters_ct = 3; // e.g., 3 clusters
clusters.settings.max_iterations = 5;
clusters.settings.max_cluster_size_percent = 0.1;
await clusters.build_groups();

const clusterItems = Object.values(clusters.items);
Expand All @@ -165,6 +166,7 @@ test.serial("Integration: Sources loaded with embeddings, able to cluster", asyn
const { members, number_of_members } = cluster.data;
totalAssigned += members?.length ?? 0;
t.is(members?.length, number_of_members, 'members array length matches number_of_members field');
console.log(`[SourceClustersAdapter] Cluster ${cluster.key} has ${members?.length} members.`);
}
t.is(totalAssigned, sourcesWithVec.length, 'All vectorized sources assigned to some cluster');

Expand Down
31 changes: 25 additions & 6 deletions smart-groups/adapters/vector/median_members.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import { DefaultEntitiesVectorAdapter, DefaultEntityVectorAdapter } from "smart-entities/adapters/default.js";
import { sort_by_score_ascending, sort_by_score_descending } from "smart-entities/utils/sort_by_score.js";
import { cos_sim } from "smart-entities/cos_sim.js";
import { results_acc, furthest_acc } from "smart-entities/top_acc.js";


/**
 * Vector adapter subclass with an empty body: everything it offers is
 * inherited from DefaultEntitiesVectorAdapter.
 */
export class MedianMemberVectorsAdapter extends DefaultEntitiesVectorAdapter {
}
Expand All @@ -12,14 +15,30 @@ export class MedianMemberVectorAdapter extends DefaultEntityVectorAdapter {
return this.item.member_collection;
}
/**
 * Scores this adapter's members against a reference vector and returns them
 * sorted with `sort_by_score_descending`.
 *
 * NOTE(review): this hunk is rendered without +/- markers. The first three
 * statements (through the first `return`) appear to be the removed
 * implementation and the rest its replacement; this matches the
 * "25 additions & 6 deletions" reported for the file, but confirm against the
 * raw diff. Read literally, everything after the first `return` is unreachable.
 *
 * @param {Object} [filter={}] - Options object. The first body writes
 * `key_starts_with` onto it; the second reads `limit` (default 50).
 * @returns {Promise<Array>} Sorted results; in the second body each entry is
 * `{ item, score }`.
 */
async nearest_members(filter = {}) {
// First body: delegates the search to the member collection, restricted by path prefix.
filter.key_starts_with = this.item.data.path;
const results = await this.member_collection.nearest(this.median_vec, filter);
return results.sort(sort_by_score_descending);
// Second body: scores members in-process against the item's `group_vec`.
const {
limit = 50, // TODO: default configured in settings
} = filter;
const nearest = this.members
.reduce((acc, member) => {
if (!member.vec) return acc; // skip if no vec
const result = { item: member, score: cos_sim(this.item.group_vec, member.vec) };
// presumably keeps only the best `limit` results in acc — verify in top_acc.js
results_acc(acc, result, limit); // update acc
return acc;
}, { min: 0, results: new Set() });
return Array.from(nearest.results).sort(sort_by_score_descending);
}
/**
 * Scores this adapter's members against a reference vector and returns them
 * sorted with `sort_by_score_ascending`.
 *
 * NOTE(review): this hunk is rendered without +/- markers. The first three
 * statements (through the first `return`) appear to be the removed
 * implementation and the rest its replacement; this matches the
 * "25 additions & 6 deletions" reported for the file, but confirm against the
 * raw diff. Read literally, everything after the first `return` is unreachable.
 *
 * @param {Object} [filter={}] - Options object. The first body writes
 * `key_starts_with` onto it; the second reads `limit` (default 50).
 * @returns {Promise<Array>} Sorted results; in the second body each entry is
 * `{ item, score }`.
 */
async furthest_members(filter = {}) {
// First body: delegates the search to the member collection, restricted by path prefix.
filter.key_starts_with = this.item.data.path;
const results = await this.member_collection.furthest(this.median_vec, filter);
return results.sort(sort_by_score_ascending);
// Second body: scores members in-process against the item's `group_vec`.
const {
limit = 50, // TODO: default configured in settings
} = filter;
const furthest = this.members
.reduce((acc, member) => {
if (!member.vec) return acc; // skip if no vec
const result = { item: member, score: cos_sim(this.item.group_vec, member.vec) };
// presumably keeps only the lowest-scoring `limit` results in acc — verify in top_acc.js
furthest_acc(acc, result, limit); // update acc
return acc;
}, { min: 0, results: new Set() });
return Array.from(furthest.results).sort(sort_by_score_ascending);
}
get median_vec() {
if (!this._median_vec) {
Expand Down
9 changes: 4 additions & 5 deletions smart-groups/smart_group.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@ export class SmartGroup extends SmartEntity {
* @returns {SmartSource[]} Array of SmartSource instances
*/
get members() {
// NOTE(review): two `return` statements are rendered because the diff markers
// were lost. The path-prefix filter appears to be the removed version and the
// `get_many` lookup its replacement (the file reports 4 additions & 5
// deletions) — confirm against the raw diff.
// Path-prefix version: every item in the member collection whose path starts with this group's path.
return this.member_collection.filter(source =>
source.path.startsWith(this.data.path)
);
// Explicit-list version: looks up the entries listed in `this.data.members`.
return this.member_collection.get_many(this.data.members);
}
get member_collection() {
return this.env[this.member_collection_key];
Expand All @@ -45,14 +43,14 @@ export class SmartGroup extends SmartEntity {
}

/**
 * Returns the result of `vector_adapter.nearest_members()`, or an empty array
 * (after logging) when the group has no vector.
 *
 * NOTE(review): the two consecutive `if` lines are the old (`median_vec`) and
 * new (`group_vec`) forms of a one-line change, rendered without +/- markers;
 * only one of them belongs in the real file — confirm against the raw diff.
 *
 * @returns {Promise<Array>} `[]` when the guard fails, otherwise whatever the
 * vector adapter returns.
 */
async get_nearest_members() {
if(!this.median_vec) {
if(!this.group_vec) {
console.log(`no median vec for directory: ${this.data.path}`);
return [];
}
return this.vector_adapter.nearest_members();
}
async get_furthest_members() {
if(!this.median_vec) {
if(!this.group_vec) {
console.log(`no median vec for directory: ${this.data.path}`);
return [];
}
Expand All @@ -66,6 +64,7 @@ export class SmartGroup extends SmartEntity {
return this.entity_adapter.median_vec;
}
/** Alias: returns the same value as `median_vec`. */
get vec() { return this.median_vec; }
/** Alias: returns the same value as `median_vec`. */
get group_vec() { return this.median_vec; }

/**
* Gets the median vector of all contained blocks
Expand Down

0 comments on commit 3dee534

Please sign in to comment.