You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
In the early versions of k-means in nvGRAPH, we had an outer wrapper that would enable the user to ask for the 'correct' k to be found automatically. The user input a range for k-min and k-max, then the routine would run k-means and using a special metric (residual over dispersion) do a bisection search to find k-opt. This has been lost in the latest version, it should be restored.
Describe the solution you'd like
Add a routine associated with k-means 'kmeans_find_clusters( data, k_min,k_max)' that will perform a search for the minimizing k. This already existed in CUDA code in early versions of the kmeans-standalone code, it's independent of the actual kmeans solver itself, it is an outer loop.
Describe alternatives you've considered
Running a full grid search works also, but that takes k_max - k_min runs, whereas a bisection search should take only about log2(k_max - k_min) runs.
Additional context
This could be in C++/CUDA, or purely python or cuPY for that matter. This should be viewed as a reference implementation. The dispersion kernel is also attached below. This version owns a single memory block that is reused by all the kmeans runs in the opt loop, but that isn't necessary now that we have RMM.
/// Search [kmin, kmax] for the number of clusters k that maximizes a
/// Calinski-Harabasz-style objective (cluster dispersion over residual),
/// using bisection so only O(log(kmax - kmin)) k-means runs are needed.
///
/// @param n        number of observations (rows of obs), must be >= 1
/// @param d        dimensionality of each observation, must be >= 1
/// @param kmin     lower bound of the search range (>= 1; 1 is bumped to 2)
/// @param kmax     upper bound of the search range (<= n)
/// @param tol      k-means convergence tolerance (>= 0)
/// @param maxiter  in: max k-means iterations per run; out: iterations used
///                 by the final run
/// @param obs      device array of observations, column count d (n x d)
/// @param centroids out: device array of centroids for k = *k_star
/// @param codes    out: device array of cluster assignments for k = *k_star
/// @param k_star   out: the selected number of clusters
/// @param residual out: k-means residual for the final run
/// @return NVGRAPH_OK on success, NVGRAPH_ERR_BAD_PARAMETERS on bad input
template <typename IndexType_, typename ValueType_>
NVGRAPH_ERROR kmeans_find_clusters(IndexType_ n, IndexType_ d,
                                   IndexType_ kmin, IndexType_ kmax,
                                   ValueType_ tol, IndexType_ * maxiter,
                                   const ValueType_ * __restrict__ obs,
                                   ValueType_ * centroids,
                                   IndexType_ * codes,
                                   int * k_star, ValueType_ * residual) {
  // ---- Check that parameters are valid ----
  if (n < 1) {
    WARNING("invalid parameter (n<1)");
    return NVGRAPH_ERR_BAD_PARAMETERS;
  }
  if (d < 1) {
    WARNING("invalid parameter (d<1)");
    return NVGRAPH_ERR_BAD_PARAMETERS;
  }
  if (kmin < 1) {
    WARNING("invalid parameter (kmin<1)");
    return NVGRAPH_ERR_BAD_PARAMETERS;
  }
  if (kmax > n) {
    WARNING("invalid parameter (kmax>n)");
    return NVGRAPH_ERR_BAD_PARAMETERS;
  }
  if (kmax < kmin) {  // added guard: an inverted range would corrupt the search
    WARNING("invalid parameter (kmax<kmin)");
    return NVGRAPH_ERR_BAD_PARAMETERS;
  }
  if (tol < 0) {
    WARNING("invalid parameter (tol<0)");
    return NVGRAPH_ERR_BAD_PARAMETERS;
  }
  // BUG FIX: original tested the pointer (maxiter < 0), not the value.
  if (*maxiter < 0) {
    WARNING("invalid parameter (maxiter<0)");
    return NVGRAPH_ERR_BAD_PARAMETERS;
  }

  // ---- Allocate device workspace, sized for the largest k and reused by
  // every k-means run inside the search loop ----
  thrust::device_vector<IndexType_> clusterSizes(kmax);
  thrust::device_vector<ValueType_> data_dots(n);
  thrust::device_vector<ValueType_> centroid_dots(kmax);
  thrust::device_vector<ValueType_> work(n * max(kmax, d));
  thrust::device_vector<IndexType_> work_int(2 * d * n);
  IndexType_ *clusterSizes_ptr  = clusterSizes.data().get();
  IndexType_ *work_int_ptr      = work_int.data().get();
  ValueType_ *data_dots_ptr     = data_dots.data().get();
  ValueType_ *centroid_dots_ptr = centroid_dots.data().get();
  ValueType_ *work_ptr          = work.data().get();

  // Host-side tables indexed directly by k (hence size kmax+1).
  thrust::host_vector<ValueType_> results(kmax + 1);
  thrust::host_vector<ValueType_> clusterDispersion(kmax + 1);

  // ---- Bisection search for the best k ----
  int left = kmin;                        // lower bracket (at least 2, see below)
  int right = kmax;                       // upper bracket
  int mid = (right + left) / 2;           // integer midpoint (== floor)
  int oldmid = mid;                       // last k actually evaluated at mid
  int tests = 0;
  int iters = 0;
  ValueType_ objective[3];                // 0 = left of mid, 1 = right of mid, 2 = at mid
  if (left == 1) left = 2;                // at least 2 clusters are required

  // Evaluate the left edge of the bracket.
  kmeans(n, d, left, tol, *maxiter, obs, codes, clusterSizes_ptr, centroids,
         work_ptr, work_int_ptr, data_dots_ptr, centroid_dots_ptr, residual, &iters);
  results[left] = *residual;
  clusterDispersion[left] = calculate_cluster_dispersion(n, d, left, clusterSizes_ptr, centroids, work_ptr);

  // Evaluate the right edge. Residual should decrease with larger k; retry up
  // to 3 times in case k-means landed in a worse local minimum than the left edge.
  iters = 0;
  results[right] = 1e20;
  while (results[right] > results[left] && tests < 3) {
    kmeans(n, d, right, tol, *maxiter, obs, codes, clusterSizes_ptr, centroids,
           work_ptr, work_int_ptr, data_dots_ptr, centroid_dots_ptr, residual, &iters);
    results[right] = *residual;
    clusterDispersion[right] = calculate_cluster_dispersion(n, d, right, clusterSizes_ptr, centroids, work_ptr);
    tests += 1;
  }
  // Calinski-Harabasz-style score: (n-k)/(k-1) * dispersion / residual.
  // NOTE(review): (n-left)/(left-1) is integer division before the ValueType_
  // multiply, as in the original reference code — confirm this truncation is intended.
  objective[0] = (n - left)  / (left - 1)  * clusterDispersion[left]  / results[left];
  objective[1] = (n - right) / (right - 1) * clusterDispersion[right] / results[right];

  // Binary search: shrink [left, right] until adjacent.
  while (left < right - 1) {
    results[mid] = 1e20;
    tests = 0;
    iters = 0;
    // Evaluate mid, nudging it by one if k-means got stuck above the left
    // edge's residual (retry at most 3 times).
    while (results[mid] > results[left] && tests < 3) {
      kmeans(n, d, mid, tol, *maxiter, obs, codes, clusterSizes_ptr, centroids,
             work_ptr, work_int_ptr, data_dots_ptr, centroid_dots_ptr, residual, &iters);
      results[mid] = *residual;
      clusterDispersion[mid] = calculate_cluster_dispersion(n, d, mid, clusterSizes_ptr, centroids, work_ptr);
      if (results[mid] > results[left] && (mid + 1) < right) {
        mid += 1;
        results[mid] = 1e20;
      }
      else if (results[mid] > results[left] && (mid - 1) > left) {  // BUG FIX: was 'elseif'
        mid -= 1;
        results[mid] = 1e20;
      }
      tests += 1;
    }

    // Maximize the Calinski-Harabasz-style index (minimize residual per cluster).
    objective[0] = (n - left)  / (left - 1)  * clusterDispersion[left]  / results[left];
    objective[1] = (n - right) / (right - 1) * clusterDispersion[right] / results[right];
    objective[2] = (n - mid)   / (mid - 1)   * clusterDispersion[mid]   / results[mid];
    // Finite-difference slope of the objective on each side of mid
    // (intentionally overwrites the raw scores above).
    objective[0] = (objective[2] - objective[0]) / (mid - left);
    objective[1] = (objective[1] - objective[2]) / (right - mid);
    if (objective[0] > 0 && objective[1] < 0) {
      // Rising then falling: the maximum is on the left-of-mid side.
      right = mid;
    }
    else {
      left = mid;
    }
    oldmid = mid;
    mid = (right + left) / 2;
  }

  // Pick the better of the two remaining candidates.
  *k_star = right;
  objective[0] = (n - left)   / (left - 1)   * clusterDispersion[left]   / results[left];
  objective[1] = (n - oldmid) / (oldmid - 1) * clusterDispersion[oldmid] / results[oldmid];
  if (objective[1] < objective[0])
    *k_star = left;

  // If k_star isn't the k we just ran, re-run so the caller receives
  // centroids/codes/residual for k_star (avoids keeping per-k copies).
  if (*k_star != oldmid)
    kmeans(n, d, *k_star, tol, *maxiter, obs, codes, clusterSizes_ptr, centroids,
           work_ptr, work_int_ptr, data_dots_ptr, centroid_dots_ptr, residual, &iters);

  *maxiter = iters;
  return NVGRAPH_OK;
}
/// Launch clusterDispersionKernel and return the resulting dispersion scalar.
///
/// @param n                 number of observations
/// @param d                 dimensionality
/// @param k                 number of clusters
/// @param clusterSizes_ptr  device array of per-cluster sizes (length k)
/// @param centroids         device array of centroids (k x d)
/// @param work              device scratch (at least d elements; result is
///                          written to work[0] by the kernel)
/// @return sqrt of the size-weighted sum of squared centroid distances to the
///         global centroid, copied back to the host
template<typename T, typename I>
T calculate_cluster_dispersion(int n, int d, int k,
                               const I* __restrict__ clusterSizes_ptr,
                               const T* __restrict__ centroids,
                               T* __restrict__ work) {
  // CUDA grid dimensions: the kernel is serial, so one block of one thread.
  dim3 blockDim, gridDim;
  T *cent_ptr = work;
  blockDim.x = 1;
  blockDim.y = 1;
  blockDim.z = 1;
  gridDim.x = 1;
  gridDim.y = 1;
  gridDim.z = 1;
  T disp = T(0);
  // BUG FIX: the execution configuration is <<<grid, block>>>; the original
  // passed <<<blockDim, gridDim>>> (harmless only because both are 1x1x1).
  clusterDispersionKernel<<<gridDim, blockDim>>>(n, d, k, cent_ptr, clusterSizes_ptr, centroids);
  CHECK_CUDA(cudaMemcpy(&disp, cent_ptr, sizeof(T), cudaMemcpyDeviceToHost));
  return disp;
}
// Compute cluster dispersion: the cluster-size weighted sum of squared
// distances from every centroid to the global (weighted) centroid.
// The result, sqrt(disp), is written to cent[0]; cent must hold >= d values.
// Intended to be launched with a single block (the work is serial in thread 0).
template <typename IndexType_, typename ValueType_>
static __global__
void clusterDispersionKernel(IndexType_ n, IndexType_ d, IndexType_ k,
                             ValueType_ * __restrict__ cent,
                             const IndexType_ * __restrict__ clusterSizes_ptr,
                             const ValueType_ * __restrict__ centroids) {
  // Phase 1: thread 0 accumulates the size-weighted global centroid into cent.
  if (threadIdx.x == 0) {
    for (int i = 0; i < d; ++i) cent[i] = 0.0;
    for (int i = 0; i < k; ++i)
      for (int j = 0; j < d; ++j)
        cent[j] += clusterSizes_ptr[i] * centroids[j + i * d] / n;
  }
  ValueType_ disp = ValueType_(0);
  __syncthreads();  // make cent visible before it is read below
  // Phase 2: thread 0 sums the weighted squared distances to that centroid.
  if (threadIdx.x == 0) {
    for (int i = 0; i < k; ++i)
      for (int j = 0; j < d; ++j)
        disp += clusterSizes_ptr[i] * (cent[j] - centroids[j + i * d]) * (cent[j] - centroids[j + i * d]);
  }
  // BUG FIX: the original let every thread write cent[0], but disp is only
  // computed in thread 0 — any other thread would race in a zero. Guard the write.
  if (threadIdx.x == 0)
    cent[0] = sqrt(disp);
}
The text was updated successfully, but these errors were encountered:
In the early versions of k-means in nvGRAPH, we had an outer wrapper that would enable the user to ask for the 'correct' k to be found automatically. The user input a range for k-min and k-max, then the routine would run k-means and using a special metric (residual over dispersion) do a bisection search to find k-opt. This has been lost in the latest version, it should be restored.
Describe the solution you'd like
Add a routine associated with k-means 'kmeans_find_clusters( data, k_min,k_max)' that will perform a search for the minimizing k. This already existed in CUDA code in early versions of the kmeans-standalone code, it's independent of the actual kmeans solver itself, it is an outer loop.
Describe alternatives you've considered
Running a full grid search works also, but that takes k_max -k_min runs, when this should take log(kmax - kmin).
Additional context
This could be in C++/CUDA, or purely python or cuPY for that matter. This should be viewed as a reference implementation. The dispersion kernel is also attached below. This version owns a single memory block that is reused by all the kmeans runs in the opt loop, but that isn't necessary now that we have RMM.
The text was updated successfully, but these errors were encountered: