Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Large two-level clustering #2882

Closed
wants to merge 1 commit
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 32 additions & 19 deletions faiss/python/extra_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,17 +107,15 @@ def randn(n, seed=12345):
def checksum(a):
    """ Compute a checksum for quick-and-dirty comparisons of arrays.

    Parameters
    ----------
    a : np.ndarray
        1-D or 2-D array; the raw bytes are checksummed, so arrays of any
        dtype are accepted.

    Returns
    -------
    uint64 checksum for a 1-D input, or an array of n uint64 row
    checksums for an (n, d) input.
    """
    # reinterpret the buffer as bytes so the checksum is dtype-agnostic
    a = a.view('uint8')
    if a.ndim == 1:
        # bug fix: was `s.size` (undefined name) -- checksum the whole array
        return bvec_checksum(a.size, swig_ptr(a))
    n, d = a.shape
    cs = np.zeros(n, dtype='uint64')
    bvecs_checksum(n, d, swig_ptr(a), swig_ptr(cs))
    return cs


rand_smooth_vectors_c = rand_smooth_vectors


def rand_smooth_vectors(n, d, seed=1234):
res = np.empty((n, d), dtype='float32')
rand_smooth_vectors_c(n, d, swig_ptr(res), seed)
Expand Down Expand Up @@ -422,7 +420,7 @@ def __init__(self, d, k, **kwargs):
including niter=25, verbose=False, spherical = False
"""
self.d = d
self.k = k
self.reset(k)
self.gpu = False
if "progressive_dim_steps" in kwargs:
self.cp = ProgressiveDimClusteringParameters()
Expand All @@ -437,7 +435,32 @@ def __init__(self, d, k, **kwargs):
# if this raises an exception, it means that it is a non-existent field
getattr(self.cp, k)
setattr(self.cp, k, v)
self.set_index()

def set_index(self):
    """ (Re)create the assignment structure used during training:
    a flat index for regular k-means, or an index factory for
    progressive-dimension clustering. """
    d = self.d
    if self.cp.__class__ == ClusteringParameters:
        # regular k-means: inner-product index for spherical clustering,
        # L2 otherwise
        index = IndexFlatIP(d) if self.cp.spherical else IndexFlatL2(d)
        if self.gpu:
            index = faiss.index_cpu_to_all_gpus(index, ngpu=self.gpu)
        self.index = index
    else:
        # progressive-dim clustering builds its indexes via a factory
        if self.gpu:
            self.fac = GpuProgressiveDimIndexFactory(ngpu=self.gpu)
        else:
            self.fac = ProgressiveDimIndexFactory()

def reset(self, k=None):
    """ Prepare the k-means object for a fresh clustering run, optionally
    changing the number of centroids.

    Parameters
    ----------
    k : int, optional
        new number of centroids; when omitted, the current k is kept
    """
    if k is not None:
        self.k = int(k)
    # drop every result of a previous training run
    for field in ("centroids", "obj", "iteration_stats"):
        setattr(self, field, None)

def train(self, x, weights=None, init_centroids=None):
""" Perform k-means clustering.
Expand Down Expand Up @@ -476,24 +499,14 @@ def train(self, x, weights=None, init_centroids=None):
nc, d2 = init_centroids.shape
assert d2 == d
faiss.copy_array_to_vector(init_centroids.ravel(), clus.centroids)
if self.cp.spherical:
self.index = IndexFlatIP(d)
else:
self.index = IndexFlatL2(d)
if self.gpu:
self.index = faiss.index_cpu_to_all_gpus(self.index, ngpu=self.gpu)
clus.train(x, self.index, weights)
else:
# not supported for progressive dim
assert weights is None
assert init_centroids is None
assert not self.cp.spherical
clus = ProgressiveDimClustering(d, self.k, self.cp)
if self.gpu:
fac = GpuProgressiveDimIndexFactory(ngpu=self.gpu)
else:
fac = ProgressiveDimIndexFactory()
clus.train(n, swig_ptr(x), fac)
clus.train(n, swig_ptr(x), self.fac)

centroids = faiss.vector_float_to_array(clus.centroids)

Expand Down
19 changes: 17 additions & 2 deletions faiss/utils/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -428,15 +428,30 @@ void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist) {
}
}

/// Compute an order-dependent 64-bit checksum of n 32-bit words.
/// (Resolves the diff residue: the widened uint64_t signature replaces
/// the old size_t one; keeping both lines would not compile.)
uint64_t ivec_checksum(size_t n, const int32_t* asigned) {
    // view the data as unsigned so the multiply-accumulate wraps with
    // well-defined overflow semantics
    const uint32_t* a = reinterpret_cast<const uint32_t*>(asigned);
    uint64_t cs = 112909;
    while (n--) {
        cs = cs * 65713 + a[n] * 1686049;
    }
    return cs;
}

/// Compute a 64-bit checksum of n bytes: the 4-byte-aligned prefix is
/// hashed as 32-bit words, then the up-to-3 trailing bytes are folded in.
uint64_t bvec_checksum(size_t n, const uint8_t* a) {
    uint64_t cs = ivec_checksum(n / 4, (const int32_t*)a);
    for (size_t i = n / 4 * 4; i < n; i++) {
        // bug fix: was `a[n]`, an out-of-bounds read that also ignored
        // the actual trailing bytes -- index with `i`
        cs = cs * 65713 + a[i] * 1686049;
    }
    return cs;
}

/* Row-wise checksums of an n x d byte matrix: cs[i] receives the checksum
 * of row i (a + i * d). Rows are independent, so the loop is parallelized
 * when there are enough of them to amortize the thread startup cost. */
void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs) {
#pragma omp parallel for if (n > 1000)
    for (size_t i = 0; i < n; i++) {
        cs[i] = bvec_checksum(d, a + i * d);
    }
}

const float* fvecs_maybe_subsample(
size_t d,
size_t* n,
Expand Down
14 changes: 13 additions & 1 deletion faiss/utils/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,19 @@ int ivec_hist(size_t n, const int* v, int vmax, int* hist);
void bincode_hist(size_t n, size_t nbits, const uint8_t* codes, int* hist);

/// compute a checksum on a table.
size_t ivec_checksum(size_t n, const int32_t* a);
uint64_t ivec_checksum(size_t n, const int32_t* a);

/** compute a checksum on a byte table
 *
 * @param n number of bytes in the table
 * @param a table to checksum, size n
 */
uint64_t bvec_checksum(size_t n, const uint8_t* a);

/** compute checksums for the rows of a matrix
 *
 * @param n number of rows
 * @param d size per row
 * @param a matrix to handle, size n * d
 * @param cs output checksums, size n
 */
void bvecs_checksum(size_t n, size_t d, const uint8_t* a, uint64_t* cs);

/** random subsamples a set of vectors if there are too many of them
*
Expand Down