diff --git a/faiss/Clustering.cpp b/faiss/Clustering.cpp index 7cf2ba0948..3594a01c2b 100644 --- a/faiss/Clustering.cpp +++ b/faiss/Clustering.cpp @@ -27,20 +27,6 @@ namespace faiss { -ClusteringParameters::ClusteringParameters() - : niter(25), - nredo(1), - verbose(false), - spherical(false), - int_centroids(false), - update_index(false), - frozen_centroids(false), - min_points_per_centroid(39), - max_points_per_centroid(256), - seed(1234), - decode_block_size(32768) {} -// 39 corresponds to 10000 / 256 -> to avoid warnings on PQ tests with randu10k - Clustering::Clustering(int d, int k) : d(d), k(k) {} Clustering::Clustering(int d, int k, const ClusteringParameters& cp) diff --git a/faiss/Clustering.h b/faiss/Clustering.h index f8008d4293..ef1f00adcd 100644 --- a/faiss/Clustering.h +++ b/faiss/Clustering.h @@ -5,7 +5,7 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- +/** Implementation of k-means clustering with many variants. */ #ifndef FAISS_CLUSTERING_H #define FAISS_CLUSTERING_H @@ -19,25 +19,35 @@ namespace faiss { * constructor of the Clustering object. */ struct ClusteringParameters { - int niter; ///< clustering iterations - int nredo; ///< redo clustering this many times and keep best - - bool verbose; - bool spherical; ///< do we want normalized centroids? - bool int_centroids; ///< round centroids coordinates to integer - bool update_index; ///< re-train index after each iteration? 
- bool frozen_centroids; ///< use the centroids provided as input and do not - ///< change them during iterations - - int min_points_per_centroid; ///< otherwise you get a warning - int max_points_per_centroid; ///< to limit size of dataset - - int seed; ///< seed for the random number generator - - size_t decode_block_size; ///< how many vectors at a time to decode - - /// sets reasonable defaults - ClusteringParameters(); + /// number of clustering iterations + int niter = 25; + /// redo clustering this many times and keep the clusters with the best + /// objective + int nredo = 1; + + bool verbose = false; + /// whether to normalize centroids after each iteration (useful for inner + /// product clustering) + bool spherical = false; + /// round centroids coordinates to integer after each iteration? + bool int_centroids = false; + /// re-train index after each iteration? + bool update_index = false; + + /// Use the subset of centroids provided as input and do not change them + /// during iterations + bool frozen_centroids = false; + /// If fewer than this number of training vectors per centroid are provided, + /// writes a warning (fewer than 1 point per centroid raises an exception). + /// 39 = 10000 / 256, avoids warnings on PQ tests with randu10k + int min_points_per_centroid = 39; + /// cap on training vectors per centroid; larger sets are subsampled + int max_points_per_centroid = 256; + /// seed for the random number generator + int seed = 1234; + + /// when the training set is encoded, batch size of the codec decoder + size_t decode_block_size = 32768; }; struct ClusteringIterationStats { @@ -94,7 +104,7 @@ struct Clustering : ClusteringParameters { * to decode the input vectors. 
* * @param codec codec used to decode the vectors (nullptr = - * vectors are in fact floats) * + * vectors are in fact floats) */ void train_encoded( idx_t nx, diff --git a/faiss/MatrixStats.cpp b/faiss/MatrixStats.cpp index a864127bb0..440d9f9dae 100644 --- a/faiss/MatrixStats.cpp +++ b/faiss/MatrixStats.cpp @@ -12,6 +12,7 @@ #include /* va_list, va_start, va_arg, va_end */ #include +#include #include #include @@ -21,18 +22,6 @@ namespace faiss { * MatrixStats *********************************************************************/ -MatrixStats::PerDimStats::PerDimStats() - : n(0), - n_nan(0), - n_inf(0), - n0(0), - min(HUGE_VALF), - max(-HUGE_VALF), - sum(0), - sum2(0), - mean(NAN), - stddev(NAN) {} - void MatrixStats::PerDimStats::add(float x) { n++; if (std::isnan(x)) { @@ -74,19 +63,12 @@ void MatrixStats::do_comment(const char* fmt, ...) { buf += size; } -MatrixStats::MatrixStats(size_t n, size_t d, const float* x) - : n(n), - d(d), - n_collision(0), - n_valid(0), - n0(0), - min_norm2(HUGE_VAL), - max_norm2(0) { +MatrixStats::MatrixStats(size_t n, size_t d, const float* x) : n(n), d(d) { std::vector comment_buf(10000); buf = comment_buf.data(); nbuf = comment_buf.size(); - do_comment("analyzing %ld vectors of size %ld\n", n, d); + do_comment("analyzing %zu vectors of size %zu\n", n, d); if (d > 1024) { do_comment( @@ -94,6 +76,9 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) "please consider dimensionality reducution (with PCAMatrix)\n"); } + hash_value = hash_bytes((const uint8_t*)x, n * d * sizeof(*x)); + do_comment("hash value 0x%016" PRIx64 "\n", hash_value); + size_t nbytes = sizeof(x[0]) * d; per_dim_stats.resize(d); @@ -156,7 +141,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) if (n_collision > 0) { do_comment( - "%ld collisions in hash table, " + "%zu collisions in hash table, " "counts may be invalid\n", n_collision); } @@ -167,14 +152,14 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) max = 
it->second; } } - do_comment("vector %ld has %ld copies\n", max.first, max.count); + do_comment("vector %zu has %zu copies\n", max.first, max.count); } { // norm stats min_norm2 = sqrt(min_norm2); max_norm2 = sqrt(max_norm2); do_comment( - "range of L2 norms=[%g, %g] (%ld null vectors)\n", + "range of L2 norms=[%g, %g] (%zu null vectors)\n", min_norm2, max_norm2, n0); @@ -182,7 +167,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) if (max_norm2 < min_norm2 * 1.0001) { do_comment( "vectors are normalized, inner product and " - "L2 search are equivalent\n"); + "L2 search are equivalent\n"); } if (max_norm2 > min_norm2 * 100) { @@ -227,7 +212,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) do_comment("no constant dimensions\n"); } else { do_comment( - "%ld dimensions are constant: they can be removed\n", + "%zu dimensions are constant: they can be removed\n", n_0_range); } @@ -235,7 +220,7 @@ MatrixStats::MatrixStats(size_t n, size_t d, const float* x) do_comment("no dimension has a too large mean\n"); } else { do_comment( - "%ld dimensions are too large " + "%zu dimensions are too large " "wrt. 
their variance, may loose precision " "in IndexFlatL2 (use CenteringTransform)\n", n_dangerous_range); diff --git a/faiss/MatrixStats.h b/faiss/MatrixStats.h index 8d18d10089..45a7c97da4 100644 --- a/faiss/MatrixStats.h +++ b/faiss/MatrixStats.h @@ -10,6 +10,7 @@ #pragma once #include +#include #include #include #include @@ -26,20 +27,31 @@ struct MatrixStats { std::string comments; // raw statistics - size_t n, d; - size_t n_collision, n_valid, n0; - double min_norm2, max_norm2; + size_t n = 0, d = 0; + size_t n_collision = 0; + size_t n_valid = 0; + size_t n0 = 0; + double min_norm2 = HUGE_VAL; + double max_norm2 = 0; + uint64_t hash_value = 0; struct PerDimStats { - size_t n, n_nan, n_inf, n0; + /// counts of various special entries + size_t n = 0; + size_t n_nan = 0; + size_t n_inf = 0; + size_t n0 = 0; - float min, max; - double sum, sum2; + /// to get min/max and stddev values + float min = HUGE_VALF; + float max = -HUGE_VALF; + double sum = 0; + double sum2 = 0; - size_t n_valid; - double mean, stddev; + size_t n_valid = 0; + double mean = NAN; + double stddev = NAN; - PerDimStats(); void add(float x); void compute_mean_std(); }; diff --git a/tests/test_build_blocks.py b/tests/test_build_blocks.py index 77f022adf8..0a97e63185 100644 --- a/tests/test_build_blocks.py +++ b/tests/test_build_blocks.py @@ -256,6 +256,14 @@ def test_normalized(self): print(comments) assert 'vectors are normalized' in comments + def test_hash(self): + cc = [] + for _ in range(2): + rs = np.random.RandomState(123) + m = rs.rand(40, 20).astype('float32') + cc.append(faiss.MatrixStats(m).hash_value) + self.assertEqual(cc[0], cc[1]) + class TestScalarQuantizer(unittest.TestCase):