diff --git a/docs/source/cpp_api/neighbors_cagra.rst b/docs/source/cpp_api/neighbors_cagra.rst index 6613b0b06d..99ecd3a985 100644 --- a/docs/source/cpp_api/neighbors_cagra.rst +++ b/docs/source/cpp_api/neighbors_cagra.rst @@ -19,3 +19,13 @@ namespace *raft::neighbors::cagra* :content-only: +Serializer Methods +------------------ +``#include `` + +namespace *raft::neighbors::cagra* + +.. doxygengroup:: cagra_serialize + :project: RAFT + :members: + :content-only: diff --git a/docs/source/cpp_api/neighbors_ivf_pq.rst b/docs/source/cpp_api/neighbors_ivf_pq.rst index 348928d719..17948a37fe 100644 --- a/docs/source/cpp_api/neighbors_ivf_pq.rst +++ b/docs/source/cpp_api/neighbors_ivf_pq.rst @@ -21,6 +21,17 @@ Serializer Methods namespace *raft::neighbors::ivf_pq* .. doxygengroup:: ivf_pq_serialize + :project: RAFT + :members: + :content-only: + +Candidate Refinement +-------------------- +``#include `` + +namespace *raft::neighbors* + +.. doxygengroup:: ann_refine :project: RAFT :members: :content-only: \ No newline at end of file diff --git a/docs/source/pylibraft_api/cluster.rst b/docs/source/pylibraft_api/cluster.rst index 59e53e7d4c..085297fe34 100644 --- a/docs/source/pylibraft_api/cluster.rst +++ b/docs/source/pylibraft_api/cluster.rst @@ -7,6 +7,9 @@ This page provides pylibraft class references for the publicly-exposed elements :language: python :class: highlight +KMeans +###### + .. autoclass:: pylibraft.cluster.kmeans.KMeansParams :members: @@ -14,8 +17,4 @@ This page provides pylibraft class references for the publicly-exposed elements .. autofunction:: pylibraft.cluster.kmeans.cluster_cost -.. autofunction:: pylibraft.cluster.compute_new_centroids - - - - +.. autofunction:: pylibraft.cluster.kmeans.compute_new_centroids diff --git a/docs/source/pylibraft_api/neighbors.rst b/docs/source/pylibraft_api/neighbors.rst index ca89c25ed4..680a2982cb 100644 --- a/docs/source/pylibraft_api/neighbors.rst +++ b/docs/source/pylibraft_api/neighbors.rst @@ -27,6 +27,11 @@ CAGRA .. autofunction:: pylibraft.neighbors.cagra.search +Serializer Methods +------------------ +.. autofunction:: pylibraft.neighbors.cagra.save + +.. autofunction:: pylibraft.neighbors.cagra.load IVF-Flat ######## @@ -43,6 +48,12 @@ IVF-Flat .. autofunction:: pylibraft.neighbors.ivf_flat.search +Serializer Methods +------------------ + +.. autofunction:: pylibraft.neighbors.ivf_flat.save + +.. autofunction:: pylibraft.neighbors.ivf_flat.load IVF-PQ ###### @@ -59,8 +70,14 @@ IVF-PQ .. autofunction:: pylibraft.neighbors.ivf_pq.search +Serializer Methods +------------------ + +.. autofunction:: pylibraft.neighbors.ivf_pq.save + +.. autofunction:: pylibraft.neighbors.ivf_pq.load Candidate Refinement -#################### +-------------------- .. autofunction:: pylibraft.neighbors.refine diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 3a3db0f3ea..6fd7523c64 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -182,6 +182,10 @@ options: All algorithms present in the CSV file supplied to this script with parameter `result_csv` will appear in the plot. +The figure below is the resulting plot of running our benchmarks as of August 2023 for a batch size of 10, on an NVIDIA H100 GPU and an Intel Xeon Platinum 8480CL CPU. It presents the throughput (in Queries-Per-Second) performance for every level of recall. 
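+As a rough illustration of how such a figure can be regenerated from the CSV supplied via `result_csv`, the sketch below is an editor's addition, not part of the patch; the `algo`, `recall`, and `qps` column names are assumptions about the CSV schema, not a documented format.
+
+```python
+# Illustrative only: column names below are assumptions, not the
+# benchmark scripts' documented output schema.
+import pandas as pd
+import matplotlib.pyplot as plt
+
+df = pd.read_csv("result.csv")
+for algo, group in df.groupby("algo"):
+    g = group.sort_values("recall")
+    plt.plot(g["recall"], g["qps"], marker="o", label=algo)
+plt.xlabel("Recall")
+plt.ylabel("Queries per second")
+plt.yscale("log")
+plt.legend()
+plt.savefig("throughput-vs-recall.png")
+```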
+ +![Throughput vs recall plot comparing popular ANN algorithms with RAFT's at batch size 10](../../img/raft-vector-search-batch-10.png) + ## Adding a new ANN algorithm ### Implementation and Configuration Implementation of a new algorithm should be a C++ class that inherits `class ANN` (defined in `cpp/bench/ann/src/ann.h`) and implements all the pure virtual functions. diff --git a/img/raft-vector-search-batch-10.png b/img/raft-vector-search-batch-10.png new file mode 100644 index 0000000000..5416e611f2 Binary files /dev/null and b/img/raft-vector-search-batch-10.png differ diff --git a/python/pylibraft/pylibraft/cluster/kmeans.pyx b/python/pylibraft/pylibraft/cluster/kmeans.pyx index b61fb4ab02..f4af519dc1 100644 --- a/python/pylibraft/pylibraft/cluster/kmeans.pyx +++ b/python/pylibraft/pylibraft/cluster/kmeans.pyx @@ -85,33 +85,26 @@ def compute_new_centroids(X, -------- >>> import cupy as cp - >>> from pylibraft.common import Handle >>> from pylibraft.cluster.kmeans import compute_new_centroids - >>> # A single RAFT handle can optionally be reused across >>> # pylibraft functions. >>> handle = Handle() - >>> n_samples = 5000 >>> n_features = 50 >>> n_clusters = 3 - >>> X = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> centroids = cp.random.random_sample((n_clusters, n_features), ... dtype=cp.float32) ... >>> labels = cp.random.randint(0, high=n_clusters, size=n_samples, ... dtype=cp.int32) - - >>> new_centroids = cp.empty((n_clusters, n_features), dtype=cp.float32) - + >>> new_centroids = cp.empty((n_clusters, n_features), + ... dtype=cp.float32) >>> compute_new_centroids( ... X, centroids, labels, new_centroids, handle=handle ... ) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() @@ -221,11 +214,9 @@ def init_plus_plus(X, n_clusters=None, seed=None, handle=None, centroids=None): >>> import cupy as cp >>> from pylibraft.cluster.kmeans import init_plus_plus - >>> n_samples = 5000 >>> n_features = 50 >>> n_clusters = 3 - >>> X = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) @@ -301,19 +292,14 @@ def cluster_cost(X, centroids, handle=None): -------- >>> import cupy as cp - >>> >>> from pylibraft.cluster.kmeans import cluster_cost - >>> >>> n_samples = 5000 >>> n_features = 50 >>> n_clusters = 3 - >>> >>> X = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> centroids = cp.random.random_sample((n_clusters, n_features), ... dtype=cp.float32) - >>> inertia = cluster_cost(X, centroids) """ x_cai = X.__cuda_array_interface__ @@ -524,13 +510,10 @@ def fit( -------- >>> import cupy as cp - >>> >>> from pylibraft.cluster.kmeans import fit, KMeansParams - >>> >>> n_samples = 5000 >>> n_features = 50 >>> n_clusters = 3 - >>> >>> X = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) diff --git a/python/pylibraft/pylibraft/common/handle.pyx b/python/pylibraft/pylibraft/common/handle.pyx index b4cdb9b0c1..7e3dc289e0 100644 --- a/python/pylibraft/pylibraft/common/handle.pyx +++ b/python/pylibraft/pylibraft/common/handle.pyx @@ -197,8 +197,8 @@ cdef class Handle(DeviceResources): _HANDLE_PARAM_DOCSTRING = """ - handle : Optional RAFT resource handle for reusing expensive CUDA - resources. If a handle isn't supplied, CUDA resources will be + handle : Optional RAFT resource handle for reusing CUDA resources. 
+        If a handle isn't supplied, CUDA resources will be
         allocated inside this function and synchronized before the
         function exits. If a handle is supplied, you will need to
         explicitly synchronize yourself by calling `handle.sync()`
diff --git a/python/pylibraft/pylibraft/neighbors/brute_force.pyx b/python/pylibraft/pylibraft/neighbors/brute_force.pyx
index 2d118072ab..4aa47b8a18 100644
--- a/python/pylibraft/pylibraft/neighbors/brute_force.pyx
+++ b/python/pylibraft/pylibraft/neighbors/brute_force.pyx
@@ -95,7 +95,6 @@ def knn(dataset, queries, k=None, indices=None, distances=None,
    distances : Optional array interface compliant matrix shape
                (n_queries, k), dtype float. If supplied, neighbor
                indices will be written here in-place. (default None)
-
    {handle_docstring}

    Returns
@@ -108,16 +107,12 @@ def knn(dataset, queries, k=None, indices=None, distances=None,
    Examples
    --------
-
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors.brute_force import knn
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> n_queries = 1000
-
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
    >>> # Search using the built index
diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
index 7d758a32ef..fbc1623cac 100644
--- a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
+++ b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
@@ -85,6 +85,25 @@ from pylibraft.neighbors.common cimport _get_metric_string


 cdef class IndexParams:
+    """
+    Parameters to build index for CAGRA nearest neighbor search
+
+    Parameters
+    ----------
+    metric : string denoting the metric type, default="sqeuclidean"
+        Valid values for metric: ["sqeuclidean"], where
+            - sqeuclidean is the euclidean distance without the square root
+              operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2
+    intermediate_graph_degree : int, default = 128
+
+    graph_degree : int, default = 64
+
+    add_data_on_build : bool, default = True
+        After training the coarse and fine quantizers, we will populate
+        the index with the dataset if add_data_on_build == True, otherwise
+        the index is left empty, and the extend method can be used
+        to add new vectors to the index.
+    """
     cdef c_cagra.index_params params

     def __init__(self, *,
                  metric="sqeuclidean",
                  intermediate_graph_degree=128,
                  graph_degree=64,
                  add_data_on_build=True):
-        """"
-        Parameters to build index for CAGRA nearest neighbor search
-
-        Parameters
-        ----------
-        metric : string denoting the metric type, default="sqeuclidean"
-            Valid values for metric: ["sqeuclidean", "inner_product",
-            "euclidean"], where
-            - sqeuclidean is the euclidean distance without the square root
-              operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2,
-            - euclidean is the euclidean distance
-            - inner product distance is defined as
-              distance(a, b) = \\sum_i a_i * b_i.
-        intermediate_graph_degree : int, default = 128
-
-        graph_degree : int, default = 64
-
-        add_data_on_build : bool, default = True
-            After training the coarse and fine quantizers, we will populate
-            the index with the dataset if add_data_on_build == True, otherwise
-            the index is left empty, and the extend method can be used
-            to add new vectors to the index.
- """ self.params.metric = _get_metric(metric) self.params.metric_arg = 0 self.params.intermediate_graph_degree = intermediate_graph_degree @@ -163,7 +159,7 @@ cdef class IndexFloat(Index): m_str = "metric=" + _get_metric_string(self.index.metric()) attr_str = [attr + "=" + str(getattr(self, attr)) for attr in ["metric", "dim", "graph_degree"]] - attr_str = m_str + attr_str + attr_str = [m_str] + attr_str return "Index(type=CAGRA, " + (", ".join(attr_str)) + ")" @property @@ -203,7 +199,7 @@ cdef class IndexInt8(Index): m_str = "metric=" + _get_metric_string(self.index.metric()) attr_str = [attr + "=" + str(getattr(self, attr)) for attr in ["metric", "dim", "graph_degree"]] - attr_str = m_str + attr_str + attr_str = [m_str] + attr_str return "Index(type=CAGRA, " + (", ".join(attr_str)) + ")" @property @@ -243,7 +239,7 @@ cdef class IndexUint8(Index): m_str = "metric=" + _get_metric_string(self.index.metric()) attr_str = [attr + "=" + str(getattr(self, attr)) for attr in ["metric", "dim", "graph_degree"]] - attr_str = m_str + attr_str + attr_str = [m_str] + attr_str return "Index(type=CAGRA, " + (", ".join(attr_str)) + ")" @property @@ -280,8 +276,8 @@ def build(IndexParams index_params, dataset, handle=None): It is required that both the dataset and the optimized graph fit the GPU memory. - The following distance metrics are supported: - - L2 + The following distance metrics are supported: + - L2 Parameters ---------- @@ -298,31 +294,23 @@ def build(IndexParams index_params, dataset, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import cagra - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 >>> k = 10 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> handle = DeviceResources() >>> build_params = cagra.IndexParams(metric="sqeuclidean") - >>> index = cagra.build(build_params, dataset, handle=handle) - >>> distances, neighbors = cagra.search(cagra.SearchParams(), ... index, dataset, ... k, handle=handle) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() - >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) """ @@ -415,6 +403,54 @@ def build(IndexParams index_params, dataset, handle=None): cdef class SearchParams: + """ + CAGRA search parameters + + Parameters + ---------- + max_queries: int, default = 0 + Maximum number of queries to search at the same time (batch size). + Auto select when 0. + itopk_size: int, default = 64 + Number of intermediate search results retained during the search. + This is the main knob to adjust trade off between accuracy and + search speed. Higher values improve the search accuracy. + max_iterations: int, default = 0 + Upper limit of search iterations. Auto select when 0. + algo: string denoting the search algorithm to use, default = "auto" + Valid values for algo: ["auto", "single_cta", "multi_cta"], where + - auto will automatically select the best value based on query size + - single_cta is better when query contains larger number of + vectors (e.g >10) + - multi_cta is better when query contains only a few vectors + team_size: int, default = 0 + Number of threads used to calculate a single distance. 4, 8, 16, + or 32. + search_width: int, default = 1 + Number of graph nodes to select as the starting point for the + search in each iteration. + min_iterations: int, default = 0 + Lower limit of search iterations. 
+    thread_block_size: int, default = 0
+        Thread block size. 0, 64, 128, 256, 512, 1024.
+        Auto selection when 0.
+    hashmap_mode: string denoting the type of hash map to use,
+        default = "auto". It's usually better to allow the algorithm
+        to select this value.
+        Valid values for hashmap_mode: ["auto", "small", "hash"], where
+            - auto will automatically select the best value based on algo
+            - small will use the small shared memory hash table with resetting.
+            - hash will use a single hash table in global memory.
+    hashmap_min_bitlen: int, default = 0
+        Lower limit of hash map bit length.
+    hashmap_max_fill_rate: float, default = 0.5
+        Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
+    num_random_samplings: int, default = 1
+        Number of iterations of initial random seed node selection. 1 or
+        more.
+    rand_xor_mask: int, default = 0x128394
+        Bit mask used for initial random seed node selection.
+    """
     cdef c_cagra.search_params params

     def __init__(self, *,
                  max_queries=0,
                  itopk_size=64,
                  max_iterations=0,
                  algo="auto",
                  team_size=0,
                  search_width=1,
                  min_iterations=0,
                  thread_block_size=0,
                  hashmap_mode="auto",
                  hashmap_min_bitlen=0,
                  hashmap_max_fill_rate=0.5,
                  num_random_samplings=1,
                  rand_xor_mask=0x128394):
-        """
-        CAGRA search parameters
-
-        Parameters
-        ----------
-        max_queries: int, default = 0
-            Maximum number of queries to search at the same time (batch size).
-            Auto select when 0.
-        itopk_size: int, default = 64
-            Number of intermediate search results retained during the search.
-            This is the main knob to adjust trade off between accuracy and
-            search speed. Higher values improve the search accuracy.
-        max_iterations: int, default = 0
-            Upper limit of search iterations. Auto select when 0.
-        algo: string denoting the search algorithm to use, default = "auto"
-            Valid values for algo: ["auto", "single_cta", "multi_cta"], where
-                - auto will automatically select the best value based on query size
-                - single_cta is better when query contains larger number of
-                  vectors (e.g >10)
-                - multi_cta is better when query contains only a few vectors
-        team_size: int, default = 0
-            Number of threads used to calculate a single distance. 4, 8, 16,
-            or 32.
-        search_width: int, default = 1
-            Number of graph nodes to select as the starting point for the
-            search in each iteration.
-        min_iterations: int, default = 0
-            Lower limit of search iterations.
-        thread_block_size: int, default = 0
-            Thread block size. 0, 64, 128, 256, 512, 1024.
-            Auto selection when 0.
-        hashmap_mode: string denoting the type of hash map to use. It's
-            usually better to allow the algorithm to select this value.,
-            default = "auto"
-            Valid values for hashmap_mode: ["auto", "small", "hash"], where
-                - auto will automatically select the best value based on algo
-                - small will use the small shared memory hash table with resetting.
-                - hash will use a single hash table in global memory.
-        hashmap_min_bitlen: int, default = 0
-            Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
-        hashmap_max_fill_rate: float, default = 0.5
-            Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
-        num_random_samplings: int, default = 1
-            Number of iterations of initial random seed node selection. 1 or
-            more.
-        rand_xor_mask: int, default = 0x128394
-            Bit mask used for initial random seed node selection.
- - - """ self.params.max_queries = max_queries self.params.itopk_size = itopk_size self.params.max_iterations = max_iterations @@ -514,9 +500,13 @@ cdef class SearchParams: self.params.rand_xor_mask = rand_xor_mask def __repr__(self): - # todo(dantegd): add all relevant attrs attr_str = [attr + "=" + str(getattr(self, attr)) - for attr in ["max_queries"]] + for attr in [ + "max_queries", "itopk_size", "max_iterations", "algo", + "team_size", "search_width", "min_iterations", + "thread_block_size", "hashmap_mode", + "hashmap_min_bitlen", "hashmap_max_fill_rate", + "num_random_samplings", "rand_xor_mask"]] return "SearchParams(type=CAGRA, " + (", ".join(attr_str)) + ")" @property @@ -604,20 +594,16 @@ def search(SearchParams search_params, Examples -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import cagra - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> # Build index >>> handle = DeviceResources() >>> index = cagra.build(cagra.IndexParams(), dataset, handle=handle) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) @@ -626,17 +612,14 @@ def search(SearchParams search_params, ... max_queries=100, ... itopk_size=64 ... ) - >>> # Using a pooling allocator reduces overhead of temporary array >>> # creation during search. This is useful if multiple searches >>> # are performad with same query size. >>> distances, neighbors = cagra.search(search_params, index, queries, ... k, handle=handle) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() - >>> neighbors = cp.asarray(neighbors) >>> distances = cp.asarray(distances) """ @@ -712,9 +695,9 @@ def search(SearchParams search_params, @auto_sync_handle def save(filename, Index index, handle=None): """ - Saves the index to file. + Saves the index to a file. - Saving / loading the index is. The serialization format is + Saving / loading the index is experimental. The serialization format is subject to change. Parameters @@ -728,19 +711,18 @@ def save(filename, Index index, handle=None): Examples -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import cagra - >>> n_samples = 50000 >>> n_features = 50 >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) - >>> # Build index >>> handle = DeviceResources() >>> index = cagra.build(cagra.IndexParams(), dataset, handle=handle) + >>> # Serialize and deserialize the cagra index built >>> cagra.save("my_index.bin", index, handle=handle) + >>> index_loaded = cagra.load("my_index.bin", handle=handle) """ if not index.trained: raise ValueError("Index need to be built before saving it.") @@ -778,7 +760,7 @@ def load(filename, handle=None): """ Loads index from file. - Saving / loading the index is. The serialization format is + Saving / loading the index is experimental. The serialization format is subject to change, therefore loading an index saved with a previous version of raft is not guaranteed to work. 
@@ -792,13 +774,6 @@ def load(filename, handle=None):
    -------
    index : Index

-    Examples
-    --------
-    >>> import cupy as cp
-
-    >>> from pylibraft.common import DeviceResources
-    >>> from pylibraft.neighbors import cagra
-
    """
    if handle is None:
        handle = DeviceResources()
@@ -810,11 +785,12 @@ def load(filename, handle=None):
    cdef IndexInt8 idx_int8
    cdef IndexUint8 idx_uint8

-    # we extract the dtype from the arrai interfaces in the file
+    # we extract the dtype from the array interfaces in the file
    with open(filename, 'rb') as f:
        type_str = f.read(700).decode("utf-8", errors='ignore')

-    dataset_dt = np.dtype(type_str[673:676])
+    # Read description of the 6th element to get the datatype
+    dataset_dt = np.dtype(type_str.split('descr')[6][5:7])

    if dataset_dt == np.float32:
        idx_float = IndexFloat(handle)
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx b/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx
index e265bee23b..d8fbdc74da 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx
+++ b/python/pylibraft/pylibraft/neighbors/ivf_flat/ivf_flat.pyx
@@ -75,6 +75,45 @@ from pylibraft.neighbors.ivf_flat.cpp.c_ivf_flat cimport (


 cdef class IndexParams:
+    """
+    Parameters to build index for IVF-FLAT nearest neighbor search
+
+    Parameters
+    ----------
+    n_lists : int, default = 1024
+        The number of clusters used in the coarse quantizer.
+    metric : string denoting the metric type, default="sqeuclidean"
+        Valid values for metric: ["sqeuclidean", "inner_product",
+        "euclidean"], where
+            - sqeuclidean is the euclidean distance without the square root
+              operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2,
+            - euclidean is the euclidean distance
+            - inner product distance is defined as
+              distance(a, b) = \\sum_i a_i * b_i.
+    kmeans_n_iters : int, default = 20
+        The number of iterations searching for kmeans centers during index
+        building.
+    kmeans_trainset_fraction : float, default = 0.5
+        If kmeans_trainset_fraction is less than 1, then the dataset is
+        subsampled, and only n_samples * kmeans_trainset_fraction rows
+        are used for training.
+    add_data_on_build : bool, default = True
+        After training the coarse and fine quantizers, we will populate
+        the index with the dataset if add_data_on_build == True, otherwise
+        the index is left empty, and the extend method can be used
+        to add new vectors to the index.
+    adaptive_centers : bool, default = False
+        By default (adaptive_centers = False), the cluster centers are
+        trained in `ivf_flat::build`, and never modified in
+        `ivf_flat::extend`. The alternative behavior (adaptive_centers
+        = true) is to update the cluster centers for new data when it is
+        added. In this case, `index.centers()` are always exactly the
+        centroids of the data in the corresponding clusters. The drawback
+        of this behavior is that the centroids depend on the order of
+        adding new data (through the classification of the added data);
+        that is, `index.centers()` "drift" together with the changing
+        distribution of the newly added data.
+    """
     cdef c_ivf_flat.index_params params

     def __init__(self, *,
@@ -84,45 +123,6 @@ cdef class IndexParams:
                  kmeans_trainset_fraction=0.5,
                  add_data_on_build=True,
                  bool adaptive_centers=False):
-        """"
-        Parameters to build index for IVF-FLAT nearest neighbor search
-
-        Parameters
-        ----------
-        n_list : int, default = 1024
-            The number of clusters used in the coarse quantizer.
- metric : string denoting the metric type, default="sqeuclidean" - Valid values for metric: ["sqeuclidean", "inner_product", - "euclidean"], where - - sqeuclidean is the euclidean distance without the square root - operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2, - - euclidean is the euclidean distance - - inner product distance is defined as - distance(a, b) = \\sum_i a_i * b_i. - kmeans_n_iters : int, default = 20 - The number of iterations searching for kmeans centers during index - building. - kmeans_trainset_fraction : int, default = 0.5 - If kmeans_trainset_fraction is less than 1, then the dataset is - subsampled, and only n_samples * kmeans_trainset_fraction rows - are used for training. - add_data_on_build : bool, default = True - After training the coarse and fine quantizers, we will populate - the index with the dataset if add_data_on_build == True, otherwise - the index is left empty, and the extend method can be used - to add new vectors to the index. - adaptive_centers : bool, default = False - By default (adaptive_centers = False), the cluster centers are - trained in `ivf_flat::build`, and and never modified in - `ivf_flat::extend`. The alternative behavior (adaptive_centers - = true) is to update the cluster centers for new data when it is - added. In this case, `index.centers()` are always exactly the - centroids of the data in the corresponding clusters. The drawback - of this behavior is that the centroids depend on the order of - adding new data (through the classification of the added data); - that is, `index.centers()` "drift" together with the changing - distribution of the newly added data. - """ self.params.n_lists = n_lists self.params.metric = _get_metric(metric) self.params.metric_arg = 0 @@ -333,33 +333,27 @@ def build(IndexParams index_params, dataset, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_flat - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() >>> index_params = ivf_flat.IndexParams( ... n_lists=1024, ... metric="sqeuclidean") - >>> index = ivf_flat.build(index_params, dataset, handle=handle) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) >>> k = 10 - >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(), index, - ... queries, k, handle=handle) - + >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(), + ... index, queries, k, + ... handle=handle) >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() @@ -439,25 +433,21 @@ def extend(Index index, new_vectors, new_indices, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_flat - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() - >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, handle=handle) - + >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, + ... handle=handle) >>> n_rows = 100 >>> more_data = cp.random.random_sample((n_rows, n_features), ... 
dtype=cp.float32)
    >>> indices = index.size + cp.arange(n_rows, dtype=cp.int64)
    >>> index = ivf_flat.extend(index, more_data, indices)
-
    >>> # Search using the built index
    >>> queries = cp.random.random_sample((n_queries, n_features),
    ...                                   dtype=cp.float32)
    >>> k = 10
@@ -465,7 +455,6 @@ def extend(Index index, new_vectors, new_indices, handle=None):
    >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(),
    ...                                        index, queries,
    ...                                        k, handle=handle)
-
    >>> # pylibraft functions are often asynchronous so the
    >>> # handle needs to be explicitly synchronized
    >>> handle.sync()
@@ -540,17 +529,17 @@ def extend(Index index, new_vectors, new_indices, handle=None):


 cdef class SearchParams:
+    """
+    IVF-FLAT search parameters
+
+    Parameters
+    ----------
+    n_probes: int, default = 20
+        The number of coarse clusters to select for the fine search.
+    """
     cdef c_ivf_flat.search_params params

     def __init__(self, *, n_probes=20):
-        """
-        IVF-FLAT search parameters
-
-        Parameters
-        ----------
-        n_probes: int, default = 1024
-            The number of course clusters to select for the fine search.
-        """
        self.params.n_probes = n_probes

    def __repr__(self):
@@ -595,20 +584,17 @@ def search(SearchParams search_params,
    Examples
    --------
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_flat
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> n_queries = 1000
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
-
    >>> # Build index
    >>> handle = DeviceResources()
-    >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, handle=handle)
-
+    >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset,
+    ...                        handle=handle)
    >>> # Search using the built index
    >>> queries = cp.random.random_sample((n_queries, n_features),
    ...                                   dtype=cp.float32)
@@ -616,13 +602,11 @@ def search(SearchParams search_params,
    >>> search_params = ivf_flat.SearchParams(
    ...     n_probes=20
    ... )
-    >>> distances, neighbors = ivf_flat.search(search_params, index, queries,
-    ...                                        k, handle=handle)
-
+    >>> distances, neighbors = ivf_flat.search(search_params, index,
+    ...                                        queries, k, handle=handle)
    >>> # pylibraft functions are often asynchronous so the
    >>> # handle needs to be explicitly synchronized
    >>> handle.sync()
-
    >>> neighbors = cp.asarray(neighbors)
    >>> distances = cp.asarray(distances)
    """
@@ -697,7 +681,7 @@ def search(SearchParams search_params,
 @auto_sync_handle
 def save(filename, Index index, handle=None):
    """
-    Saves the index to file.
+    Saves the index to a file.

    Saving / loading the index is experimental. The serialization format is
    subject to change.
@@ -713,18 +697,16 @@ def save(filename, Index index, handle=None):
    Examples
    --------
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_flat
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
-
    >>> # Build index
    >>> handle = DeviceResources()
-    >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, handle=handle)
+    >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset,
+    ...                        handle=handle)
    >>> ivf_flat.save("my_index.bin", index, handle=handle)
    """
    if not index.trained:
@@ -761,7 +743,7 @@ def save(filename, Index index, handle=None):
 @auto_sync_handle
 def load(filename, handle=None):
    """
-    Loads index from file.
+    Loads index from a file.

    Saving / loading the index is experimental. The serialization format is
    subject to change, therefore loading an index saved with a previous
@@ -780,29 +762,26 @@ def load(filename, handle=None):
    Examples
    --------
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_flat
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
-
    >>> # Build and save index
    >>> handle = DeviceResources()
-    >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset, handle=handle)
+    >>> index = ivf_flat.build(ivf_flat.IndexParams(), dataset,
+    ...                        handle=handle)
    >>> ivf_flat.save("my_index.bin", index, handle=handle)
    >>> del index
-
    >>> n_queries = 100
    >>> queries = cp.random.random_sample((n_queries, n_features),
    ...                                   dtype=cp.float32)
    >>> handle = DeviceResources()
    >>> index = ivf_flat.load("my_index.bin", handle=handle)
-
-    >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(), index,
-    ...                                        queries, k=10, handle=handle)
+    >>> distances, neighbors = ivf_flat.search(ivf_flat.SearchParams(),
+    ...                                        index, queries, k=10,
+    ...                                        handle=handle)
    """
    if handle is None:
        handle = DeviceResources()
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
index 413a9a1d4b..0c1bbf6b9c 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
+++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/ivf_pq.pyx
@@ -95,7 +95,68 @@ cdef _get_dtype_string(dtype):


 cdef class IndexParams:
+    """
+    Parameters to build index for IVF-PQ nearest neighbor search
+
+    Parameters
+    ----------
+    n_lists : int, default = 1024
+        The number of clusters used in the coarse quantizer.
+    metric : string denoting the metric type, default="sqeuclidean"
+        Valid values for metric: ["sqeuclidean", "inner_product",
+        "euclidean"], where
+            - sqeuclidean is the euclidean distance without the square root
+              operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2,
+            - euclidean is the euclidean distance
+            - inner product distance is defined as
+              distance(a, b) = \\sum_i a_i * b_i.
+    kmeans_n_iters : int, default = 20
+        The number of iterations searching for kmeans centers during index
+        building.
+    kmeans_trainset_fraction : float, default = 0.5
+        If kmeans_trainset_fraction is less than 1, then the dataset is
+        subsampled, and only n_samples * kmeans_trainset_fraction rows
+        are used for training.
+    pq_bits : int, default = 8
+        The bit length of the vector element after quantization.
+    pq_dim : int, default = 0
+        The dimensionality of the vector after product quantization.
+        When zero, an optimal value is selected using a heuristic. Note
+        pq_dim * pq_bits must be a multiple of 8. Hint: a smaller 'pq_dim'
+        results in a smaller index size and better search performance, but
+        lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number,
+        but multiples of 8 are desirable for good performance. If 'pq_bits'
+        is not 8, 'pq_dim' should be a multiple of 8. For good performance,
+        it is desirable that 'pq_dim' is a multiple of 32. Ideally,
+        'pq_dim' should also be a divisor of the dataset dim.
+    codebook_kind : string, default = "subspace"
+        Valid values ["subspace", "cluster"]
+    force_random_rotation : bool, default = False
+        Apply a random rotation matrix on the input data and queries even
+        if `dim % pq_dim == 0`. Note: if `dim` is not a multiple of
+        `pq_dim`, a random rotation is always applied to the input data
+        and queries to transform the working space from `dim` to
+        `rot_dim`, which may be slightly larger than the original space
+        and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`). However,
+        this transform is not necessary when `dim` is a multiple of
+        `pq_dim` (`dim == rot_dim`, hence no need in adding "extra" data
+        columns / features). By default, if `dim == rot_dim`, the rotation
+        transform is initialized with the identity matrix. When
+        `force_random_rotation == True`, a random orthogonal transform
+        matrix is generated regardless of the values of `dim` and `pq_dim`.
+    add_data_on_build : bool, default = True
+        After training the coarse and fine quantizers, we will populate
+        the index with the dataset if add_data_on_build == True, otherwise
+        the index is left empty, and the extend method can be used
+        to add new vectors to the index.
+    conservative_memory_allocation : bool, default = False
+        By default, the algorithm allocates more space than necessary for
+        individual clusters (`list_data`). This allows amortizing the cost
+        of memory allocation and reduces the number of data copies during
+        repeated calls to `extend` (extending the database).
+        To disable this behavior and use as little GPU memory for the
+        database as possible, set this flag to `True`.
+    """
     def __init__(self, *,
                  n_lists=1024,
                  metric="sqeuclidean",
@@ -107,69 +168,6 @@ cdef class IndexParams:
                  force_random_rotation=False,
                  add_data_on_build=True,
                  conservative_memory_allocation=False):
-        """"
-        Parameters to build index for IVF-PQ nearest neighbor search
-
-        Parameters
-        ----------
-        n_list : int, default = 1024
-            The number of clusters used in the coarse quantizer.
-        metric : string denoting the metric type, default="sqeuclidean"
-            Valid values for metric: ["sqeuclidean", "inner_product",
-            "euclidean"], where
-            - sqeuclidean is the euclidean distance without the square root
-              operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2,
-            - euclidean is the euclidean distance
-            - inner product distance is defined as
-              distance(a, b) = \\sum_i a_i * b_i.
-        kmeans_n_iters : int, default = 20
-            The number of iterations searching for kmeans centers during index
-            building.
-        kmeans_trainset_fraction : int, default = 0.5
-            If kmeans_trainset_fraction is less than 1, then the dataset is
-            subsampled, and only n_samples * kmeans_trainset_fraction rows
-            are used for training.
-        pq_bits : int, default = 8
-            The bit length of the vector element after quantization.
-        pq_dim : int, default = 0
-            The dimensionality of a the vector after product quantization.
-            When zero, an optimal value is selected using a heuristic. Note
-            pq_dim * pq_bits must be a multiple of 8. Hint: a smaller 'pq_dim'
-            results in a smaller index size and better search performance, but
-            lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number,
-            but multiple of 8 are desirable for good performance. If 'pq_bits'
-            is not 8, 'pq_dim' should be a multiple of 8. For good performance,
-            it is desirable that 'pq_dim' is a multiple of 32. Ideally,
-            'pq_dim' should be also a divisor of the dataset dim.
-        codebook_kind : string, default = "subspace"
-            Valid values ["subspace", "cluster"]
-        force_random_rotation : bool, default = False
-            Apply a random rotation matrix on the input data and queries even
-            if `dim % pq_dim == 0`.
Note: if `dim` is not multiple of `pq_dim`, - a random rotation is always applied to the input data and queries - to transform the working space from `dim` to `rot_dim`, which may - be slightly larger than the original space and and is a multiple - of `pq_dim` (`rot_dim % pq_dim == 0`). However, this transform is - not necessary when `dim` is multiple of `pq_dim` (`dim == rot_dim`, - hence no need in adding "extra" data columns / features). By - default, if `dim == rot_dim`, the rotation transform is - initialized with the identity matrix. When - `force_random_rotation == True`, a random orthogonal transform - matrix is generated regardless of the values of `dim` and `pq_dim`. - add_data_on_build : bool, default = True - After training the coarse and fine quantizers, we will populate - the index with the dataset if add_data_on_build == True, otherwise - the index is left empty, and the extend method can be used - to add new vectors to the index. - conservative_memory_allocation : bool, default = True - By default, the algorithm allocates more space than necessary for - individual clusters (`list_data`). This allows to amortize the cost - of memory allocation and reduce the number of data copies during - repeated calls to `extend` (extending the database). - To disable this behavior and use as little GPU memory for the - database as possible, set this flat to `True`. - - """ self.params.n_lists = n_lists self.params.metric = _get_metric(metric) self.params.metric_arg = 0 @@ -333,14 +331,11 @@ def build(IndexParams index_params, dataset, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_pq - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() @@ -349,17 +344,14 @@ def build(IndexParams index_params, dataset, handle=None): ... metric="sqeuclidean", ... pq_dim=10) >>> index = ivf_pq.build(index_params, dataset, handle=handle) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) >>> k = 10 >>> distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), index, ... queries, k, handle=handle) - >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync() @@ -433,25 +425,20 @@ def extend(Index index, new_vectors, new_indices, handle=None): -------- >>> import cupy as cp - >>> from pylibraft.common import DeviceResources >>> from pylibraft.neighbors import ivf_pq - >>> n_samples = 50000 >>> n_features = 50 >>> n_queries = 1000 - >>> dataset = cp.random.random_sample((n_samples, n_features), ... dtype=cp.float32) >>> handle = DeviceResources() >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle) - >>> n_rows = 100 >>> more_data = cp.random.random_sample((n_rows, n_features), ... dtype=cp.float32) >>> indices = index.size + cp.arange(n_rows, dtype=cp.int64) >>> index = ivf_pq.extend(index, more_data, indices) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) @@ -459,11 +446,9 @@ def extend(Index index, new_vectors, new_indices, handle=None): >>> distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), ... index, queries, ... 
k, handle=handle)
-
    >>> # pylibraft functions are often asynchronous so the
    >>> # handle needs to be explicitly synchronized
    >>> handle.sync()
-
    >>> distances = cp.asarray(distances)
    >>> neighbors = cp.asarray(neighbors)
    """
@@ -520,29 +505,27 @@ def extend(Index index, new_vectors, new_indices, handle=None):


 cdef class SearchParams:
+    """
+    IVF-PQ search parameters
+
+    Parameters
+    ----------
+    n_probes: int, default = 20
+        The number of coarse clusters to select for the fine search.
+    lut_dtype: default = np.float32
+        Data type of look-up table to be created dynamically at search
+        time. The use of low-precision types reduces the amount of shared
+        memory required at search time, so fast shared memory kernels can
+        be used even for datasets with large dimensionality. Note that
+        the recall is slightly degraded when low-precision type is
+        selected. Possible values [np.float32, np.float16, np.uint8]
+    internal_distance_dtype: default = np.float32
+        Storage data type for distance/similarity computation.
+        Possible values [np.float32, np.float16]
+    """
     def __init__(self, *,
                  n_probes=20,
                  lut_dtype=np.float32,
                  internal_distance_dtype=np.float32):
-        """
-        IVF-PQ search parameters
-
-        Parameters
-        ----------
-        n_probes: int, default = 1024
-            The number of course clusters to select for the fine search.
-        lut_dtype: default = np.float32
-            Data type of look up table to be created dynamically at search
-            time. The use of low-precision types reduces the amount of shared
-            memory required at search time, so fast shared memory kernels can
-            be used even for datasets with large dimansionality. Note that
-            the recall is slightly degraded when low-precision type is
-            selected. Possible values [np.float32, np.float16, np.uint8]
-        internal_distance_dtype: default = np.float32
-            Storage data type for distance/similarity computation.
-            Possible values [np.float32, np.float16]
-        """
-
        self.params.n_probes = n_probes
        self.params.lut_dtype = _map_dtype_np_to_cuda(lut_dtype)
        self.params.internal_distance_dtype = \
            _map_dtype_np_to_cuda(internal_distance_dtype)
@@ -611,20 +594,16 @@ def search(SearchParams search_params,
    Examples
    --------
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_pq
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> n_queries = 1000
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
-
    >>> # Build index
    >>> handle = DeviceResources()
    >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
-
    >>> # Search using the built index
    >>> queries = cp.random.random_sample((n_queries, n_features),
    ...                                   dtype=cp.float32)
@@ -634,7 +613,6 @@ def search(SearchParams search_params,
    ...     lut_dtype=cp.float16,
    ...     internal_distance_dtype=cp.float32
    ... )
-
    >>> # Using a pooling allocator reduces overhead of temporary array
    >>> # creation during search. This is useful if multiple searches
    >>> # are performad with same query size.
@@ -647,11 +625,9 @@ def search(SearchParams search_params,
    >>> distances, neighbors = ivf_pq.search(search_params, index, queries,
    ...                                      k, memory_resource=mr,
    ...                                      handle=handle)
-
    >>> # pylibraft functions are often asynchronous so the
    >>> # handle needs to be explicitly synchronized
    >>> handle.sync()
-
    >>> neighbors = cp.asarray(neighbors)
    >>> distances = cp.asarray(distances)
    """
@@ -728,7 +704,7 @@ def search(SearchParams search_params,
 @auto_sync_handle
 def save(filename, Index index, handle=None):
    """
-    Saves the index to file.
+    Saves the index to a file.

    Saving / loading the index is experimental. The serialization format is
    subject to change.
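[Editor's note: the `pq_dim`/`pq_bits` constraints documented in `IndexParams` above are easy to get wrong, so here is a small sketch that encodes only the rules stated in that docstring. `check_pq_params` is a hypothetical helper for illustration, not a pylibraft API and not part of the patch.]

```python
# Editor's sketch: encodes the pq_dim/pq_bits rules from the IndexParams
# docstring above. `check_pq_params` is hypothetical, not a pylibraft API.
def check_pq_params(pq_dim: int, pq_bits: int) -> None:
    # pq_dim * pq_bits must be a multiple of 8.
    if (pq_dim * pq_bits) % 8 != 0:
        raise ValueError("pq_dim * pq_bits must be a multiple of 8")
    # If pq_bits is not 8, pq_dim should be a multiple of 8.
    if pq_bits != 8 and pq_dim % 8 != 0:
        raise ValueError("pq_dim should be a multiple of 8 when pq_bits != 8")
    # For good performance, a multiple of 32 is desirable.
    if pq_dim % 32 != 0:
        print("hint: a pq_dim that is a multiple of 32 usually performs best")

check_pq_params(pq_dim=10, pq_bits=8)  # valid: 10 * 8 = 80 is a multiple of 8
```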
@@ -744,15 +720,12 @@ def save(filename, Index index, handle=None):
    Examples
    --------
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_pq
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
-
    >>> # Build index
    >>> handle = DeviceResources()
    >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
@@ -774,7 +747,7 @@ def save(filename, Index index, handle=None):
 @auto_sync_handle
 def load(filename, handle=None):
    """
-    Loads index from file.
+    Loads index from a file.

    Saving / loading the index is experimental. The serialization format is
    subject to change, therefore loading an index saved with a previous
@@ -793,27 +766,22 @@ def load(filename, handle=None):
    Examples
    --------
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_pq
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...                                   dtype=cp.float32)
-
    >>> # Build and save index
    >>> handle = DeviceResources()
    >>> index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
    >>> ivf_pq.save("my_index.bin", index, handle=handle)
    >>> del index
-
    >>> n_queries = 100
    >>> queries = cp.random.random_sample((n_queries, n_features),
    ...                                   dtype=cp.float32)
    >>> handle = DeviceResources()
    >>> index = ivf_pq.load("my_index.bin", handle=handle)
-
    >>> distances, neighbors = ivf_pq.search(ivf_pq.SearchParams(), index,
    ...                                      queries, k=10, handle=handle)
    """
diff --git a/python/pylibraft/pylibraft/neighbors/refine.pyx b/python/pylibraft/pylibraft/neighbors/refine.pyx
index 5e57da713c..a9bf811c9f 100644
--- a/python/pylibraft/pylibraft/neighbors/refine.pyx
+++ b/python/pylibraft/pylibraft/neighbors/refine.pyx
@@ -192,19 +192,19 @@ def refine(dataset, queries, candidates, k=None, indices=None, distances=None,
    queries : array interface compliant matrix, shape (n_queries, dim)
        Supported dtype [float, int8, uint8]
    candidates : array interface compliant matrix, shape (n_queries, k0)
-        dtype int64
+        Supported dtype int64
    k : int
        Number of neighbors to search (k <= k0). Optional if indices or
        distances arrays are given (in which case their second dimension
        is k).
-    indices : Optional array interface compliant matrix shape
-              (n_queries, k), dtype int64. If supplied, neighbor
-              indices will be written here in-place. (default None)
-        Supported dtype int64
-    distances : Optional array interface compliant matrix shape
-                (n_queries, k), dtype float. If supplied, neighbor
-                indices will be written here in-place. (default None)
-
+    indices : Optional array interface compliant matrix shape \
+        (n_queries, k).
+        If supplied, neighbor indices will be written here in-place.
+        (default None). Supported dtype int64.
+    distances : Optional array interface compliant matrix shape \
+        (n_queries, k).
+        If supplied, neighbor distances will be written here in-place.
+        (default None). Supported dtype float.
    {handle_docstring}

    Returns
@@ -213,36 +213,30 @@ def refine(dataset, queries, candidates, k=None, indices=None, distances=None,
    Examples
    --------
-
    >>> import cupy as cp
-
    >>> from pylibraft.common import DeviceResources
    >>> from pylibraft.neighbors import ivf_pq, refine
-
    >>> n_samples = 50000
    >>> n_features = 50
    >>> n_queries = 1000
-
    >>> dataset = cp.random.random_sample((n_samples, n_features),
    ...
dtype=cp.float32) >>> handle = DeviceResources() - >>> index_params = ivf_pq.IndexParams(n_lists=1024, metric="sqeuclidean", + >>> index_params = ivf_pq.IndexParams(n_lists=1024, + ... metric="sqeuclidean", ... pq_dim=10) >>> index = ivf_pq.build(index_params, dataset, handle=handle) - >>> # Search using the built index >>> queries = cp.random.random_sample((n_queries, n_features), ... dtype=cp.float32) >>> k = 40 >>> _, candidates = ivf_pq.search(ivf_pq.SearchParams(), index, ... queries, k, handle=handle) - >>> k = 10 >>> distances, neighbors = refine(dataset, queries, candidates, k, ... handle=handle) >>> distances = cp.asarray(distances) >>> neighbors = cp.asarray(neighbors) - >>> # pylibraft functions are often asynchronous so the >>> # handle needs to be explicitly synchronized >>> handle.sync()
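[Editor's note: the `indices`/`distances` parameter docs above mention only in passing that refine can write into preallocated output arrays, in which case `k` is inferred from their second dimension. The sketch below is an editor's illustration of that mode, built only from the APIs shown in this patch; it is not part of the diff.]

```python
# Editor's sketch (not part of the patch): preallocating refine's outputs.
import cupy as cp
from pylibraft.common import DeviceResources
from pylibraft.neighbors import ivf_pq, refine

dataset = cp.random.random_sample((50000, 50), dtype=cp.float32)
queries = cp.random.random_sample((1000, 50), dtype=cp.float32)
handle = DeviceResources()
index = ivf_pq.build(ivf_pq.IndexParams(), dataset, handle=handle)
# Over-fetch k0=40 candidates with IVF-PQ, then refine down to k=10.
_, candidates = ivf_pq.search(ivf_pq.SearchParams(), index, queries, 40,
                              handle=handle)
indices = cp.empty((1000, 10), dtype=cp.int64)
distances = cp.empty((1000, 10), dtype=cp.float32)
# k is omitted: it is inferred from the preallocated arrays' second dimension,
# and the results are written in-place into `indices` and `distances`.
refine(dataset, queries, candidates, indices=indices, distances=distances,
       handle=handle)
handle.sync()
```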