Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cython-lint configuration. #5439

Merged
merged 15 commits into from
Jul 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ repos:
types_or: [python, cython]
exclude: thirdparty
additional_dependencies: [flake8-force]
- repo: https://github.com/MarcoGorelli/cython-lint
rev: v0.15.0
hooks:
- id: cython-lint
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v16.0.1
hooks:
Expand Down
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ please see the `.pre-commit-config.yaml` file.
- `clang-format`: Formats C++ and CUDA code for consistency and readability.
- `black`: Auto-formats Python code to conform to the PEP 8 style guide.
- `flake8`: Lints Python code for syntax errors and common code style issues.
- `cython-lint`: Lints Cython code for syntax errors and common code style issues.
- _`DeprecationWarning` checker_: Checks for new `DeprecationWarning` being
introduced in Python code, and instead `FutureWarning` should be used.
- _`#include` syntax checker_: Ensures consistent syntax for C++ `#include` statements.
Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,8 @@ ignore-words-list = "inout,numer,startd,couldn,referr"
builtin = "clear"
# disable warnings about binary files and wrong encoding
quiet-level = 3

[tool.cython-lint]
# TODO: Re-enable E501 with a reasonable line length
max-line-length = 999
bdice marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to re-enable E501 with a reasonable line length limit (following our flake8/black conventions) in a follow-up PR. Wrapping lines takes a little more manual effort than I wanted to put in for this first pass.

ignore = ['E501']
4 changes: 2 additions & 2 deletions python/cuml/cluster/cpp/kmeans.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -167,4 +167,4 @@ cdef extern from "cuml/cluster/kmeans.hpp" namespace "ML::kmeans":
const double *X,
int64_t n_samples,
int64_t n_features,
double *X_new) except +
double *X_new) except +
8 changes: 2 additions & 6 deletions python/cuml/cluster/dbscan.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -16,15 +16,13 @@

# distutils: language = c++

import ctypes
from cuml.internals.safe_imports import cpu_only_import
np = cpu_only_import('numpy')
from cuml.internals.safe_imports import gpu_only_import
cp = gpu_only_import('cupy')

from libcpp cimport bool
from libc.stdint cimport uintptr_t, int64_t
from libc.stdlib cimport calloc, malloc, free

from cuml.internals.array import CumlArray
from cuml.internals.base import Base
Expand All @@ -37,8 +35,6 @@ from cuml.internals.mixins import ClusterMixin
from cuml.internals.mixins import CMajorInputTagMixin
from cuml.metrics.distance_type cimport DistanceType

from collections import defaultdict

cdef extern from "cuml/cluster/dbscan.hpp" \
namespace "ML::Dbscan":

Expand Down Expand Up @@ -345,7 +341,7 @@ class DBSCAN(Base,
# make sure that the `fit` is complete before the following
# delete call happens
self.handle.sync()
del(X_m)
del X_m

# Finally, resize the core_sample_indices array if necessary
if self.calc_core_sample_indices:
Expand Down
7 changes: 3 additions & 4 deletions python/cuml/cluster/hdbscan/hdbscan.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ from cuml.internals.api_decorators import device_interop_preparation
from cuml.internals.api_decorators import enable_device_interop
from cuml.internals.mixins import ClusterMixin
from cuml.internals.mixins import CMajorInputTagMixin
from cuml.internals import logger
from cuml.internals.import_utils import has_hdbscan

import cuml
Expand Down Expand Up @@ -257,7 +256,7 @@ def condense_hierarchy(dendrogram,
new CondensedHierarchy[int, float](
handle_[0], <size_t>n_leaves)

children, n_rows, _, _ = \
children, _, _, _ = \
input_to_cuml_array(dendrogram[:, 0:2].astype('int32'), order='C',
check_dtype=[np.int32],
convert_to_dtype=(np.int32))
Expand Down Expand Up @@ -457,7 +456,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
A score of how persistent each cluster is. A score of 1.0 represents
a perfectly stable cluster that persists over all distance scales,
while a score of 0.0 represents a perfectly ephemeral cluster. These
scores can be used to gauge the relative coherence of the
scores can be used to gauge the relative coherence of the
clusters output by the algorithm.

condensed_tree_ : CondensedTree object
Expand Down Expand Up @@ -1026,7 +1025,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):

cdef handle_t* handle_ = <handle_t*><size_t>self.handle.getHandle()

self.X_m, self.n_rows, self.n_cols, dtype = \
self.X_m, self.n_rows, self.n_cols, _ = \
input_to_cuml_array(self._cpu_model._raw_data, order='C',
check_dtype=[np.float32],
convert_to_dtype=(np.float32
Expand Down
17 changes: 5 additions & 12 deletions python/cuml/cluster/hdbscan/prediction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,13 @@ from cuml.internals.safe_imports import gpu_only_import
cp = gpu_only_import('cupy')

from cuml.internals.array import CumlArray
from cuml.internals.base import Base
from cuml.common.doc_utils import generate_docstring
from pylibraft.common.handle cimport handle_t

from pylibraft.common.handle import Handle
from cuml.common import (
input_to_cuml_array,
input_to_host_array
)
from cuml.common.array_descriptor import CumlArrayDescriptor
from cuml.internals.available_devices import is_cuda_available
from cuml.internals.device_type import DeviceType
from cuml.internals.mixins import ClusterMixin
from cuml.internals.mixins import CMajorInputTagMixin
from cuml.internals import logger
from cuml.internals.import_utils import has_hdbscan

Expand Down Expand Up @@ -96,7 +89,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML":
DistanceType metric,
float* membership_vec,
size_t batch_size)

void compute_membership_vector(
const handle_t& handle,
CondensedHierarchy[int, float] &condensed_tree,
Expand All @@ -107,7 +100,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML":
int min_samples,
DistanceType metric,
float* membership_vec,
size_t batch_size);
size_t batch_size)

void out_of_sample_predict(const handle_t &handle,
CondensedHierarchy[int, float] &condensed_tree,
Expand Down Expand Up @@ -250,7 +243,7 @@ def membership_vector(clusterer, points_to_predict, batch_size=4096, convert_dty
The new data points to predict cluster labels for. They should
have the same dimensionality as the original dataset over which
clusterer was fit.

batch_size : int, optional, default=min(4096, n_points_to_predict)
Lowers memory requirement by computing distance-based membership
in smaller batches of points in the prediction data. For example, a
Expand Down Expand Up @@ -308,7 +301,7 @@ def membership_vector(clusterer, points_to_predict, batch_size=4096, convert_dty
convert_to_dtype=(np.float32
if convert_dtype
else None))

if clusterer.n_clusters_ == 0:
return np.zeros(n_prediction_points, dtype=np.float32)

Expand All @@ -317,7 +310,7 @@ def membership_vector(clusterer, points_to_predict, batch_size=4096, convert_dty

cdef uintptr_t prediction_ptr = points_to_predict_m.ptr
cdef uintptr_t input_ptr = clusterer.X_m.ptr

membership_vec = CumlArray.empty(
(n_prediction_points * clusterer.n_clusters_,),
dtype="float32")
Expand Down
25 changes: 10 additions & 15 deletions python/cuml/cluster/kmeans.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -16,24 +16,21 @@

# distutils: language = c++

import ctypes
from cuml.internals.safe_imports import cpu_only_import
np = cpu_only_import('numpy')
from cuml.internals.safe_imports import gpu_only_import
rmm = gpu_only_import('rmm')
import warnings
import typing

from cython.operator cimport dereference as deref
from libcpp cimport bool
from libc.stdint cimport uintptr_t, int64_t
from libc.stdlib cimport calloc, malloc, free
from libc.stdlib cimport calloc, free

from cuml.cluster.cpp.kmeans cimport fit_predict as cpp_fit_predict
from cuml.cluster.cpp.kmeans cimport predict as cpp_predict
from cuml.cluster.cpp.kmeans cimport transform as cpp_transform
from cuml.cluster.cpp.kmeans cimport KMeansParams
from cuml.cluster.cpp.kmeans cimport InitMethod

from cuml.internals.array import CumlArray
from cuml.common.array_descriptor import CumlArrayDescriptor
Expand Down Expand Up @@ -246,7 +243,7 @@ class KMeans(Base,
else:
self.init = 'preset'
self._params_init = Array
self.cluster_centers_, n_rows, self.n_cols, self.dtype = \
self.cluster_centers_, _n_rows, self.n_cols, self.dtype = \
input_to_cuml_array(init, order='C',
check_dtype=[np.float32, np.float64])

Expand Down Expand Up @@ -369,8 +366,8 @@ class KMeans(Base,
' passed.')

self.handle.sync()
del(X_m)
del(sample_weight_m)
del X_m
del sample_weight_m
free(params)
return self

Expand Down Expand Up @@ -417,7 +414,7 @@ class KMeans(Base,
Sum of squared distances of samples to their closest cluster center.
"""

X_m, n_rows, n_cols, dtype = \
X_m, n_rows, n_cols, _ = \
input_to_cuml_array(X, order='C', check_dtype=self.dtype,
convert_to_dtype=(self.dtype if convert_dtype
else None),
Expand Down Expand Up @@ -452,8 +449,6 @@ class KMeans(Base,
cdef KMeansParams* params = \
<KMeansParams*><size_t>self._get_kmeans_params()

cur_int_dtype = labels_.dtype

if self.dtype == np.float32:
if int_dtype == np.int32:
cpp_predict(
Expand Down Expand Up @@ -515,8 +510,8 @@ class KMeans(Base,
' passed.')

self.handle.sync()
del(X_m)
del(sample_weight_m)
del X_m
del sample_weight_m
free(params)
return labels_, inertia

Expand Down Expand Up @@ -548,7 +543,7 @@ class KMeans(Base,

"""

X_m, n_rows, n_cols, dtype = \
X_m, n_rows, _n_cols, _dtype = \
input_to_cuml_array(X, order='C', check_dtype=self.dtype,
convert_to_dtype=(self.dtype if convert_dtype
else None),
Expand Down Expand Up @@ -620,7 +615,7 @@ class KMeans(Base,

self.handle.sync()

del(X_m)
del X_m
free(params)
return preds

Expand Down
26 changes: 8 additions & 18 deletions python/cuml/cluster/kmeans_mg.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -16,21 +16,17 @@

# distutils: language = c++

import ctypes
from cuml.internals.safe_imports import cpu_only_import
np = cpu_only_import('numpy')
import warnings

from cuml.internals.safe_imports import gpu_only_import
rmm = gpu_only_import('rmm')

from cython.operator cimport dereference as deref
from libcpp cimport bool
from libc.stdint cimport uintptr_t, int64_t
from libc.stdlib cimport calloc, malloc, free
from libc.stdlib cimport free

from cuml.internals.array import CumlArray
from cuml.internals.base import Base
from pylibraft.common.handle cimport handle_t
from cuml.common import input_to_cuml_array

Expand Down Expand Up @@ -79,10 +75,10 @@ cdef extern from "cuml/cluster/kmeans_mg.hpp" \
const double *sample_weight,
double *centroids,
double &inertia,
int64_t &n_iter) except +
int64_t &n_iter) except +

class KMeansMG(KMeans):

class KMeansMG(KMeans):
"""
A Multi-Node Multi-GPU implementation of KMeans

Expand Down Expand Up @@ -141,16 +137,10 @@ class KMeansMG(KMeans):

cdef uintptr_t cluster_centers_ptr = self.cluster_centers_.ptr


int_dtype = np.int32 if np.int64(n_rows) * np.int64(n_cols) < 2**31-1 else np.int64

print(str(n_rows * n_cols))

labels_ = CumlArray.zeros(shape=n_rows, dtype=int_dtype,
index=X_m.index)

cdef uintptr_t labels_ptr = labels_.ptr

cdef float inertiaf = 0
cdef double inertiad = 0

Expand Down Expand Up @@ -224,11 +214,11 @@ class KMeansMG(KMeans):

self.handle.sync()

self.labels_, _, _, _ = input_to_cuml_array(self.predict(X,
sample_weight=sample_weight), order='C',
convert_to_dtype=self.dtype)
self.labels_, _, _, _ = input_to_cuml_array(self.predict(X,
sample_weight=sample_weight), order='C',
convert_to_dtype=self.dtype)

del(X_m)
del X_m
free(params)

return self
6 changes: 3 additions & 3 deletions python/cuml/common/opg_data_utils_mg.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -19,7 +19,7 @@ np = cpu_only_import('numpy')

from cuml.common.opg_data_utils_mg cimport *
from libc.stdlib cimport malloc, free
from libc.stdint cimport uintptr_t, uint32_t, uint64_t
from libc.stdint cimport uintptr_t
from cuml.common import input_to_cuml_array
from cython.operator cimport dereference as deref
from cuml.internals.array import CumlArray
Expand Down Expand Up @@ -213,7 +213,7 @@ def _build_part_inputs(cuda_arr_ifaces,

cuml_arr_ifaces = []
for arr in cuda_arr_ifaces:
X_m, n_rows, n_cols, dtype = \
X_m, _, _, _ = \
input_to_cuml_array(arr, order="F",
convert_to_dtype=(np.float32
if convert_dtype
Expand Down
Loading