[REVIEW] Correcting labels meta dtype for `cuml.dask.make_classification` (#2940)

* correct meta
* adding a test
* changelog
* using np.int64 explicitly
rapidsai · Oct 9, 2020 · bcc5bc3 · bcc5bc3
1 parent d6ff833
commit bcc5bc3
Show file tree

Hide file tree

Showing 4 changed files with 10 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -80,6 +80,7 @@
 - PR #2932: Marking KBinsDiscretizer pytests as xfail
 - PR #2925: Fixing Owner Bug When Slicing CumlArray Objects
 - PR #2931: Fix notebook error handling in gpuCI
+- PR #2940: Correcting labels meta dtype for `cuml.dask.make_classification`
 
 
 # cuML 0.15.0 (Date TBD)

diff --git a/python/cuml/dask/datasets/classification.py b/python/cuml/dask/datasets/classification.py
@@ -253,7 +253,7 @@ def make_classification(n_samples=100, n_features=20, n_informative=2,
                for idx, f in enumerate(parts)]
 
     X_dela = _create_delayed(X_parts, dtype, worker_rows, n_features)
-    y_dela = _create_delayed(y_parts, dtype, worker_rows)
+    y_dela = _create_delayed(y_parts, np.int64, worker_rows)
 
     X = da.concatenate(X_dela)
     y = da.concatenate(y_dela)

diff --git a/python/cuml/datasets/classification.py b/python/cuml/datasets/classification.py
@@ -247,7 +247,7 @@ def make_classification(n_samples=100, n_features=20, n_informative=2,
     # Initialize X and y
     X = generator.randn(n_samples * n_features, dtype=dtype)
     X = X.reshape((n_samples, n_features), order=order)
-    y = cp.zeros(n_samples, dtype=np.int)
+    y = cp.zeros(n_samples, dtype=np.int64)
 
     # Build the polytope whose vertices become cluster centroids
     if _centroids is None:

diff --git a/python/cuml/test/dask/test_datasets.py b/python/cuml/test/dask/test_datasets.py
@@ -180,23 +180,28 @@ def test_make_regression(n_samples, n_features, n_informative,
 @pytest.mark.parametrize('random_state', [None, 1234])
 @pytest.mark.parametrize('n_parts', [2, 23])
 @pytest.mark.parametrize('order', ['C', 'F'])
+@pytest.mark.parametrize('dtype', ['float32', 'float64'])
 def test_make_classification(n_samples, n_features, hypercube, n_classes,
                              n_clusters_per_class, n_informative,
-                             random_state, n_parts, order, client):
+                             random_state, n_parts, order, dtype,
+                             client):
     from cuml.dask.datasets.classification import make_classification
 
     X, y = make_classification(n_samples=n_samples, n_features=n_features,
                                n_classes=n_classes, hypercube=hypercube,
                                n_clusters_per_class=n_clusters_per_class,
                                n_informative=n_informative,
                                random_state=random_state, n_parts=n_parts,
-                               order=order)
+                               order=order, dtype=dtype)
     assert(len(X.chunks[0])) == n_parts
     assert(len(X.chunks[1])) == 1
 
     assert X.shape == (n_samples, n_features)
     assert y.shape == (n_samples, )
 
+    assert X.dtype == dtype
+    assert y.dtype == np.int64
+
     assert len(X.chunks[0]) == n_parts
     assert len(y.chunks[0]) == n_parts