From f67983a8e0f99b26033d90edb9e807ac193b9079 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Tue, 26 Feb 2019 21:36:46 -0800 Subject: [PATCH 001/156] cleaned up copy method to use cudaMemcpy --- ml-prims/src/matrix/matrix.h | 19 ++++--------------- ml-prims/src/utils.h | 7 +++++++ 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/ml-prims/src/matrix/matrix.h b/ml-prims/src/matrix/matrix.h index 3889b60449..e42356319a 100644 --- a/ml-prims/src/matrix/matrix.h +++ b/ml-prims/src/matrix/matrix.h @@ -30,28 +30,17 @@ namespace Matrix { using namespace std; /** - * @defgroup copy matrix operation for column major matrices. + * @brief copy operation for matrices. * @param in: input matrix * @param out: output matrix * @param n_rows: number of rows of output matrix * @param n_cols: number of columns of output matrix - * @{ */ template -void copy(const m_t *in, m_t *out, int n_rows, int n_cols) { - auto m = n_rows; - auto size = n_rows * n_rows; - auto d_q_in = in; - auto d_q_out = out; - auto counting = thrust::make_counting_iterator(0); - - thrust::for_each(counting, counting + size, [=] __device__(int idx) { - int row = idx % m; - int col = idx / m; - d_q_out[col * m + row] = d_q_in[col * m + row]; - }); +void copy(const m_t *in, m_t *out, int n_rows, int n_cols, + cudaStream_t stream = 0) { + copyAsync(out, in, n_rows * n_cols, stream); } -/** @} */ /** * @defgroup copy matrix operation for column major matrices. 
First n_rows and diff --git a/ml-prims/src/utils.h b/ml-prims/src/utils.h index 43f107b469..b1d7b7275a 100644 --- a/ml-prims/src/utils.h +++ b/ml-prims/src/utils.h @@ -151,6 +151,13 @@ void copy(Type* dPtr1, const Type* dPtr2, size_t len) { CUDA_CHECK(cudaMemcpy(dPtr1, dPtr2, len*sizeof(Type), cudaMemcpyDeviceToDevice)); } + +template +void copyAsync(Type* dPtr1, const Type* dPtr2, size_t len, + cudaStream_t stream) { + CUDA_CHECK(cudaMemcpyAsync(dPtr1, dPtr2, len * sizeof(Type), + cudaMemcpyDeviceToDevice, stream)); +} /** @} */ /** Helper function to calculate need memory for allocate to store dense matrix. From a547c968430f98968c906d6d86d588b444949bec Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Wed, 27 Feb 2019 09:21:43 -0800 Subject: [PATCH 002/156] added a faster and more API-complete "reverse" primitive --- ml-prims/src/cuda_utils.h | 13 ++++ ml-prims/src/matrix/reverse.h | 137 ++++++++++++++++++++++++++++++++++ ml-prims/test/CMakeLists.txt | 2 +- ml-prims/test/reverse.cu | 98 ++++++++++++++++++++++++ 4 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 ml-prims/src/matrix/reverse.h create mode 100644 ml-prims/test/reverse.cu diff --git a/ml-prims/src/cuda_utils.h b/ml-prims/src/cuda_utils.h index c66d4166c4..2f60fe8718 100644 --- a/ml-prims/src/cuda_utils.h +++ b/ml-prims/src/cuda_utils.h @@ -93,6 +93,19 @@ DI int laneId() { return id; } +/** + * @brief Swap two values + * @tparam T the datatype of the values + * @param a first input + * @param b second input + */ +template +HDI void swap(T& a, T& b) { + T tmp = a; + a = b; + b = tmp; +} + /** Device function to have atomic add support for older archs */ #if __CUDA_ARCH__ < 600 template diff --git a/ml-prims/src/matrix/reverse.h b/ml-prims/src/matrix/reverse.h new file mode 100644 index 0000000000..0ceec939af --- /dev/null +++ b/ml-prims/src/matrix/reverse.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cuda_utils.h" +#include "vectorized.h" + +namespace MLCommon { +namespace Matrix { + +template +__global__ void reverseKernel(math_t *out, const math_t *in, int nrows, + int ncols, bool rowMajor, bool alongRows, int len, + Lambda op) { + typedef TxN_t VecType; + int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; + if (idx >= len) + return; + int srcIdx, dstIdx; + if (!rowMajor && !alongRows) { + int srcRow = idx % nrows; + int srcCol = idx / nrows; + int dstRow = srcRow; + int dstCol = ncols - srcCol - 1; + srcIdx = idx; + dstIdx = dstCol * nrows + dstRow; + } else if(!rowMajor && alongRows) { + int mod = ceildiv(nrows, 2); + int srcRow = idx % mod; + int srcCol = idx / mod; + int dstRow = nrows - srcRow - VecType::Ratio; + int dstCol = srcCol; + srcIdx = srcCol * nrows + srcRow; + dstIdx = dstCol * nrows + dstRow; + } else if(rowMajor && !alongRows) { + int mod = ceildiv(ncols, 2); + int srcRow = idx / mod; + int srcCol = idx % mod; + int dstRow = srcRow; + int dstCol = ncols - srcCol - VecType::Ratio; + srcIdx = srcCol + srcRow * ncols; + dstIdx = dstCol + dstRow * ncols; + } else { + int srcRow = idx / ncols; + int srcCol = idx % ncols; + int dstRow = nrows - srcRow - 1; + int dstCol = srcCol; + srcIdx = idx; + dstIdx = dstCol + dstRow * ncols; + } + VecType a, b; + a.load(in, srcIdx); + b.load(in, dstIdx); + // while reversing along coalesced dimension, 
also reverse the elements + if ((rowMajor && !alongRows) || (!rowMajor && alongRows)) { +#pragma unroll + for (int i = 0; i < VecType::Ratio; ++i) { + swap(a.val.data[i], a.val.data[VecType::Ratio - i - 1]); + swap(b.val.data[i], b.val.data[VecType::Ratio - i - 1]); + } + } +#pragma unroll + for (int i = 0; i < VecType::Ratio; ++i) { + a.val.data[i] = op(a.val.data[i]); + b.val.data[i] = op(b.val.data[i]); + } + a.store(out, dstIdx); + b.store(out, srcIdx); +} + +template +void reverseImpl(math_t *out, const math_t *in, int nrows, int ncols, + bool rowMajor, bool alongRows, Lambda op, + cudaStream_t stream) { + int len = alongRows? ceildiv(nrows, 2) * ncols : nrows * ceildiv(ncols, 2); + const int nblks = ceildiv(veclen_ ? len / veclen_ : len, TPB); + reverseKernel<<>>( + out, in, nrows, ncols, rowMajor, alongRows, len, op); + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief reverse the input matrix along its rows or columns, optionally applying a unary operation to each element + * @tparam math_t data-type upon which the math operation will be performed + * @tparam Lambda the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output matrix (supports inplace operation) + * @param in the input matrix + * @param nrows number of rows in the input matrix + * @param ncols number of cols in the input matrix + * @param rowMajor input matrix is row major or not + * @param alongRows whether to reverse along rows or not + * @param op the device-lambda to perform any unary operations on each element + * @param stream cuda stream where to launch work + */ +template , int TPB = 256> +void reverse(math_t *out, const math_t *in, int nrows, int ncols, + bool rowMajor, bool alongRows, Lambda op = Nop(), + cudaStream_t stream = 0) { + size_t bytes = (rowMajor? 
ncols : nrows) * sizeof(math_t); + if (16 / sizeof(math_t) && bytes % 16 == 0) { + reverseImpl( + out, in, nrows, ncols, rowMajor, alongRows, op, stream); + } else if (8 / sizeof(math_t) && bytes % 8 == 0) { + reverseImpl( + out, in, nrows, ncols, rowMajor, alongRows, op, stream); + } else if (4 / sizeof(math_t) && bytes % 4 == 0) { + reverseImpl( + out, in, nrows, ncols, rowMajor, alongRows, op, stream); + } else if (2 / sizeof(math_t) && bytes % 2 == 0) { + reverseImpl( + out, in, nrows, ncols, rowMajor, alongRows, op, stream); + } else if (1 / sizeof(math_t)) { + reverseImpl( + out, in, nrows, ncols, rowMajor, alongRows, op, stream); + } else { + reverseImpl( + out, in, nrows, ncols, rowMajor, alongRows, op, stream); + } +} + +}; // end namespace Matrix +}; // end namespace MLCommon diff --git a/ml-prims/test/CMakeLists.txt b/ml-prims/test/CMakeLists.txt index e327ccf474..2489174011 100644 --- a/ml-prims/test/CMakeLists.txt +++ b/ml-prims/test/CMakeLists.txt @@ -29,7 +29,6 @@ add_executable(mlcommon_test cov.cu decoupled_lookback.cu distance.cu - # distance_adj.cu divide.cu eig.cu eltwise.cu @@ -47,6 +46,7 @@ add_executable(mlcommon_test norm.cu permute.cu power.cu + reverse.cu rng.cu rng_int.cu rsvd.cu diff --git a/ml-prims/test/reverse.cu b/ml-prims/test/reverse.cu new file mode 100644 index 0000000000..0d993ec10c --- /dev/null +++ b/ml-prims/test/reverse.cu @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "test_utils.h" +#include "matrix/reverse.h" +#include "random/rng.h" + +namespace MLCommon { +namespace Matrix { + +template +struct ReverseInputs { + T tolerance; + int nrows, ncols; + bool rowMajor, alongRows; + unsigned long long seed; +}; + +template +class ReverseTest : public ::testing::TestWithParam> { +protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + Random::Rng r(params.seed); + int len = params.nrows * params.ncols; + allocate(in, len); + allocate(out, len); + r.uniform(in, len, T(-1.0), T(1.0)); + // applying reverse twice should yield the same output! + // this will in turn also verify the inplace mode of reverse method + reverse(out, in, params.nrows, params.ncols, params.rowMajor, + params.alongRows); + reverse(out, out, params.nrows, params.ncols, params.rowMajor, + params.alongRows); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(out)); + } + +protected: + ReverseInputs params; + T *in, *out; +}; + +const std::vector> inputsf = { + {0.000001f, 32, 32, false, false, 1234ULL}, + {0.000001f, 32, 32, false, true, 1234ULL}, + {0.000001f, 32, 32, true, false, 1234ULL}, + {0.000001f, 32, 32, true, true, 1234ULL}, + + {0.000001f, 41, 41, false, false, 1234ULL}, + {0.000001f, 41, 41, false, true, 1234ULL}, + {0.000001f, 41, 41, true, false, 1234ULL}, + {0.000001f, 41, 41, true, true, 1234ULL}}; +typedef ReverseTest ReverseTestF; +TEST_P(ReverseTestF, Result) { + ASSERT_TRUE(devArrMatch(in, out, params.nrows, params.ncols, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ReverseTests, ReverseTestF, + ::testing::ValuesIn(inputsf)); + +typedef ReverseTest ReverseTestD; +const std::vector> inputsd = { + {0.000001, 32, 32, false, false, 1234ULL}, + {0.000001, 32, 32, false, true, 1234ULL}, + {0.000001, 32, 32, true, false, 1234ULL}, + 
{0.000001, 32, 32, true, true, 1234ULL}, + + {0.000001, 41, 41, false, false, 1234ULL}, + {0.000001, 41, 41, false, true, 1234ULL}, + {0.000001, 41, 41, true, false, 1234ULL}, + {0.000001, 41, 41, true, true, 1234ULL}}; +TEST_P(ReverseTestD, Result) { + ASSERT_TRUE(devArrMatch(in, out, params.nrows, params.ncols, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(ReverseTests, ReverseTestD, + ::testing::ValuesIn(inputsd)); + +} // end namespace Matrix +} // end namespace MLCommon From 8875b465d405191cc6997d073f8129b7818c8036 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 9 Apr 2019 10:37:49 -0500 Subject: [PATCH 003/156] FIX Changed setup_pip runtime lib path --- setup_pip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup_pip.py b/setup_pip.py index aabdd4f473..d4fa713220 100644 --- a/setup_pip.py +++ b/setup_pip.py @@ -61,7 +61,7 @@ library_dirs=[get_python_lib(), distutils_dir_name('lib')], libraries=['cuml'], language='c++', - runtime_library_dirs=['$ORIGIN', cuda_lib_dir], + runtime_library_dirs=[get_python_lib(), cuda_lib_dir], extra_compile_args=['-std=c++11']) ] From d0c619fb67cea4c62aa3b234c9a9ecf22243581c Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 9 Apr 2019 10:49:25 -0500 Subject: [PATCH 004/156] DOC Add entry to changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 082ff03c16..75d6bb4c34 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +# cuML 0.6.1 (09 Apr 2019) + +## Bug Fixes + +- PR #462 Runtime library path fix for cuML pip package + + # cuML 0.6.0 (22 Mar 2019) ## New Features From 2def25c20acdf71fea2f7898a63cf7256d85e0f5 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 9 Apr 2019 16:57:49 -0500 Subject: [PATCH 005/156] FIX Ported back pca test skip for version 0.6 driver 418 compatibility issue --- cuML/test/pca_test.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/cuML/test/pca_test.cu b/cuML/test/pca_test.cu index 4bd1b8ef67..13569987ee 100644 --- a/cuML/test/pca_test.cu +++ b/cuML/test/pca_test.cu @@ -283,7 +283,7 @@ TEST_P(PcaTestDataVecSmallD, Result) { } typedef PcaTest PcaTestDataVecF; -TEST_P(PcaTestDataVecF, Result) { +TEST_P(PcaTestDataVecF, DISABLED_Fit) { ASSERT_TRUE( devArrMatch(data2, data2_back, (params.n_col2 * params.n_col2), @@ -292,7 +292,7 @@ TEST_P(PcaTestDataVecF, Result) { } typedef PcaTest PcaTestDataVecD; -TEST_P(PcaTestDataVecD, Result) { +TEST_P(PcaTestDataVecD, DISABLED_Fit) { ASSERT_TRUE( devArrMatch(data2, data2_back, (params.n_col2 * params.n_col2), From 57eda467009f28e7f94187d079e7d09e29c51c04 Mon Sep 17 00:00:00 2001 From: gpuCI <38199262+GPUtester@users.noreply.github.com> Date: Thu, 11 Apr 2019 09:24:16 -0700 Subject: [PATCH 006/156] REL v0.6.1 release --- docs/source/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 636d5aa04c..44b48e1190 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -60,7 +60,7 @@ # The short X.Y version. version = '0.6' # The full version, including alpha/beta/rc tags. -release = '0.6.0' +release = '0.6.1' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
From 294dbd184b7b98142a60a51c0c516197a248e2f8 Mon Sep 17 00:00:00 2001 From: Chirayu Date: Tue, 23 Apr 2019 18:34:11 -0700 Subject: [PATCH 007/156] Add contingency matrix implementation --- ml-prims/src/metrics/contingencyMatrix.h | 260 +++++++++++++++++++++++ ml-prims/test/CMakeLists.txt | 1 + ml-prims/test/contingencyMatrix.cu | 134 ++++++++++++ 3 files changed, 395 insertions(+) create mode 100644 ml-prims/src/metrics/contingencyMatrix.h create mode 100644 ml-prims/test/contingencyMatrix.cu diff --git a/ml-prims/src/metrics/contingencyMatrix.h b/ml-prims/src/metrics/contingencyMatrix.h new file mode 100644 index 0000000000..b1704f0058 --- /dev/null +++ b/ml-prims/src/metrics/contingencyMatrix.h @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "cuda_utils.h" +#include +#include +#include +#include + +#define ALIGN_BYTE 256 +#define ALIGN_MEMORY(x) (x + ALIGN_BYTE - 1) & ~(ALIGN_BYTE - 1) + +namespace MLCommon { +namespace Metrics { + +typedef enum { + IMPL_NONE, + SMEM_ATOMICS, + GLOBAL_ATOMICS, + SORT_AND_GATOMICS +} ContingencyMatrixImplType; + +template +__global__ void devConstructContingencyMatrix(T *groundTruth, T *predicted, + int nSamples, int *outMat, + int outIdxOffset, int outMatWidth) { + int elementId = threadIdx.x + blockDim.x * blockIdx.x; + if (elementId < nSamples) { + T gt = groundTruth[elementId]; + T pd = predicted[elementId]; + + int outputIdx = (gt - outIdxOffset) * outMatWidth + pd - outIdxOffset; + myAtomicAdd(&outMat[outputIdx], 1); + } +} + +template +__global__ void devConstructContingencyMatrixSmem(T *groundTruth, T *predicted, + int nSamples, int *outMat, + int outIdxOffset, int outMatWidth) { + + extern __shared__ int sMemMatrix[]; // init smem to zero + + // get linear smem ids form threadIdx's to smem to set to zero + // set to zero + for (int smemIdx=threadIdx.x; smemIdx < outMatWidth*outMatWidth; smemIdx+=blockDim.x) { + sMemMatrix[smemIdx] = 0; + } + __syncthreads(); + + int elementId = threadIdx.x + blockDim.x * blockIdx.x; + if (elementId < nSamples) { + T gt = groundTruth[elementId]; + T pd = predicted[elementId]; + + int outputIdx = (gt - outIdxOffset) * outMatWidth + pd - outIdxOffset; + myAtomicAdd(&sMemMatrix[outputIdx], 1); + } + __syncthreads(); + + // upstream atomic updates to global matrix + for (int smemIdx=threadIdx.x; smemIdx < outMatWidth*outMatWidth; smemIdx+=blockDim.x) { + myAtomicAdd(&outMat[smemIdx], sMemMatrix[smemIdx]); + } +} + +template +cudaError_t computeCMatWAtomics(T *groundTruth, T *predictedLabel, int nSamples, + int *outMat, int outIdxOffset, int outDimN, + cudaStream_t stream) { + CUDA_CHECK(cudaFuncSetCacheConfig(devConstructContingencyMatrix, cudaFuncCachePreferL1)); + dim3 block(128,1,1); + dim3 
grid((nSamples + block.x - 1) / block.x); + + // launch kernel - global atomic ops per groundTruth - predictedValue pair + devConstructContingencyMatrix<<>>(groundTruth, + predictedLabel, nSamples, outMat, outIdxOffset, outDimN); + + return cudaGetLastError(); +} + +template +cudaError_t computeCMatWSmemAtomics(T *groundTruth, T *predictedLabel, int nSamples, + int *outMat, int outIdxOffset, int outDimN, + cudaStream_t stream) { + dim3 block(128,1,1); + dim3 grid((nSamples + block.x - 1) / block.x); + size_t smemSizePerBlock = outDimN * outDimN * sizeof(int); + + devConstructContingencyMatrixSmem<<>>(groundTruth, + predictedLabel, nSamples, outMat, outIdxOffset, outDimN); + + return cudaGetLastError(); +} + +template +void contingencyMatrixWSort(T *groundTruth, T *predictedLabel, int nSamples, + int *outMat, T minLabel, T maxLabel, void *workspace, + size_t workspaceSize, cudaStream_t stream) { + + T *outKeys = reinterpret_cast(workspace); + size_t alignedBufferSz = ALIGN_MEMORY((size_t)nSamples * sizeof(T)); + T *outValue = reinterpret_cast((size_t)workspace + alignedBufferSz); + + void *pWorkspaceCub = reinterpret_cast((size_t)workspace + 2*alignedBufferSz); + int bitsToSort = int(std::ceil(std::log2f((float)maxLabel))); + + + // we dont really need perfect sorting, should get by with some sort of binning-reordering operation + // future work - explore "efficient" custom binning kernels vs cub sort + CUDA_CHECK(cub::DeviceRadixSort::SortPairs(pWorkspaceCub, workspaceSize, groundTruth, outKeys, + predictedLabel, outValue, nSamples, 0, + bitsToSort, stream)); + int outDimM_N = (int)(maxLabel - minLabel + T(1)); + + computeCMatWAtomics(outKeys, outValue, nSamples, outMat, minLabel, outDimM_N, stream); +} + +inline ContingencyMatrixImplType getImplVersion(int outDimN) { + int currDevice = 0; + int l2CacheSize = 0; + int maxSmemPerBlock = 0; + int maxBlocksResidentPerSM = 16; // no way to query this from CUDA APIs, value for CC 7.0, 3.0 + + 
CUDA_CHECK(cudaGetDevice(&currDevice)); + CUDA_CHECK(cudaDeviceGetAttribute(&l2CacheSize, cudaDevAttrL2CacheSize, currDevice)); + CUDA_CHECK(cudaDeviceGetAttribute(&maxSmemPerBlock, cudaDevAttrMaxSharedMemoryPerBlock, currDevice)); + + ContingencyMatrixImplType implVersion = IMPL_NONE; + + // keeping 8 block per SM to get good utilization + // can go higher but reduced L1 size degrades perf + int upperLimitSmemAtomics = std::floor(std::sqrt(maxSmemPerBlock / (sizeof(int) *(maxBlocksResidentPerSM / 2)))); + int upperLimitL2Atomics = std::floor(std::sqrt(l2CacheSize / sizeof(int) )); + + if (outDimN <= upperLimitSmemAtomics) + implVersion = SMEM_ATOMICS; + else if(outDimN <= upperLimitL2Atomics) + implVersion = GLOBAL_ATOMICS; + else + implVersion = SORT_AND_GATOMICS; + + return implVersion; +} + +// use this to allocate output matrix size +// size of matrix = (maxLabel - minLabel + 1)^2 * sizeof(int) +template +void getInputClassCardinality(T* groundTruth, int nSamples, cudaStream_t stream, T &minLabel, T &maxLabel) { + thrust::device_ptr dTrueLabel = thrust::device_pointer_cast(groundTruth); + auto min_max = thrust::minmax_element(thrust::cuda::par.on(stream), + dTrueLabel, dTrueLabel + nSamples); + minLabel = *min_max.first; + maxLabel = *min_max.second; +} + +template +size_t getWorkspaceSize(int nSamples, T* groundTruth, cudaStream_t stream, + T minLabel=std::numeric_limits::max(), + T maxLabel=std::numeric_limits::max()) { + size_t workspaceSize = 0; + // below is a redundant computation - can be avoided + if (minLabel == std::numeric_limits::max() || + maxLabel == std::numeric_limits::max()) { + thrust::device_ptr dTrueLabel = thrust::device_pointer_cast(groundTruth); + auto min_max = thrust::minmax_element(thrust::cuda::par.on(stream), + dTrueLabel, dTrueLabel + nSamples); + minLabel = *min_max.first; + maxLabel = *min_max.second; + } + + int outDimN = int(maxLabel - minLabel + T(1)); + ContingencyMatrixImplType implVersion = getImplVersion(outDimN); + + if 
(implVersion == SORT_AND_GATOMICS) { + void *pWorkspaceCub = NULL; + size_t tmpStorageBytes = 0; + // bunch of no-op pointers to get workspace size + T *pTmpKey, *pTmpValue, *pTmpKeyOut, *pTmpValueOut; + + CUDA_CHECK(cub::DeviceRadixSort::SortPairs(pWorkspaceCub, tmpStorageBytes, pTmpKey, pTmpValue, + pTmpKeyOut, pTmpValueOut, nSamples)); + + size_t tmpStagingMemorySize = ALIGN_MEMORY(nSamples * sizeof(T)); + tmpStagingMemorySize *= 2; + workspaceSize = tmpStagingMemorySize + tmpStorageBytes; + } + return workspaceSize; +} + +template +void contingencyMatrix(T *groundTruth, T *predictedLabel, int nSamples, int *outMat, + cudaStream_t stream, void *workspace=NULL, size_t workspaceSize=0, + T minLabel=std::numeric_limits::max(), + T maxLabel=std::numeric_limits::max()) { + // assumptions: + // output is not at par with scikit learn - output will be square matrix always with + // numRows = numColumns = numOfClassesInTrueLabel + // it is also assumed that true labels are monotonically increasing with step count 1 + // if for some reason groundTruth completely skips some labels + // eg: {0,1,2,5} instead of {0,1,2,3} . 
Output matrix will still have empty rows for label value {3,4} + + // this also serves as way to measure co-occurence/joint counts for NLP tasks which + // can be used to then compute pointwise mutual information and mutual information + + if (minLabel == std::numeric_limits::max() || + maxLabel == std::numeric_limits::max()) { + thrust::device_ptr dTrueLabel = thrust::device_pointer_cast(groundTruth); + auto min_max = thrust::minmax_element(thrust::cuda::par.on(stream), + dTrueLabel, dTrueLabel + nSamples); + minLabel = *min_max.first; + maxLabel = *min_max.second; + } + + int outDimM_N = (int)(maxLabel - minLabel + T(1)); + + //memset outMat to zero before atomic increments + cudaMemsetAsync((void*)outMat, 0, sizeof(int) * outDimM_N * outDimM_N, stream); + + ContingencyMatrixImplType implVersion = getImplVersion(outDimM_N); + + switch (implVersion){ + case SMEM_ATOMICS: + // smem atomics and then single global mem atomics only works + // when all label count can fit in smem for a block + // helps when GLOBAL_ATOMICS performance blocked by atomic update serialization + // -when very less labels ~10 labels + computeCMatWSmemAtomics(groundTruth, predictedLabel, nSamples, outMat, + minLabel, outDimM_N, stream); + break; + case GLOBAL_ATOMICS: + // launch kernel - global atomic ops per (groundTruth,predictedValue) pair + computeCMatWAtomics(groundTruth, predictedLabel, nSamples, outMat, + minLabel, outDimM_N, stream); + break; + // more L2 thrashing if atomic OPs land in completely different mem segment - when more labels + case SORT_AND_GATOMICS: + contingencyMatrixWSort(groundTruth, predictedLabel, nSamples, + outMat, minLabel, maxLabel, workspace, + workspaceSize, stream); + break; + } +} +}; +}; \ No newline at end of file diff --git a/ml-prims/test/CMakeLists.txt b/ml-prims/test/CMakeLists.txt index 7f8a5be326..d69cf482ca 100644 --- a/ml-prims/test/CMakeLists.txt +++ b/ml-prims/test/CMakeLists.txt @@ -75,6 +75,7 @@ add_executable(mlcommon_test penalty.cu 
sigmoid.cu weighted_mean.cu + contingencyMatrix.cu ) target_link_libraries(mlcommon_test diff --git a/ml-prims/test/contingencyMatrix.cu b/ml-prims/test/contingencyMatrix.cu new file mode 100644 index 0000000000..2c64dd3000 --- /dev/null +++ b/ml-prims/test/contingencyMatrix.cu @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #include + #include "test_utils.h" + #include + #include + #include + #include "metrics/contingencyMatrix.h" + +namespace MLCommon { +namespace Metrics { + +struct contingencyMatrixParam { + int nElements; + int minClass; + int maxClass; + bool calcCardinality; + float tolerance; +}; + +template +class ContingencyMatrixTestImpl : public ::testing::TestWithParam { +protected: + void SetUp() override { + params = ::testing::TestWithParam::GetParam(); + + int numElements = params.nElements; + int lowerLabelRange = params.minClass; + int upperLabelRange = params.maxClass; + + std::vector y(numElements, 0); + std::vector y_hat(numElements, 0); + std::random_device rd; + std::default_random_engine dre(rd()); + std::uniform_int_distribution intGenerator(lowerLabelRange, upperLabelRange); + + std::generate(y.begin(), y.end(), [&](){return intGenerator(dre); }); + std::generate(y_hat.begin(), y_hat.end(), [&](){return intGenerator(dre); }); + + numUniqueClasses = upperLabelRange - lowerLabelRange + 1; + + // generate golden output on CPU + size_t sizeOfMat = 
numUniqueClasses*numUniqueClasses * sizeof(int); + int *hGoldenOutput = (int *)malloc(sizeOfMat); + memset(hGoldenOutput, 0, sizeOfMat); + + for (int i = 0; i < numElements; i++) { + int row = y[i] - lowerLabelRange; + int column = y_hat[i] - lowerLabelRange; + + hGoldenOutput[row * numUniqueClasses + column] += 1; + } + + CUDA_CHECK(cudaStreamCreate(&stream)); + MLCommon::allocate(dY, numElements); + MLCommon::allocate(dYHat, numElements); + MLCommon::allocate(dComputedOutput, numUniqueClasses*numUniqueClasses); + MLCommon::allocate(dGoldenOutput, numUniqueClasses*numUniqueClasses); + + size_t workspaceSz = MLCommon::Metrics::getWorkspaceSize(numElements, dY, + stream, lowerLabelRange, upperLabelRange); + + if (workspaceSz != 0) + MLCommon::allocate(pWorkspace, workspaceSz); + + MLCommon::updateDeviceAsync(dYHat, &y_hat[0], numElements, stream); + MLCommon::updateDeviceAsync(dY, &y[0], numElements, stream); + MLCommon::updateDeviceAsync(dGoldenOutput, hGoldenOutput, + numUniqueClasses*numUniqueClasses, stream); + + if (params.calcCardinality) + MLCommon::Metrics::contingencyMatrix(dY, dYHat, numElements, dComputedOutput, + stream, (void*)pWorkspace, workspaceSz); + else + MLCommon::Metrics::contingencyMatrix(dY, dYHat, numElements, dComputedOutput, + stream, (void*)pWorkspace, workspaceSz, + lowerLabelRange, upperLabelRange); + } + + void TearDown() override { + free(hGoldenOutput); + CUDA_CHECK(cudaStreamDestroy(stream)); + CUDA_CHECK(cudaFree(dY)); + CUDA_CHECK(cudaFree(dYHat)); + CUDA_CHECK(cudaFree(dComputedOutput)); + CUDA_CHECK(cudaFree(dGoldenOutput)); + if (pWorkspace) + CUDA_CHECK(cudaFree(pWorkspace)); + } + + contingencyMatrixParam params; + int numUniqueClasses = -1; + T* dY=NULL; + T* dYHat=NULL; + int *dComputedOutput = NULL; + int *dGoldenOutput = NULL; + int *hGoldenOutput = NULL; + char *pWorkspace = NULL; + cudaStream_t stream; +}; + +const std::vector inputs = { + {10000, 1, 10, true, 0.000001}, + {100000, 1, 100, false, 0.000001}, + {1000000, 
1, 1200, true, 0.000001}, + {1000000, 1, 10000, false, 0.000001} +}; + +typedef ContingencyMatrixTestImpl ContingencyMatrixTestImplS; +TEST_P(ContingencyMatrixTestImplS, Result) { + ASSERT_TRUE(devArrMatch(dComputedOutput, dGoldenOutput, numUniqueClasses * numUniqueClasses, + CompareApprox(params.tolerance))); +} + +INSTANTIATE_TEST_CASE_P(ContingencyMatrix, ContingencyMatrixTestImplS, + ::testing::ValuesIn(inputs)); +} +} + + \ No newline at end of file From 6c9a1a7f3a68f25b86c74f179941d47d57d7258b Mon Sep 17 00:00:00 2001 From: Chirayu Date: Wed, 24 Apr 2019 10:50:22 -0700 Subject: [PATCH 008/156] Add doxygen style comments --- ml-prims/src/metrics/contingencyMatrix.h | 38 ++++++++++++++++++++++-- ml-prims/test/contingencyMatrix.cu | 11 +++++-- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/ml-prims/src/metrics/contingencyMatrix.h b/ml-prims/src/metrics/contingencyMatrix.h index b1704f0058..19387db429 100644 --- a/ml-prims/src/metrics/contingencyMatrix.h +++ b/ml-prims/src/metrics/contingencyMatrix.h @@ -79,6 +79,7 @@ __global__ void devConstructContingencyMatrixSmem(T *groundTruth, T *predicted, } } +// helper functions to launch kernel for global atomic add template cudaError_t computeCMatWAtomics(T *groundTruth, T *predictedLabel, int nSamples, int *outMat, int outIdxOffset, int outDimN, @@ -94,6 +95,7 @@ cudaError_t computeCMatWAtomics(T *groundTruth, T *predictedLabel, int nSamples, return cudaGetLastError(); } +// helper function to launch share memory atomic add kernel template cudaError_t computeCMatWSmemAtomics(T *groundTruth, T *predictedLabel, int nSamples, int *outMat, int outIdxOffset, int outDimN, @@ -108,6 +110,7 @@ cudaError_t computeCMatWSmemAtomics(T *groundTruth, T *predictedLabel, int nSamp return cudaGetLastError(); } +// helper function to sort and global atomic update template void contingencyMatrixWSort(T *groundTruth, T *predictedLabel, int nSamples, int *outMat, T minLabel, T maxLabel, void *workspace, @@ -158,8 +161,15 
@@ inline ContingencyMatrixImplType getImplVersion(int outDimN) { return implVersion; } -// use this to allocate output matrix size -// size of matrix = (maxLabel - minLabel + 1)^2 * sizeof(int) +/** + * @brief use this to allocate output matrix size + * size of matrix = (maxLabel - minLabel + 1)^2 * sizeof(int) + * @param groundTruth: device 1-d array for ground truth (num of rows) + * @param nSamples: number of elements in input array + * @param stream: cuda stream for execution + * @param minLabel: [out] calculated min value in input array + * @param maxLabel: [out] calculated max value in input array +*/ template void getInputClassCardinality(T* groundTruth, int nSamples, cudaStream_t stream, T &minLabel, T &maxLabel) { thrust::device_ptr dTrueLabel = thrust::device_pointer_cast(groundTruth); @@ -169,8 +179,16 @@ void getInputClassCardinality(T* groundTruth, int nSamples, cudaStream_t stream, maxLabel = *min_max.second; } +/** + * @brief Calculate workspace size for running contingency matrix calculations + * @param nSamples: number of elements in input array + * @param groundTruth: device 1-d array for ground truth (num of rows) + * @param stream: cuda stream for execution + * @param minLabel: Optional, min value in input array + * @param maxLabel: Optional, max value in input array + */ template -size_t getWorkspaceSize(int nSamples, T* groundTruth, cudaStream_t stream, +size_t getCMatrixWorkspaceSize(int nSamples, T* groundTruth, cudaStream_t stream, T minLabel=std::numeric_limits::max(), T maxLabel=std::numeric_limits::max()) { size_t workspaceSize = 0; @@ -203,6 +221,20 @@ size_t getWorkspaceSize(int nSamples, T* groundTruth, cudaStream_t stream, return workspaceSize; } +/** + * @brief contruct contingency matrix given input ground truth and prediction labels + * users should call function getInputClassCardinality to find and allocate memory for + * output. 
Similarly workspace requirements should be checked using function getCMatrixWorkspaceSize + * @param groundTruth: device 1-d array for ground truth (num of rows) + * @param predictedLabel: device 1-d array for prediction (num of columns) + * @param nSamples: number of elements in input array + * @param outMat: output buffer for contingecy matrix + * @param stream: cuda stream for execution + * @param workspace: Optional, workspace memory allocation + * @param workspaceSize: Optional, size of workspace memory + * @param minLabel: Optional, min value in input ground truth array + * @param maxLabel: Optional, max value in input ground truth array + */ template void contingencyMatrix(T *groundTruth, T *predictedLabel, int nSamples, int *outMat, cudaStream_t stream, void *workspace=NULL, size_t workspaceSize=0, diff --git a/ml-prims/test/contingencyMatrix.cu b/ml-prims/test/contingencyMatrix.cu index 2c64dd3000..39e8b0b853 100644 --- a/ml-prims/test/contingencyMatrix.cu +++ b/ml-prims/test/contingencyMatrix.cu @@ -71,7 +71,7 @@ protected: MLCommon::allocate(dComputedOutput, numUniqueClasses*numUniqueClasses); MLCommon::allocate(dGoldenOutput, numUniqueClasses*numUniqueClasses); - size_t workspaceSz = MLCommon::Metrics::getWorkspaceSize(numElements, dY, + size_t workspaceSz = MLCommon::Metrics::getCMatrixWorkspaceSize(numElements, dY, stream, lowerLabelRange, upperLabelRange); if (workspaceSz != 0) @@ -82,9 +82,14 @@ protected: MLCommon::updateDeviceAsync(dGoldenOutput, hGoldenOutput, numUniqueClasses*numUniqueClasses, stream); - if (params.calcCardinality) + if (params.calcCardinality) { + T minLabel, maxLabel; + MLCommon::Metrics::getInputClassCardinality(dY, numElements, stream, minLabel, maxLabel); + // allocate dComputedOutput using minLabel, maxLabel count - already done above MLCommon::Metrics::contingencyMatrix(dY, dYHat, numElements, dComputedOutput, - stream, (void*)pWorkspace, workspaceSz); + stream, (void*)pWorkspace, workspaceSz, + minLabel, maxLabel); + } 
else MLCommon::Metrics::contingencyMatrix(dY, dYHat, numElements, dComputedOutput, stream, (void*)pWorkspace, workspaceSz, From a882ad90959b4e4ef4f0f36b5fe16fe7a779371f Mon Sep 17 00:00:00 2001 From: wxbn Date: Mon, 29 Apr 2019 18:11:58 +0000 Subject: [PATCH 009/156] Trustworthiness score --- cuML/src/metrics/trustworthiness.cu | 147 ++++++++++++++++++++++++ cuML/src/metrics/trustworthiness.h | 26 +++++ python/cuml/metrics/__init__.py | 17 +++ python/cuml/metrics/trustworthiness.pyx | 77 +++++++++++++ 4 files changed, 267 insertions(+) create mode 100644 cuML/src/metrics/trustworthiness.cu create mode 100644 cuML/src/metrics/trustworthiness.h create mode 100644 python/cuml/metrics/__init__.py create mode 100644 python/cuml/metrics/trustworthiness.pyx diff --git a/cuML/src/metrics/trustworthiness.cu b/cuML/src/metrics/trustworthiness.cu new file mode 100644 index 0000000000..b04235393e --- /dev/null +++ b/cuML/src/metrics/trustworthiness.cu @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "trustworthiness.h"
+#include
+#include "distance/distance.h"
+#include
+#include "../knn/knn.h"
+
+using namespace MLCommon;
+using namespace MLCommon::Selection;
+using namespace ML;
+
+/**
+* @brief Compute a kNN and returns the indexes of the nearest neighbors
+* @input param input: Input matrix holding the dataset
+* @input param n: Number of samples
+* @input param d: Number of features
+* @input param n_neighbors: Number of neighbors retrieved for every sample
+* @return Host array of n * n_neighbors indexes allocated with new[];
+*         ownership passes to the caller, who must delete[] it
+*/
+template <typename math_t>
+long* get_knn(math_t* input, int n, int d, int n_neighbors)
+{
+    // Computed in size_t so that large n * n_neighbors cannot wrap a 32-bit int
+    const size_t n_results = size_t(n) * n_neighbors;
+
+    long* d_pred_I;
+    math_t* d_pred_D;
+    allocate(d_pred_I, n_results);
+    allocate(d_pred_D, n_results);
+
+    kNNParams params = {input, n};
+    kNN knn(d);
+    knn.fit(&params, 1);
+    knn.search(input, n, d_pred_I, d_pred_D, n_neighbors);
+
+    long* h_pred_I = new long[n_results];
+    // NOTE(review): assumes the stream-less updateHost overload blocks until
+    // the copy is complete - confirm against utils.h
+    updateHost(h_pred_I, d_pred_I, n_results);
+
+    CUDA_CHECK(cudaFree(d_pred_I));
+    CUDA_CHECK(cudaFree(d_pred_D));
+    return h_pred_I;
+}
+
+namespace ML {
+
+    /**
+    * @brief Compute the trustworthiness score
+    * @input param X: Data in original dimension
+    * @input param X_embedded: Data in target dimension (embedding)
+    * @input param n: Number of samples
+    * @input param m: Number of features in high/original dimension
+    * @input param d: Number of features in low/embedded dimension
+    * @input param n_neighbors: Number of neighbors considered by trustworthiness score
+    * @return Trustworthiness score
+    */
+    template <typename math_t>
+    double cuml_trustworthiness(math_t* X, math_t* X_embedded, int n, int m, int d, int n_neighbors)
+    {
+        // Scratch holds one batch of at most MAX_BATCH_SIZE rows of n columns
+        const size_t TMP_SIZE = size_t(MAX_BATCH_SIZE) * n;
+
+        cudaStream_t stream;
+        CUDA_CHECK(cudaStreamCreate(&stream));
+
+        constexpr auto distance_type = MLCommon::Distance::DistanceType::EucUnexpandedL2Sqrt;
+        size_t workspaceSize = 0; // EucUnexpandedL2Sqrt does not need any workspace
+        typedef cutlass::Shape<8, 128, 128> OutputTile_t;
+        bool bAllocWorkspace = false;
+
+        math_t* d_pdist_tmp;
+        allocate(d_pdist_tmp, TMP_SIZE);
+        int* d_ind_X_tmp;
+        allocate(d_ind_X_tmp, TMP_SIZE);
+        int* h_ind_X = new int[size_t(n) * n];
+
+        int toDo = n;
+        while (toDo > 0)
+        {
+            int batchSize = min(toDo, MAX_BATCH_SIZE); // Takes at most MAX_BATCH_SIZE vectors at a time
+            const size_t firstRow = size_t(n - toDo);  // first sample handled by this batch
+
+            MLCommon::Distance::distance<distance_type, math_t, math_t, math_t, OutputTile_t>
+                    (&X[firstRow * m], X,
+                    d_pdist_tmp,
+                    batchSize, n, m,
+                    (void*)nullptr, workspaceSize,
+                    stream
+            );
+            CUDA_CHECK(cudaPeekAtLastError());
+
+            // NOTE(review): bAllocWorkspace is an in/out flag that is never
+            // inspected afterwards - confirm sortColumnsPerRow cannot request
+            // a workspace for the sizes used here
+            sortColumnsPerRow(d_pdist_tmp, d_ind_X_tmp,
+                                batchSize, n,
+                                bAllocWorkspace, NULL, workspaceSize,
+                                stream);
+            CUDA_CHECK(cudaPeekAtLastError());
+
+            updateHost(&h_ind_X[firstRow * n], d_ind_X_tmp, size_t(batchSize) * n, stream);
+
+            toDo -= batchSize;
+        }
+
+        // The copies above were issued on `stream`; wait for them before the
+        // host reads h_ind_X, then release the scratch buffers (previously leaked)
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+        CUDA_CHECK(cudaFree(d_pdist_tmp));
+        CUDA_CHECK(cudaFree(d_ind_X_tmp));
+
+        long* ind_X_embedded = get_knn(X_embedded, n, d, n_neighbors + 1);
+
+        double t = 0.0;
+        for (int i = 0; i < n; i++)
+        {
+            // Entry 0 of every sorted row is skipped (presumably the sample
+            // itself at distance 0 - confirm), hence the "+ 1"
+            int* sample_i = &h_ind_X[size_t(i) * n + 1];
+            for (int j = 1; j <= n_neighbors; j++)
+            {
+                long idx = ind_X_embedded[size_t(i) * (n_neighbors + 1) + j];
+                // Find the rank of the embedded-space neighbor in the
+                // original-space ordering; ranks beyond n_neighbors are penalised
+                for (int r = 0; r < n-1; r++)
+                {
+                    if (sample_i[r] == idx)
+                    {
+                        t += max(0.0, double(r - n_neighbors));
+                        break;
+                    }
+                }
+            }
+        }
+
+        delete[] h_ind_X;
+        delete[] ind_X_embedded;
+
+        // Normalisation is evaluated in double to avoid 32-bit overflow of n * n_neighbors
+        t = 1.0 - ((2.0 / ((double(n) * n_neighbors) * ((2.0 * n) - (3.0 * n_neighbors) - 1.0))) * t);
+
+        CUDA_CHECK(cudaStreamDestroy(stream));
+
+        return t;
+    }
+
+
+
+    template double cuml_trustworthiness(float* X, float* X_embedded, int n, int m, int d, int n_neighbors);
+    //template double cuml_trustworthiness(double* X, double* X_embedded, int n, int m, int d, int n_neighbors);
+    // Disabled for now as knn only takes floats
+
+}
\ No newline at end of file
diff --git a/cuML/src/metrics/trustworthiness.h b/cuML/src/metrics/trustworthiness.h
new file mode 100644
index 0000000000..aaad5eae2a
--- /dev/null
+++ b/cuML/src/metrics/trustworthiness.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+// Maximum number of samples whose pairwise distances are computed per batch
+#define MAX_BATCH_SIZE 512
+
+namespace ML {
+
+    /**
+     * @brief Compute the trustworthiness score of an embedding
+     * @input param X: Data in original dimension
+     * @input param X_embedded: Data in target dimension (embedding)
+     * @input param n: Number of samples
+     * @input param m: Number of features in high/original dimension
+     * @input param d: Number of features in low/embedded dimension
+     * @input param n_neighbors: Number of neighbors considered by trustworthiness score
+     * @return Trustworthiness score
+     */
+    template <typename math_t>
+    double cuml_trustworthiness(math_t* X, math_t* X_embedded, int n, int m, int d, int n_neighbors);
+
+}
\ No newline at end of file
diff --git a/python/cuml/metrics/__init__.py b/python/cuml/metrics/__init__.py
new file mode 100644
index 0000000000..98d6a0fd8e
--- /dev/null
+++ b/python/cuml/metrics/__init__.py
@@ -0,0 +1,17 @@
+#
+# Copyright (c) 2019, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from cuml.metrics.trustworthiness import trustworthiness
\ No newline at end of file
diff --git a/python/cuml/metrics/trustworthiness.pyx b/python/cuml/metrics/trustworthiness.pyx
new file mode 100644
index 0000000000..46023f07cd
--- /dev/null
+++ b/python/cuml/metrics/trustworthiness.pyx
@@ -0,0 +1,77 @@
+#
+# Copyright (c) 2019, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+# cython: language_level = 3
+
+import cudf
+import numpy as np
+
+from numba import cuda
+
+from libc.stdint cimport uintptr_t
+
+cdef extern from "metrics/trustworthiness.h" namespace "ML":
+    cdef double cuml_trustworthiness[T](T* X, T* X_embedded, int n, int m, int d, int n_neighbors)
+
+
+def trustworthiness(X, X_embedded, n_neighbors=5):
+    """
+    Expresses to what extent the local structure is retained in embedding.
+    The score is defined in the range [0, 1].
+
+    Parameters
+    ----------
+        X : cuDF DataFrame or Numpy array (n_samples, n_features)
+            Data in original dimension
+
+        X_embedded : cuDF DataFrame or Numpy array (n_samples, n_components)
+            Data in target dimension (embedding)
+
+        n_neighbors : int, optional (default: 5)
+            Number of neighbors considered
+
+    Returns
+    -------
+        trustworthiness score : double
+            Trustworthiness of the low-dimensional embedding
+
+    Raises
+    ------
+        TypeError
+            If X and X_embedded differ in dtype or are not float32
+        ValueError
+            If X and X_embedded do not hold the same number of samples
+    """
+    n, m = X.shape
+    d = X_embedded.shape[1]
+
+    # The native routine indexes both matrices with the same sample count
+    if X_embedded.shape[0] != n:
+        raise ValueError("X and X_embedded must have the same number of samples")
+
+    if X.dtype != X_embedded.dtype:
+        raise TypeError("X and X_embedded parameters must be of same type")
+
+    # currently only float32 is available (the double instantiation is
+    # disabled on the C++ side)
+    if X.dtype != np.float32 or X_embedded.dtype != np.float32:
+        raise TypeError("X and X_embedded parameters must be of type float32")
+
+    # Hold references to the device arrays so they cannot be garbage
+    # collected while the native code is still reading through the pointers
+    dev_X = cuda.to_device(X)
+    dev_X_embedded = cuda.to_device(X_embedded)
+
+    cdef uintptr_t d_X = get_ctype_ptr(dev_X)
+    cdef uintptr_t d_X_embedded = get_ctype_ptr(dev_X_embedded)
+
+    return cuml_trustworthiness[float](<float*>d_X, <float*>d_X_embedded,
+                                       n, m, d, n_neighbors)
+
+
+
+def get_ctype_ptr(obj):
+    # The manner to access the pointers in the gdf's might change, so
+    # encapsulating access in the following 3 methods. They might also be
+    # part of future gdf versions.
+    return obj.device_ctypes_pointer.value
\ No newline at end of file
From e9ba355d5d3d83409bbc12a3137bab667ad38563 Mon Sep 17 00:00:00 2001
From: wxbn
Date: Mon, 29 Apr 2019 21:23:59 +0000
Subject: [PATCH 010/156] Random projection

---
 cuML/CMakeLists.txt                       |   3 +-
 cuML/src/random_projection/rproj.cu       |  31 ++
 cuML/src/random_projection/rproj.hxx      | 218 ++++++++++++
 cuML/src/random_projection/rproj_c.h      |  77 +++++
 cuML/src/random_projection/utils.hxx      | 155 +++++++++
 ml-prims/src/linalg/cusparse_wrappers.h   |  84 +++++
 ml-prims/src/random/rng.h                 |   2 +-
 python/cuml/random_projection/__init__.py |  18 +
 python/cuml/random_projection/rproj.pxi   |  28 ++
 python/cuml/random_projection/rproj.pyx   | 392 ++++++++++++++++++++++
 10 files changed, 1006 insertions(+), 2 deletions(-)
 create mode 100644 cuML/src/random_projection/rproj.cu
 create mode 100644 cuML/src/random_projection/rproj.hxx
 create mode 100644 cuML/src/random_projection/rproj_c.h
 create mode 100644 cuML/src/random_projection/utils.hxx
 create mode 100644 ml-prims/src/linalg/cusparse_wrappers.h
 create mode 100644 python/cuml/random_projection/__init__.py
 create mode 100644 python/cuml/random_projection/rproj.pxi
 create mode 100644 python/cuml/random_projection/rproj.pyx

diff --git a/cuML/CMakeLists.txt b/cuML/CMakeLists.txt
index 2927d66f47..6f4382fb10 100644
--- a/cuML/CMakeLists.txt
+++ b/cuML/CMakeLists.txt
@@ -216,7 +216,8 @@ add_library(${CUML_CPP_TARGET} SHARED
             src/common/cumlHandle.cpp
             src/common/cuml_api.cpp
             src/umap/umap.cu
-            src/solver/solver.cu)
+            src/solver/solver.cu
+            src/random_projection/rproj.cu)
 
 set(CUML_LINK_LIBRARIES
     ${CUDA_cublas_LIBRARY}
diff --git a/cuML/src/random_projection/rproj.cu b/cuML/src/random_projection/rproj.cu
new file mode 100644
index
0000000000..f591255550
--- /dev/null
+++ b/cuML/src/random_projection/rproj.cu
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2018, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ #include "rproj.hxx"
+ #include "rproj_c.h"
+
+
+namespace ML {
+
+    using namespace MLCommon;
+
+    // Explicit instantiations: the templates are defined in rproj.hxx and
+    // only declared in rproj_c.h, so single and double precision are
+    // instantiated here
+    template void RPROJfit<float>(rand_mat<float> *random_matrix, paramsRPROJ* params);
+    template void RPROJfit<double>(rand_mat<double> *random_matrix, paramsRPROJ* params);
+    template void RPROJtransform<float>(float *input, rand_mat<float> *random_matrix, float *output, paramsRPROJ* params);
+    template void RPROJtransform<double>(double *input, rand_mat<double> *random_matrix, double *output, paramsRPROJ* params);
+
+ };
\ No newline at end of file
diff --git a/cuML/src/random_projection/rproj.hxx b/cuML/src/random_projection/rproj.hxx
new file mode 100644
index 0000000000..e7e1a2782b
--- /dev/null
+++ b/cuML/src/random_projection/rproj.hxx
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2018, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "rproj_c.h"
+#include "utils.hxx"
+#include
+#include
+#include
+#include
+
+namespace ML {
+
+    using namespace MLCommon;
+    using namespace MLCommon::LinAlg;
+
+    /**
+     * @brief generates a gaussian random matrix
+     * @output param random_matrix: the random matrix to be allocated and generated
+     * @input param params: data structure that includes all the parameters of the model
+     * @input param stream: cuda stream
+     */
+    template <typename math_t>
+    void gaussian_random_matrix(rand_mat<math_t> *random_matrix, paramsRPROJ& params,
+                                    cudaStream_t stream)
+    {
+        // size_t keeps n_components * n_features from wrapping a 32-bit int
+        size_t len = size_t(params.n_components) * params.n_features;
+        allocate(random_matrix->dense_data, len);
+        auto rng = Random::Rng(params.random_state);
+        // Entries are drawn from N(0, 1 / n_components)
+        math_t scale = 1.0 / sqrt(double(params.n_components));
+        rng.normal(random_matrix->dense_data, len, math_t(0), scale, stream);
+    }
+
+    /**
+     * @brief generates a sparse random matrix
+     * @output param random_matrix: the random matrix to be allocated and generated
+     * @input param params: data structure that includes all the parameters of the model
+     * @input param stream: cuda stream
+     */
+    template <typename math_t>
+    void sparse_random_matrix(rand_mat<math_t> *random_matrix, paramsRPROJ& params,
+                                    cudaStream_t stream)
+    {
+        if (params.density == 1.0f)
+        {
+            // Fully dense case: every entry is +/- scale, stored as a dense matrix
+            size_t len = size_t(params.n_components) * params.n_features;
+            allocate(random_matrix->dense_data, len);
+            auto rproj_rng = RPROJ_Rng(params.random_state);
+            math_t scale = 1.0 / sqrt(math_t(params.n_components));
+            rproj_rng.sparse_rand_gen(random_matrix->dense_data, len, scale, stream);
+        }
+        else
+        {
+            ML::cumlHandle h;
+            auto alloc = h.getHostAllocator();
+
+            // Pass 1: draw the number of non-zeros of every component first,
+            // so that the index buffer can be sized exactly. Sizing it from
+            // the expected count (plus a margin) could be overrun by the
+            // binomial draws.
+            size_t indptr_len = size_t(params.n_components) + 1;
+            size_t indptr_alloc = indptr_len * sizeof(int);
+            int* indptr = (int*)alloc->allocate(indptr_alloc, stream);
+
+            size_t offset = 0;
+            for (int i = 0; i < params.n_components; i++)
+            {
+                indptr[i] = offset;
+                offset += binomial(params.n_features, params.density);
+            }
+            indptr[params.n_components] = offset;
+            const size_t nnz = offset;
+
+            // Pass 2: pick the row indices of every component
+            size_t indices_alloc = (nnz > 0 ? nnz : 1) * sizeof(int);
+            int* indices = (int*)alloc->allocate(indices_alloc, stream);
+            size_t indices_idx = 0;
+            for (int i = 0; i < params.n_components; i++)
+            {
+                size_t n_nonzero = size_t(indptr[i + 1] - indptr[i]);
+                sample_without_replacement(params.n_features, n_nonzero, indices, indices_idx);
+            }
+
+            allocate(random_matrix->indices, nnz);
+            updateDevice(random_matrix->indices, indices, nnz, stream);
+
+            allocate(random_matrix->indptr, indptr_len);
+            updateDevice(random_matrix->indptr, indptr, indptr_len, stream);
+
+            // The copies were issued on `stream` and may still be in flight;
+            // wait before handing the staging buffers back to the allocator
+            CUDA_CHECK(cudaStreamSynchronize(stream));
+            alloc->deallocate(indices, indices_alloc, stream);
+            alloc->deallocate(indptr, indptr_alloc, stream);
+
+            allocate(random_matrix->sparse_data, nnz);
+            auto rproj_rng = RPROJ_Rng(params.random_state);
+            math_t scale = sqrt(1.0 / params.density) / sqrt(double(params.n_components));
+            rproj_rng.sparse_rand_gen(random_matrix->sparse_data, nnz, scale, stream);
+
+            random_matrix->sparse_data_size = nnz;
+        }
+    }
+
+    /**
+     * @brief fits the model by generating appropriate random matrix
+     * @output param random_matrix: the random matrix to be allocated and generated
+     * @input param params: data structure that includes all the parameters of the model
+     */
+    template <typename math_t>
+    void RPROJfit(rand_mat<math_t> *random_matrix, paramsRPROJ* params)
+    {
+        random_matrix->reset();
+
+        // Validate before acquiring the stream so that a failed check does
+        // not leak it
+        build_parameters(*params);
+        check_parameters(*params);
+
+        cudaStream_t stream;
+        CUDA_CHECK(cudaStreamCreate(&stream));
+
+        if (params->gaussian_method)
+        {
+            gaussian_random_matrix(random_matrix, *params, stream);
+        }
+        else
+        {
+            sparse_random_matrix(random_matrix, *params, stream);
+        }
+
+        // The matrix is generated on this private stream; wait for it so that
+        // callers may use the result from any other stream
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+        CUDA_CHECK(cudaStreamDestroy(stream));
+    }
+
+    /**
+     * @brief
transforms data according to generated random matrix
+     * @input param input: unprojected original dataset
+     * @input param random_matrix: the random matrix previously generated by RPROJfit
+     * @output param output: projected dataset
+     * @input param params: data structure that includes all the parameters of the model
+     */
+    template <typename math_t>
+    void RPROJtransform(math_t *input, rand_mat<math_t> *random_matrix, math_t *output,
+                            paramsRPROJ* params)
+    {
+        // Validate everything before acquiring CUDA resources so that a
+        // failed check does not leak the stream
+        check_parameters(*params);
+        ASSERT(random_matrix->dense_data || random_matrix->sparse_data,
+                "Could not find a random matrix. Please perform a fit operation before applying transformation");
+
+        cudaStream_t stream;
+        CUDA_CHECK(cudaStreamCreate(&stream));
+
+        const math_t alfa = 1;
+        const math_t beta = 0;
+
+        // output (m x n) = input (m x k) * random_matrix (k x n)
+        const int m = params->n_samples;
+        const int n = params->n_components;
+        const int k = params->n_features;
+
+        const int lda = m;
+        const int ldc = m;
+
+        if (random_matrix->dense_data)
+        {
+            cublasHandle_t cublas_handle;
+            CUBLAS_CHECK(cublasCreate(&cublas_handle));
+
+            const int ldb = k;
+
+            CUBLAS_CHECK(cublasgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
+                &alfa, input, lda, random_matrix->dense_data, ldb, &beta, output, ldc, stream));
+
+            CUBLAS_CHECK(cublasDestroy(cublas_handle));
+        }
+        else
+        {
+            cusparseHandle_t cusparse_handle;
+            CUSPARSE_CHECK(cusparseCreate(&cusparse_handle));
+            CUSPARSE_CHECK(cusparseSetStream(cusparse_handle, stream));
+
+            // The cuSPARSE wrapper takes the non-zero count as an int
+            const int nnz = int(random_matrix->sparse_data_size);
+
+            CUSPARSE_CHECK(cusparsegemmi(cusparse_handle, m, n, k, nnz, &alfa, input, lda,
+                                    random_matrix->sparse_data, random_matrix->indptr,
+                                    random_matrix->indices, &beta, output, ldc));
+
+            CUSPARSE_CHECK(cusparseDestroy(cusparse_handle));
+        }
+
+        // The product is computed on this private stream; wait for it so that
+        // `output` is complete when the function returns
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+        CUDA_CHECK(cudaStreamDestroy(stream));
+    }
+
+    /** @} */
+};
+// end namespace ML
\ No newline at end of file
diff --git a/cuML/src/random_projection/rproj_c.h b/cuML/src/random_projection/rproj_c.h
new file mode 100644
index 0000000000..5db8fb053f
--- /dev/null
+++ b/cuML/src/random_projection/rproj_c.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+namespace ML{
+
+    /**
+     * @defgroup paramsRPROJ: structure holding parameters used by random projection model
+     * @param n_samples: Number of samples
+     * @param n_features: Number of features (original dimension)
+     * @param n_components: Number of components (target dimension)
+     * @param eps: error tolerance used to decide automatically of n_components
+     * @param gaussian_method: boolean describing random matrix generation method
+     * @param density: Density of the random matrix
+     * @param dense_output: boolean describing sparsity of transformed matrix
+     * @param random_state: seed used by random generator
+     * @{
+     */
+    struct paramsRPROJ
+    {
+        int n_samples;
+        int n_features;
+        int n_components;
+        double eps;
+        bool gaussian_method;
+        double density;
+        bool dense_output;
+        int random_state;
+    };
+    /** @} */
+
+    /**
+     * @brief Random projection matrix, stored either dense or as sparse CSC
+     *
+     * The object owns its buffers: reset() releases them and is also invoked
+     * by the destructor. Copying is disabled because two copies would release
+     * the same buffers twice.
+     */
+    template <typename math_t>
+    struct rand_mat
+    {
+        rand_mat()
+            : dense_data(nullptr), indices(nullptr), indptr(nullptr),
+                sparse_data(nullptr), sparse_data_size(0)
+        {}
+
+        rand_mat(const rand_mat&) = delete;
+        rand_mat& operator=(const rand_mat&) = delete;
+
+        ~rand_mat()
+        {
+            this->reset();
+        }
+
+        // For dense matrices
+        math_t *dense_data;
+
+        // For sparse CSC matrices
+        int *indices;
+        int *indptr;
+        math_t *sparse_data;
+        size_t sparse_data_size;
+
+        // Releases every buffer and returns the object to its empty state
+        void reset();
+    };
+
+    /**
+     * @brief fits the model by generating appropriate random matrix
+     * @output param random_matrix: the random matrix to be allocated and generated
+     * @input param params: data structure that includes all the parameters of the model
+     */
+    template <typename math_t>
+    void RPROJfit(rand_mat<math_t> *random_matrix, paramsRPROJ* params);
+
+    /**
+     * @brief transforms data according to generated random matrix
+     * @input param input: unprojected original dataset
+     * @input param random_matrix: the random matrix generated by RPROJfit
+     * @output param output: projected dataset
+     * @input param params: data structure that includes all the parameters of the model
+     */
+    template <typename math_t>
+    void RPROJtransform(math_t *input, rand_mat<math_t> *random_matrix,
+                            math_t *output, paramsRPROJ* params);
+
+}
\ No newline at end of file
diff --git a/cuML/src/random_projection/utils.hxx b/cuML/src/random_projection/utils.hxx
new file mode 100644
index 0000000000..8d816c4c7b
--- /dev/null
+++ b/cuML/src/random_projection/utils.hxx
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2018, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "rproj_c.h"
+#include
+#include <cmath>
+#include <random>
+#include <type_traits>
+#include <unordered_set>
+
+using namespace MLCommon::Random;
+
+namespace MLCommon {
+    namespace Random {
+        /**
+         * @brief Rng extension producing values of fixed magnitude and random sign
+         */
+        class RPROJ_Rng : protected Rng {
+            public:
+                RPROJ_Rng(int random_state)
+                    : Rng(random_state)
+                {}
+
+                /**
+                 * @brief fills an array with -scale or +scale
+                 * @output param ptr: device array to fill
+                 * @input param len: number of elements to generate
+                 * @input param scale: magnitude of every generated value
+                 * @input param stream: cuda stream
+                 */
+                template <typename Type, typename LenType = int>
+                void sparse_rand_gen(Type *ptr, LenType len, Type scale,
+                                        cudaStream_t stream)
+                {
+                    static_assert(std::is_floating_point<Type>::value,
+                        "Type for 'sparse_rand_gen' can only be floating point type!");
+                    // The generated value is compared against 0.5 to choose the sign
+                    randImpl(offset, ptr, len,
+                            [=] __device__(Type val, LenType idx) {
+                                return val < Type(0.5) ? -scale : scale;
+                            },
+                            NumThreads, nBlocks, type, stream);
+                }
+        };
+    }
+}
+
+/**
+ * @brief draws n_samples distinct indexes in [0, n_population) and appends them to indices
+ * @input param n_population: size of the population to draw from
+ * @input param n_samples: number of distinct indexes to draw (at most n_population)
+ * @output param indices: host array receiving the drawn indexes
+ * @input/output param indices_idx: write position in indices, advanced by n_samples
+ * @note seeded from std::random_device, so draws are not reproducible across runs
+ */
+inline void sample_without_replacement(size_t n_population, size_t n_samples,
+                                        int* indices, size_t& indices_idx)
+{
+    // Asking for more distinct values than exist would make the rejection
+    // loop below spin forever
+    ASSERT(n_samples <= n_population,
+        "Parameter n_samples: cannot exceed n_population when sampling without replacement");
+    if (n_samples == 0)
+    {
+        return;
+    }
+
+    std::random_device dev;
+    std::mt19937 gen(dev());
+    std::uniform_int_distribution<int> uni_dist(0, n_population-1);
+
+    std::unordered_set<int> s;
+
+    for (size_t i = 0; i < n_samples; i++)
+    {
+        // Redraw until an index that has not been used yet comes up
+        int rand_idx = uni_dist(gen);
+        while (s.find(rand_idx) != s.end())
+        {
+            rand_idx = uni_dist(gen);
+        }
+        s.insert(rand_idx);
+        indices[indices_idx] = rand_idx;
+        indices_idx++;
+    }
+}
+
+/**
+ * @brief draws one value from a binomial distribution
+ * @input param n: number of trials
+ * @input param p: success probability of each trial
+ * @return number of successes, in [0, n]
+ */
+inline size_t binomial(size_t n, double p)
+{
+    std::random_device dev;
+    std::mt19937 gen(dev());
+    std::binomial_distribution<size_t> bin_dist(n, p);
+    return bin_dist(gen);
+}
+
+/**
+ * @brief resolves the density of the sparse random matrix
+ * @input param density: requested density, or -1.0 to select it automatically
+ * @input param n_features: number of features (original dimension)
+ * @return 1 / sqrt(n_features) when density is -1.0, the requested density otherwise
+ */
+inline double check_density(double density, size_t n_features)
+{
+    if (density == -1.0)
+    {
+        return 1.0 / sqrt(double(n_features));
+    }
+    return density;
+}
+
+/**
+ * @brief computes minimum target dimension to preserve information according to error tolerance (eps parameter)
+ * @input
param n_samples: number of samples
+ * @input param eps: error tolerance
+ * @return minimum target dimension
+ */
+inline size_t johnson_lindenstrauss_min_dim(size_t n_samples, double eps)
+{
+    ASSERT(eps > 0.0 && eps < 1.0,
+            "Parameter eps: must be in range ]0, 1[");
+    ASSERT(n_samples > 0,
+            "Parameter n_samples: must be strictly positive");
+
+    // 4 * ln(n_samples) / (eps^2 / 2 - eps^3 / 3), truncated towards zero
+    double denominator = (pow(eps, 2.0) / 2.0) - (pow(eps, 3) / 3.0);
+    size_t res = size_t(4.0 * log(double(n_samples)) / denominator);
+    return res;
+}
+
+namespace ML{
+
+    /**
+     * @brief validates the parameters of the model, asserting on any invalid value
+     * @input param params: data structure that includes all the parameters of the model
+     */
+    inline void check_parameters(paramsRPROJ& params)
+    {
+        ASSERT(params.n_components > 0,
+            "Parameter n_components: must be strictly positive");
+
+        ASSERT(params.n_features > 0,
+            "Parameter n_features: must be strictly positive");
+
+        ASSERT(params.n_features >= params.n_components,
+            "Parameters n_features and n_components: n_features must superior "
+            "or equal to n_components. If you set eps parameter, please modify its value."
+            "\nCurrent values :\n\tn_features : %d\n\tn_components : %d\n\teps : %lf",
+            params.n_features, params.n_components, params.eps);
+
+        // density is only used by the sparse method
+        ASSERT(params.gaussian_method || (params.density > 0.0 && params.density <= 1.0),
+            "Parameter density: must be in range ]0, 1]");
+    }
+
+    /**
+     * @brief fills in the parameters that were left to be chosen automatically
+     * @input/output param params: n_components == -1 is replaced by the
+     *        Johnson-Lindenstrauss minimum dimension; for the sparse method
+     *        density == -1.0 is replaced by 1 / sqrt(n_features)
+     */
+    inline void build_parameters(paramsRPROJ& params)
+    {
+        if (params.n_components == -1)
+        {
+            params.n_components = int(johnson_lindenstrauss_min_dim(params.n_samples, params.eps));
+        }
+        if (!params.gaussian_method)
+        {
+            params.density = check_density(params.density, params.n_features);
+        }
+    }
+
+    /**
+     * @brief releases every buffer of the random matrix and returns it to its empty state
+     */
+    template <typename math_t>
+    void rand_mat<math_t>::reset()
+    {
+        if (this->dense_data)
+            CUDA_CHECK(cudaFree(this->dense_data));
+        if (this->indices)
+            CUDA_CHECK(cudaFree(this->indices));
+        if (this->indptr)
+            CUDA_CHECK(cudaFree(this->indptr));
+        if (this->sparse_data)
+            CUDA_CHECK(cudaFree(this->sparse_data));
+
+        this->dense_data = nullptr;
+        this->indices = nullptr;
+        this->indptr = nullptr;
+        this->sparse_data = nullptr;
+        this->sparse_data_size = 0;
+    }
+}
\ No newline at end of file
diff --git
a/ml-prims/src/linalg/cusparse_wrappers.h b/ml-prims/src/linalg/cusparse_wrappers.h
new file mode 100644
index 0000000000..7ad6a51d3e
--- /dev/null
+++ b/ml-prims/src/linalg/cusparse_wrappers.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstdio>
+#include <cstdlib>
+#include <cusparse.h>
+
+namespace MLCommon {
+namespace LinAlg {
+
+/**
+ * check for cusparse runtime API errors and assert accordingly
+ *
+ * On failure the numeric status and its location are printed to stderr,
+ * followed by the status name when it is a known one, and the process exits
+ * with code 1. Wrapped in do/while(0) so that it behaves as one statement.
+ */
+#define CUSPARSE_CHECK(call)                                                   \
+  do {                                                                         \
+    cusparseStatus_t cusparse_check_err_ = (call);                             \
+    if (cusparse_check_err_ != CUSPARSE_STATUS_SUCCESS) {                      \
+      const char *cusparse_check_name_ = NULL;                                 \
+      fprintf(stderr, "Got CUSPARSE error %d at %s:%d\n",                      \
+              (int)cusparse_check_err_, __FILE__, __LINE__);                   \
+      switch (cusparse_check_err_) {                                           \
+        case CUSPARSE_STATUS_NOT_INITIALIZED:                                  \
+          cusparse_check_name_ = "CUSPARSE_STATUS_NOT_INITIALIZED";            \
+          break;                                                               \
+        case CUSPARSE_STATUS_ALLOC_FAILED:                                     \
+          cusparse_check_name_ = "CUSPARSE_STATUS_ALLOC_FAILED";               \
+          break;                                                               \
+        case CUSPARSE_STATUS_INVALID_VALUE:                                    \
+          cusparse_check_name_ = "CUSPARSE_STATUS_INVALID_VALUE";              \
+          break;                                                               \
+        case CUSPARSE_STATUS_ARCH_MISMATCH:                                    \
+          cusparse_check_name_ = "CUSPARSE_STATUS_ARCH_MISMATCH";              \
+          break;                                                               \
+        case CUSPARSE_STATUS_MAPPING_ERROR:                                    \
+          cusparse_check_name_ = "CUSPARSE_STATUS_MAPPING_ERROR";              \
+          break;                                                               \
+        case CUSPARSE_STATUS_EXECUTION_FAILED:                                 \
+          cusparse_check_name_ = "CUSPARSE_STATUS_EXECUTION_FAILED";           \
+          break;                                                               \
+        case CUSPARSE_STATUS_INTERNAL_ERROR:                                   \
+          cusparse_check_name_ = "CUSPARSE_STATUS_INTERNAL_ERROR";             \
+          break;                                                               \
+        case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:                        \
+          cusparse_check_name_ = "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";  \
+          break;                                                               \
+        default:                                                               \
+          break;                                                               \
+      }                                                                        \
+      if (cusparse_check_name_ != NULL) {                                      \
+        fprintf(stderr, "%s\n", cusparse_check_name_);                         \
+      }                                                                        \
+      exit(1);                                                                 \
+    }                                                                          \
+  } while (0)
+
+/**
+ * @brief single precision overload of the dense matrix x sparse CSC matrix
+ * product; forwards every argument unchanged to cusparseSgemmi
+ */
+inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, int k, int nnz,
+                        const float *alpha, const float *A, int lda,
+                        const float *cscValB, const int *cscColPtrB,
+                        const int *cscRowIndB, const float *beta,
+                        float *C, int ldc)
+{
+    return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
+                            cscRowIndB, beta, C, ldc);
+}
+
+/**
+ * @brief double precision overload of the dense matrix x sparse CSC matrix
+ * product; forwards every argument unchanged to cusparseDgemmi
+ */
+inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n, int k, int nnz,
+                        const double *alpha, const double *A, int lda,
+                        const double *cscValB, const int *cscColPtrB,
+                        const int *cscRowIndB, const double *beta,
+                        double *C, int ldc)
+{
+    return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB,
+                            cscRowIndB, beta, C, ldc);
+}
+
+
+/** @} */
+
+}; // namespace LinAlg
+}; // namespace MLCommon
\ No newline at end of file
diff --git a/ml-prims/src/random/rng.h b/ml-prims/src/random/rng.h
index 3af893a0ac..f99d65de7c 100644
--- a/ml-prims/src/random/rng.h
+++ b/ml-prims/src/random/rng.h
@@ -422,7 +422,7 @@ class Rng {
              NumThreads, nBlocks, type, stream);
   }
 
-private:
+protected:
   /** generator type */
   GeneratorType type;
   /**
diff --git a/python/cuml/random_projection/__init__.py b/python/cuml/random_projection/__init__.py
new file mode 100644
index 0000000000..648753d875
--- /dev/null
+++ b/python/cuml/random_projection/__init__.py
@@ -0,0 +1,18 @@
+#
+# Copyright (c) 2019, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from cuml.random_projection.rproj import GaussianRandomProjection +from cuml.random_projection.rproj import SparseRandomProjection \ No newline at end of file diff --git a/python/cuml/random_projection/rproj.pxi b/python/cuml/random_projection/rproj.pxi new file mode 100644 index 0000000000..d254af3066 --- /dev/null +++ b/python/cuml/random_projection/rproj.pxi @@ -0,0 +1,28 @@ +cdef extern from "random_projection/rproj_c.h" namespace "ML": + + # Structure holding random projection hyperparameters + cdef struct paramsRPROJ: + int n_samples # number of samples + int n_features # number of features (original dimension) + int n_components # number of components (target dimension) + double eps # error tolerance according to Johnson-Lindenstrauss lemma + bool gaussian_method # toggle Gaussian or Sparse random projection methods + double density # ratio of non-zero component in the random projection matrix (used for sparse random projection) + bool dense_output # toggle random projection's transformation as a dense or sparse matrix + int random_state # seed used by random generator + + # Structure describing random matrix + cdef cppclass rand_mat[T]: + rand_mat() except + # random matrix structure constructor (set all to nullptr) + T *dense_data # dense random matrix data + int *indices # sparse CSC random matrix indices + int *indptr # sparse CSC random matrix indptr + T *sparse_data # sparse CSC random matrix data + size_t sparse_data_size # sparse CSC random matrix number of non-zero elements + + # Method used to fit the model + cdef void 
cdef class BaseRandomProjection():
    """
    Base class for random projections.
    This class is not intended to be used directly.

    Random projection is a dimensionality reduction technique. Random
    projection methods are known for their simplicity, computational
    efficiency and restricted model size. This algorithm also preserves
    distances well between any two samples and is thus suitable for methods
    having this requirement.

    Subclasses must set the ``gaussian_method`` and ``density`` attributes
    before calling this class' ``__init__``, which copies them into the
    parameter structure.

    Parameters
    ----------

    n_components : int (default = 'auto')
        Dimensionality of the target projection space. If set to 'auto',
        the parameter is deduced using the Johnson–Lindenstrauss lemma.
        The automatic deduction makes use of the number of samples and
        the eps parameter.

        The Johnson–Lindenstrauss lemma can produce a very conservative
        n_components parameter as it makes no assumption on dataset
        structure.

    eps : float (default = 0.1)
        Error tolerance during projection. Used by the Johnson–Lindenstrauss
        automatic deduction when n_components is set to 'auto'.

    dense_output : boolean (default = True)
        If set to True transformed matrix will be dense otherwise sparse.

    random_state : int (default = 42)
        Seed used to initialize the random generator

    Attributes
    ----------
    params : Cython structure
        Structure holding model's hyperparameters

    rand_matS/rand_matD : Cython pointers to structures
        Structures holding pointers to data describing random matrix.
        S for single precision (float) and D for double.

    Notes
    ------
    Inspired from sklearn's implementation :
    https://scikit-learn.org/stable/modules/random_projection.html

    """

    cdef paramsRPROJ params
    cdef rand_mat[float]* rand_matS
    cdef rand_mat[double]* rand_matD

    def __cinit__(self):
        # One random-matrix holder per supported precision; only the one
        # matching the fitted dtype is populated by fit().
        self.rand_matS = new rand_mat[float]()
        self.rand_matD = new rand_mat[double]()

    def __dealloc__(self):
        del self.rand_matS
        del self.rand_matD

    def __init__(self, n_components='auto', eps=0.1, dense_output=True,
                 random_state=42):
        # -1 is the sentinel passed to the C++ layer for 'auto'.
        self.params.n_components = n_components if n_components != 'auto' else -1
        self.params.eps = eps
        self.params.dense_output = dense_output
        self.params.random_state = random_state

        # Provided by the concrete subclass before it calls this initializer.
        self.params.gaussian_method = self.gaussian_method
        self.params.density = self.density

    # Gets device pointer from Numba's Cuda array
    def _get_ctype_ptr(self, obj):
        # The manner to access the pointers in the gdf's might change, so
        # encapsulating access in the following 3 methods. They might also be
        # part of future gdf versions.
        return obj.device_ctypes_pointer.value

    # Gets device pointer from cuDF dataframe's column
    def _get_column_ptr(self, obj):
        return self._get_ctype_ptr(obj._column._data.to_gpu_array())

    # Rejects element types the C++ layer has no instantiation for
    def _check_dtype(self):
        # Only float and double versions of RPROJfit/RPROJtransform are
        # called below. Sending any other dtype down the double path would
        # make the C++ code read/write buffers with the wrong element size.
        if self.gdf_datatype.type not in (np.float32, np.float64):
            raise TypeError("X matrix dtype not supported: %s"
                            % self.gdf_datatype)

    def fit(self, X, y=None):
        """
        Fit the model. This function generates the random matrix on GPU.

        Parameters
        ----------
        X : cuDF DataFrame or Numpy array
            Dense matrix (floats or doubles) of shape (n_samples, n_features)
            Used to provide shape information

        Returns
        -------
        The transformer itself with deduced 'auto' parameters and
        generated random matrix as attributes

        Raises
        ------
        TypeError
            If X is neither a cuDF DataFrame nor a Numpy array, or if its
            element type is neither float32 nor float64.

        """
        if (isinstance(X, cudf.DataFrame)):
            self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
            n_samples = len(X)
            n_features = len(X._cols)

        elif (isinstance(X, np.ndarray)):
            self.gdf_datatype = X.dtype
            n_samples, n_features = X.shape

        else:
            msg = "X matrix format not supported"
            raise TypeError(msg)

        self._check_dtype()

        self.params.n_samples = n_samples
        self.params.n_features = n_features

        if self.gdf_datatype.type == np.float32:
            RPROJfit[float](self.rand_matS, &self.params)
        else:
            RPROJfit[double](self.rand_matD, &self.params)

        return self

    def transform(self, X):
        """
        Apply transformation on provided data. This function outputs
        a multiplication between the input matrix and the generated
        random matrix

        Parameters
        ----------
        X : cuDF DataFrame or Numpy array
            Dense matrix (floats or doubles) of shape (n_samples, n_features)
            Used as input matrix

        Returns
        -------
        The output projected matrix of shape (n_samples, n_components)
        Result of multiplication between input matrix and random matrix

        Raises
        ------
        TypeError
            If X is neither a cuDF DataFrame nor a Numpy array, or if its
            element type is neither float32 nor float64.
        ValueError
            If the number of features differs from the one seen by fit().

        """
        if (isinstance(X, cudf.DataFrame)):
            self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype)
            X_m = X.as_gpu_matrix()
            n_samples = len(X)
            n_features = len(X._cols)

        elif (isinstance(X, np.ndarray)):
            self.gdf_datatype = X.dtype
            X_m = cuda.to_device(X)
            n_samples, n_features = X.shape

        else:
            msg = "X matrix format not supported"
            raise TypeError(msg)

        self._check_dtype()

        # Validate before allocating the output so a bad call costs nothing.
        if self.params.n_features != n_features:
            raise ValueError("n_features must be same as on fitting: %d" %
                             self.params.n_features)

        X_new = cuda.to_device(np.zeros((n_samples, self.params.n_components),
                                        dtype=self.gdf_datatype))

        cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m)
        cdef uintptr_t output_ptr = self._get_ctype_ptr(X_new)

        # The device addresses are carried as integers and must be cast back
        # to typed pointers for the templated C++ entry points.
        if self.gdf_datatype.type == np.float32:
            RPROJtransform[float](<float*> input_ptr,
                                  self.rand_matS,
                                  <float*> output_ptr,
                                  &self.params)
        else:
            RPROJtransform[double](<double*> input_ptr,
                                   self.rand_matD,
                                   <double*> output_ptr,
                                   &self.params)

        return X_new
class GaussianRandomProjection(BaseRandomProjection):
    """
    Gaussian random projection, built on top of BaseRandomProjection.

    Random projection is a dimensionality reduction technique. Random
    projection methods are known for their simplicity, computational
    efficiency and restricted model size. The algorithm preserves distances
    well between any two samples and is therefore suitable for methods
    relying on that property.

    The components of the random matrix are drawn from N(0, 1 / n_components).

    Example
    ---------

    .. code-block:: python
        from cuml.random_projection import GaussianRandomProjection
        from sklearn.datasets.samples_generator import make_blobs
        from sklearn.svm import SVC

        # dataset generation
        data, target = make_blobs(n_samples=800, centers=400, n_features=3000, random_state=0)

        # model fitting
        model = GaussianRandomProjection(n_components=5, random_state=42).fit(data)

        # dataset transformation
        transformed_data = model.transform(data)

        # classifier training
        classifier = SVC(gamma=0.001).fit(transformed_data, target)

        # classifier scoring
        score = classifier.score(transformed_data, target)

        # measure information preservation
        print("Score: {}".format(score))

    Output:

    .. code-block:: python
        Score: 1.0

    Parameters
    ----------

    n_components : int (default = 'auto')
        Dimensionality of the target projection space. With 'auto', the
        value is deduced from the Johnson–Lindenstrauss lemma using the
        number of samples and the eps parameter.

        The Johnson–Lindenstrauss lemma can yield a very conservative
        n_components as it makes no assumption on dataset structure.

    eps : float (default = 0.1)
        Error tolerance during projection. Used by the Johnson–Lindenstrauss
        automatic deduction when n_components is set to 'auto'.

    random_state : int (default = 42)
        Seed used to initialize the random generator

    Attributes
    ----------
    gaussian_method : boolean
        Read by the base class to select the random matrix generation
        method

    Notes
    ------
    Inspired from sklearn's implementation :
    https://scikit-learn.org/stable/modules/random_projection.html

    """

    def __init__(self, n_components='auto', eps=0.1, random_state=42):
        # The base initializer reads both attributes, so they have to exist
        # before it runs. density is not used by the Gaussian method.
        self.gaussian_method, self.density = True, -1.0

        base_arguments = dict(n_components=n_components,
                              eps=eps,
                              dense_output=True,
                              random_state=random_state)
        super().__init__(**base_arguments)
class SparseRandomProjection(BaseRandomProjection):
    """
    Sparse random projection, built on top of BaseRandomProjection.

    Random projection is a dimensionality reduction technique. Random
    projection methods are known for their simplicity, computational
    efficiency and restricted model size. The algorithm preserves distances
    well between any two samples and is therefore suitable for methods
    relying on that property.

    A sparse random matrix is an alternative to a dense random projection
    matrix (e.g. Gaussian) that guarantees similar embedding quality while
    being much more memory efficient and allowing faster computation of the
    projected data (with sparse enough matrices).
    Writing 's = 1 / density', the components of the random matrix are
    drawn from:
        - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
        -  0                              with probability 1 - 1 / s
        - +sqrt(s) / sqrt(n_components)   with probability 1 / 2s

    Example
    ---------

    .. code-block:: python
        from cuml.random_projection import SparseRandomProjection
        from sklearn.datasets.samples_generator import make_blobs
        from sklearn.svm import SVC

        # dataset generation
        data, target = make_blobs(n_samples=800, centers=400, n_features=3000, random_state=0)

        # model fitting
        model = SparseRandomProjection(n_components=5, random_state=42).fit(data)

        # dataset transformation
        transformed_data = model.transform(data)

        # classifier training
        classifier = SVC(gamma=0.001).fit(transformed_data, target)

        # classifier scoring
        score = classifier.score(transformed_data, target)

        # measure information preservation
        print("Score: {}".format(score))

    Output:

    .. code-block:: python
        Score: 1.0

    Parameters
    ----------

    n_components : int (default = 'auto')
        Dimensionality of the target projection space. With 'auto', the
        value is deduced from the Johnson–Lindenstrauss lemma using the
        number of samples and the eps parameter.

        The Johnson–Lindenstrauss lemma can yield a very conservative
        n_components as it makes no assumption on dataset structure.

    density : float in range (0, 1] (default = 'auto')
        Ratio of non-zero components in the random projection matrix.

        If density = 'auto', the value is set to the minimum density
        as recommended by Ping Li et al.: 1 / sqrt(n_features).

    eps : float (default = 0.1)
        Error tolerance during projection. Used by the Johnson–Lindenstrauss
        automatic deduction when n_components is set to 'auto'.

    dense_output : boolean (default = True)
        If set to True transformed matrix will be dense otherwise sparse.

    random_state : int (default = 42)
        Seed used to initialize the random generator

    Attributes
    ----------
    gaussian_method : boolean
        Read by the base class to select the random matrix generation
        method

    Notes
    ------
    Inspired from sklearn's implementation :
    https://scikit-learn.org/stable/modules/random_projection.html

    """

    def __init__(self, n_components='auto', density='auto', eps=0.1,
                 dense_output=True, random_state=42):
        # Both attributes are read by the base initializer, so they must be
        # assigned before delegating to it.
        self.gaussian_method = False

        # -1.0 is the sentinel forwarded to the C++ layer for 'auto'.
        if density != 'auto':
            self.density = density
        else:
            self.density = -1.0

        base_arguments = dict(n_components=n_components,
                              eps=eps,
                              dense_output=dense_output,
                              random_state=random_state)
        super().__init__(**base_arguments)
+ */ + +#include "random_projection/rproj_c.h" +#include +#include +#include +#include +#include + +namespace ML { + +using namespace MLCommon; + +template +class RPROJTest: public ::testing::Test { +protected: + void gaussianTest() { + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + const int N = 1000; + const int M = 100; + const int D = 10; + + params1 = new paramsRPROJ(); + *params1 = { + N, // number of samples + M, // number of features + D, // number of components + 0.1, // error tolerance + true, // gaussian or sparse method + -1.0, // auto density + false, // not used + 42 // random seed + }; + + std::vector v(N*M); + std::generate(v.begin(), v.end(), std::rand); + allocate(d_input1, v.size()); + updateDevice(d_input1, v.data(), v.size(), stream); + allocate(d_output1, N*D); + random_matrix1 = new rand_mat(); + RPROJfit(random_matrix1, params1); + RPROJtransform(d_input1, random_matrix1, d_output1, params1); + + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + + void sparseTest() { + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + const int N = 500; + const int M = 700; + + params2 = new paramsRPROJ(); + *params2 = { + N, // number of samples + M, // number of features + -1, // number of components (-1: auto-deduction) + 0.3, // error tolerance + false, // gaussian or sparse method + -1.0, // auto density (-1: auto-deduction) + false, // not used + 42 // random seed + }; + + std::vector v(N*M); + std::generate(v.begin(), v.end(), std::rand); + allocate(d_input2, v.size()); + updateDevice(d_input2, v.data(), v.size(), stream); + random_matrix2 = new rand_mat(); + RPROJfit(random_matrix2, params2); + allocate(d_output2, N*params2->n_components); + RPROJtransform(d_input2, random_matrix2, d_output2, params2); + + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void SetUp() override { + gaussianTest(); + sparseTest(); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(d_input1)); + CUDA_CHECK(cudaFree(d_output1)); + 
CUDA_CHECK(cudaFree(d_input2)); + CUDA_CHECK(cudaFree(d_output2)); + delete params1; + delete random_matrix1; + delete params2; + delete random_matrix2; + } + +protected: + paramsRPROJ* params1; + T* d_input1; + rand_mat *random_matrix1; + T* d_output1; + + paramsRPROJ* params2; + T* d_input2; + rand_mat *random_matrix2; + T* d_output2; +}; + + +typedef RPROJTest RPROJTestF; +TEST_F(RPROJTestF, Result) { + int M = params1->n_features; + int D = params1->n_components; + ASSERT_TRUE(D == 10); + ASSERT_TRUE(random_matrix1->dense_data); + + float* h_dense_data = new float[M*D]; + updateHost(h_dense_data, random_matrix1->dense_data, M*D, NULL); + ASSERT_TRUE(!std::any_of(h_dense_data, &h_dense_data[M*D-1], [](int i){return i < -1.0 || i > 1.0;})); + delete[] h_dense_data; + + int N = params2->n_samples; + M = params2->n_features; + D = params2->n_components; + ASSERT_TRUE(D == 690); + ASSERT_TRUE(params2->density == 1/sqrt(M)); + ASSERT_TRUE(random_matrix2->indices); + ASSERT_TRUE(random_matrix2->indptr); + ASSERT_TRUE(random_matrix2->sparse_data); + ASSERT_TRUE(random_matrix2->sparse_data_size = N*D); +} + +typedef RPROJTest RPROJTestD; +TEST_F(RPROJTestD, Result) { + int M = params1->n_features; + int D = params1->n_components; + ASSERT_TRUE(D == 10); + ASSERT_TRUE(random_matrix1->dense_data); + + double* h_dense_data = new double[M*D]; + updateHost(h_dense_data, random_matrix1->dense_data, M*D, NULL); + ASSERT_TRUE(!std::any_of(h_dense_data, &h_dense_data[M*D-1], [](int i){return i < -1.0 || i > 1.0;})); + delete[] h_dense_data; + + int N = params2->n_samples; + M = params2->n_features; + D = params2->n_components; + ASSERT_TRUE(D == 690); + ASSERT_TRUE(params2->density == 1/sqrt(M)); + ASSERT_TRUE(random_matrix2->indices); + ASSERT_TRUE(random_matrix2->indptr); + ASSERT_TRUE(random_matrix2->sparse_data); + ASSERT_TRUE(random_matrix2->sparse_data_size = N*D); +} + +} // end namespace ML From 91ea8e43d891e93a1a1fd608bc9b79be2c2f6ca5 Mon Sep 17 00:00:00 2001 From: wxbn 
Date: Mon, 29 Apr 2019 23:23:20 +0000 Subject: [PATCH 012/156] Integrating scaled_bernoulli in Rng class --- cuML/src/random_projection/rproj.hxx | 8 ++++---- cuML/src/random_projection/utils.hxx | 24 ------------------------ ml-prims/src/random/rng.h | 22 +++++++++++++++++++++- 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/cuML/src/random_projection/rproj.hxx b/cuML/src/random_projection/rproj.hxx index e7e1a2782b..83d3e46c2d 100644 --- a/cuML/src/random_projection/rproj.hxx +++ b/cuML/src/random_projection/rproj.hxx @@ -63,9 +63,9 @@ namespace ML { { int len = params.n_components * params.n_features; allocate(random_matrix->dense_data, len); - auto rproj_rng = RPROJ_Rng(params.random_state); + auto rng = Random::Rng(params.random_state); math_t scale = 1.0 / sqrt(math_t(params.n_components)); - rproj_rng.sparse_rand_gen(random_matrix->dense_data, len, scale, stream); + rng.scaled_bernoulli(random_matrix->dense_data, len, math_t(0.5), scale, stream); } else { @@ -104,9 +104,9 @@ namespace ML { len = offset; allocate(random_matrix->sparse_data, len); - auto rproj_rng = RPROJ_Rng(params.random_state); + auto rng = Random::Rng(params.random_state); math_t scale = sqrt(1.0 / params.density) / sqrt(params.n_components); - rproj_rng.sparse_rand_gen(random_matrix->sparse_data, len, scale, stream); + rng.scaled_bernoulli(random_matrix->sparse_data, len, math_t(0.5), scale, stream); random_matrix->sparse_data_size = len; } diff --git a/cuML/src/random_projection/utils.hxx b/cuML/src/random_projection/utils.hxx index 8d816c4c7b..b6d61caf96 100644 --- a/cuML/src/random_projection/utils.hxx +++ b/cuML/src/random_projection/utils.hxx @@ -21,30 +21,6 @@ using namespace MLCommon::Random; -namespace MLCommon { - namespace Random { - class RPROJ_Rng : protected Rng { - public: - RPROJ_Rng(int random_state) - : Rng(random_state) - {} - - template - void sparse_rand_gen(Type *ptr, LenType len, Type scale, - cudaStream_t stream) - { - 
static_assert(std::is_floating_point::value, - "Type for 'uniform' can only be floating point type!"); - randImpl(offset, ptr, len, - [=] __device__(Type val, LenType idx) { - return val < Type(0.5) ? -scale : scale; - }, - NumThreads, nBlocks, type, stream); - } - }; - } -} - inline void sample_without_replacement(size_t n_population, size_t n_samples, int* indices, size_t& indices_idx) { diff --git a/ml-prims/src/random/rng.h b/ml-prims/src/random/rng.h index f99d65de7c..4abefa9351 100644 --- a/ml-prims/src/random/rng.h +++ b/ml-prims/src/random/rng.h @@ -283,6 +283,26 @@ class Rng { NumThreads, nBlocks, type, stream); } + /** + * @brief Generate bernoulli distributed array and applies scale + * @tparam Type data type in which to compute the probabilities + * @tparam LenType data type used to represent length of the arrays + * @param ptr the output array + * @param len the number of elements in the output + * @param prob coin-toss probability for heads + * @param stream stream where to launch the kernel + */ + template + void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale, + cudaStream_t stream) + { + static_assert(std::is_floating_point::value, + "Type for 'uniform' can only be floating point type!"); + randImpl(offset, ptr, len, + [=] __device__(Type val, LenType idx) { return val < prob ? 
-scale : scale; }, + NumThreads, nBlocks, type, stream); + } + /** * @brief Generate gumbel distributed random numbers * @tparam Type data type of output random number @@ -422,7 +442,7 @@ class Rng { NumThreads, nBlocks, type, stream); } -protected: +private: /** generator type */ GeneratorType type; /** From ae8604319b5d8fea87d003084a89df079c8009e6 Mon Sep 17 00:00:00 2001 From: wxbn Date: Tue, 30 Apr 2019 01:29:14 +0000 Subject: [PATCH 013/156] Changelog update --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bc87bf59d..f4459b7d35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ - PR #455: Remove default-stream arguement across ml-prims and cuML - PR #375: cuml cpp shared library renamed to libcuml++.so - PR #491: added doxygen build target for ml-prims +- PR #515: Added Random Projection feature ## Improvements From 8ffe5c10fc529412ce9b4ed33f3a0ba62a9cec17 Mon Sep 17 00:00:00 2001 From: wxbn Date: Tue, 30 Apr 2019 21:57:23 +0000 Subject: [PATCH 014/156] Multiple fixes (no tests yet) --- cuML/src/random_projection/rproj.cu | 10 +-- cuML/src/random_projection/rproj.hxx | 53 +++++------ cuML/src/random_projection/rproj_c.h | 11 ++- cuML/src/random_projection/utils.hxx | 39 ++++----- ml-prims/src/linalg/cusparse_wrappers.h | 2 +- python/cuml/random_projection/__init__.py | 5 +- python/cuml/random_projection/rproj.pxi | 28 ------ python/cuml/random_projection/rproj.pyx | 102 +++++++++++++++++----- 8 files changed, 137 insertions(+), 113 deletions(-) delete mode 100644 python/cuml/random_projection/rproj.pxi diff --git a/cuML/src/random_projection/rproj.cu b/cuML/src/random_projection/rproj.cu index f591255550..24665cf308 100644 --- a/cuML/src/random_projection/rproj.cu +++ b/cuML/src/random_projection/rproj.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2019, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,9 +23,9 @@ namespace ML { using namespace MLCommon; - template void RPROJfit(rand_mat *random_matrix, paramsRPROJ* params); - template void RPROJfit(rand_mat *random_matrix, paramsRPROJ* params); - template void RPROJtransform(float *input, rand_mat *random_matrix, float *output, paramsRPROJ* params); - template void RPROJtransform(double *input, rand_mat *random_matrix, double *output, paramsRPROJ* params); + template void RPROJfit(cumlHandle& handle, rand_mat *random_matrix, paramsRPROJ* params); + template void RPROJfit(cumlHandle& handle, rand_mat *random_matrix, paramsRPROJ* params); + template void RPROJtransform(cumlHandle& handle, float *input, rand_mat *random_matrix, float *output, paramsRPROJ* params); + template void RPROJtransform(cumlHandle& handle, double *input, rand_mat *random_matrix, double *output, paramsRPROJ* params); }; \ No newline at end of file diff --git a/cuML/src/random_projection/rproj.hxx b/cuML/src/random_projection/rproj.hxx index 83d3e46c2d..e5dcbe1601 100644 --- a/cuML/src/random_projection/rproj.hxx +++ b/cuML/src/random_projection/rproj.hxx @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2019, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,14 +34,15 @@ namespace ML { /** * @brief generates a gaussian random matrix + * @input param h: cuML handle * @output param random_matrix: the random matrix to be allocated and generated * @input param params: data structure that includes all the parameters of the model - * @input param stream: cuda stream */ template - void gaussian_random_matrix(rand_mat *random_matrix, paramsRPROJ& params, - cudaStream_t stream) + void gaussian_random_matrix(cumlHandle& h, rand_mat *random_matrix, + paramsRPROJ& params) { + cudaStream_t stream = h.getStream(); int len = params.n_components * params.n_features; allocate(random_matrix->dense_data, len); auto rng = Random::Rng(params.random_state); @@ -51,14 +52,16 @@ namespace ML { /** * @brief generates a sparse random matrix + * @input param h: cuML handle * @output param random_matrix: the random matrix to be allocated and generated * @input param params: data structure that includes all the parameters of the model - * @input param stream: cuda stream */ template - void sparse_random_matrix(rand_mat *random_matrix, paramsRPROJ& params, - cudaStream_t stream) + void sparse_random_matrix(cumlHandle& h, rand_mat *random_matrix, + paramsRPROJ& params) { + cudaStream_t stream = h.getStream(); + if (params.density == 1.0f) { int len = params.n_components * params.n_features; @@ -69,7 +72,6 @@ namespace ML { } else { - ML::cumlHandle h; auto alloc = h.getHostAllocator(); double max_total_density = params.density * 1.2; @@ -114,18 +116,13 @@ namespace ML { /** * @brief fits the model by generating appropriate random matrix + * @input param handle: cuML handle * @output param random_matrix: the random matrix to be allocated and generated * @input param params: data structure that includes all the parameters of the model */ template - void RPROJfit(rand_mat *random_matrix, paramsRPROJ* params) + void RPROJfit(cumlHandle& handle, rand_mat *random_matrix, paramsRPROJ* params) { - cublasHandle_t cublas_handle; - 
CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - random_matrix->reset(); build_parameters(*params); @@ -133,37 +130,33 @@ namespace ML { if (params->gaussian_method) { - gaussian_random_matrix(random_matrix, *params, stream); + gaussian_random_matrix(handle, random_matrix, *params); } else { - sparse_random_matrix(random_matrix, *params, stream); + sparse_random_matrix(handle, random_matrix, *params); } - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); } /** * @brief transforms data according to generated random matrix + * @input param handle: cuML handle * @input param input: unprojected original dataset * @input param random_matrix: the random matrix to be allocated and generated * @output param output: projected dataset * @input param params: data structure that includes all the parameters of the model */ template - void RPROJtransform(math_t *input, rand_mat *random_matrix, math_t *output, - paramsRPROJ* params) + void RPROJtransform(cumlHandle& handle, math_t *input, rand_mat *random_matrix, + math_t *output, paramsRPROJ* params) { - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); + cudaStream_t stream = handle.getStream(); check_parameters(*params); if (random_matrix->dense_data) { - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); + cublasHandle_t cublas_handle = handle.getImpl().getCublasHandle(); const math_t alfa = 1; const math_t beta = 0; @@ -179,12 +172,10 @@ namespace ML { CUBLAS_CHECK(cublasgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alfa, input, lda, random_matrix->dense_data, ldb, &beta, output, ldc, stream)); - CUBLAS_CHECK(cublasDestroy(cublas_handle)); } else if (random_matrix->sparse_data) { - cusparseHandle_t cusparse_handle; - CUSPARSE_CHECK(cusparseCreate(&cusparse_handle)); + cusparseHandle_t cusparse_handle = handle.getImpl().getcusparseHandle(); 
CUSPARSE_CHECK(cusparseSetStream(cusparse_handle, stream)); const math_t alfa = 1; @@ -201,16 +192,12 @@ namespace ML { CUSPARSE_CHECK(cusparsegemmi(cusparse_handle, m, n, k, nnz, &alfa, input, lda, random_matrix->sparse_data, random_matrix->indptr, random_matrix->indices, &beta, output, ldc)); - - CUSPARSE_CHECK(cusparseDestroy(cusparse_handle)); } else { ASSERT(false, "Could not find a random matrix. Please perform a fit operation before applying transformation"); } - - CUDA_CHECK(cudaStreamDestroy(stream)); } /** @} */ diff --git a/cuML/src/random_projection/rproj_c.h b/cuML/src/random_projection/rproj_c.h index 5db8fb053f..af0715e0a2 100644 --- a/cuML/src/random_projection/rproj_c.h +++ b/cuML/src/random_projection/rproj_c.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2019, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #pragma once +#include namespace ML{ @@ -68,10 +69,14 @@ namespace ML{ }; template - void RPROJfit(rand_mat *random_matrix, paramsRPROJ* params); + void RPROJfit(cumlHandle& handle, rand_mat *random_matrix, + paramsRPROJ* params); template - void RPROJtransform(math_t *input, rand_mat *random_matrix, + void RPROJtransform(cumlHandle& handle, math_t *input, + rand_mat *random_matrix, math_t *output, paramsRPROJ* params); + + size_t johnson_lindenstrauss_min_dim(size_t n_samples, double eps); } \ No newline at end of file diff --git a/cuML/src/random_projection/utils.hxx b/cuML/src/random_projection/utils.hxx index b6d61caf96..91556f3056 100644 --- a/cuML/src/random_projection/utils.hxx +++ b/cuML/src/random_projection/utils.hxx @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA CORPORATION. + * Copyright (c) 2018-2019, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -60,25 +60,24 @@ inline double check_density(double density, size_t n_features) return density; } -/** - * @brief computes minimum target dimension to preserve information according to error tolerance (eps parameter) - * @input param n_samples: number of samples - * @input param eps: error tolerance - * @return minimum target dimension - */ -inline size_t johnson_lindenstrauss_min_dim(size_t n_samples, double eps) -{ - ASSERT(eps > 0.0 && eps < 1.0, - "Parameter eps: must be in range ]0, 1["); - ASSERT(n_samples > 0, - "Parameter n_samples: must be strictly positive"); - - double denominator = (pow(eps, 2.0) / 2.0) - (pow(eps, 3) / 3.0); - size_t res = 4.0 * log(n_samples) / denominator; - return res; -} - namespace ML{ + /** + * @brief computes minimum target dimension to preserve information according to error tolerance (eps parameter) + * @input param n_samples: number of samples + * @input param eps: error tolerance + * @return minimum target dimension + */ + size_t johnson_lindenstrauss_min_dim(size_t n_samples, double eps) + { + ASSERT(eps > 0.0 && eps < 1.0, + "Parameter eps: must be in range (0, 1)"); + ASSERT(n_samples > 0, + "Parameter n_samples: must be strictly positive"); + + double denominator = (pow(eps, 2.0) / 2.0) - (pow(eps, 3) / 3.0); + size_t res = 4.0 * log(n_samples) / denominator; + return res; + } inline void check_parameters(paramsRPROJ& params) { @@ -95,7 +94,7 @@ namespace ML{ params.n_features, params.n_components, params.eps); ASSERT(params.gaussian_method || (params.density > 0.0 && params.density <= 1.0), - "Parameter density: must be in range ]0, 1]"); + "Parameter density: must be in range (0, 1]"); } inline void build_parameters(paramsRPROJ& params) diff --git a/ml-prims/src/linalg/cusparse_wrappers.h b/ml-prims/src/linalg/cusparse_wrappers.h index 7ad6a51d3e..6f2dc01a2e 100644 --- a/ml-prims/src/linalg/cusparse_wrappers.h +++ b/ml-prims/src/linalg/cusparse_wrappers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, NVIDIA 
CORPORATION. + * Copyright (c) 2018-2019, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/python/cuml/random_projection/__init__.py b/python/cuml/random_projection/__init__.py index 648753d875..7643ca8da6 100644 --- a/python/cuml/random_projection/__init__.py +++ b/python/cuml/random_projection/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,4 +15,5 @@ # from cuml.random_projection.rproj import GaussianRandomProjection -from cuml.random_projection.rproj import SparseRandomProjection \ No newline at end of file +from cuml.random_projection.rproj import SparseRandomProjection +from cuml.random_projection.rproj import johnson_lindenstrauss_min_dim \ No newline at end of file diff --git a/python/cuml/random_projection/rproj.pxi b/python/cuml/random_projection/rproj.pxi deleted file mode 100644 index d254af3066..0000000000 --- a/python/cuml/random_projection/rproj.pxi +++ /dev/null @@ -1,28 +0,0 @@ -cdef extern from "random_projection/rproj_c.h" namespace "ML": - - # Structure holding random projection hyperparameters - cdef struct paramsRPROJ: - int n_samples # number of samples - int n_features # number of features (original dimension) - int n_components # number of components (target dimension) - double eps # error tolerance according to Johnson-Lindenstrauss lemma - bool gaussian_method # toggle Gaussian or Sparse random projection methods - double density # ratio of non-zero component in the random projection matrix (used for sparse random projection) - bool dense_output # toggle random projection's transformation as a dense or sparse matrix - int random_state # seed used by random generator - - # Structure describing random matrix - 
cdef cppclass rand_mat[T]: - rand_mat() except + # random matrix structure constructor (set all to nullptr) - T *dense_data # dense random matrix data - int *indices # sparse CSC random matrix indices - int *indptr # sparse CSC random matrix indptr - T *sparse_data # sparse CSC random matrix data - size_t sparse_data_size # sparse CSC random matrix number of non-zero elements - - # Method used to fit the model - cdef void RPROJfit[T](rand_mat[T] *random_matrix, paramsRPROJ* params) - - # Method used to apply data transformation - cdef void RPROJtransform[T](T *input, rand_mat[T] *random_matrix, - T *output, paramsRPROJ* params) diff --git a/python/cuml/random_projection/rproj.pyx b/python/cuml/random_projection/rproj.pyx index 2c61ee139e..001f071c01 100644 --- a/python/cuml/random_projection/rproj.pyx +++ b/python/cuml/random_projection/rproj.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -27,7 +27,46 @@ from numba import cuda from libc.stdint cimport uintptr_t from libcpp cimport bool -include "rproj.pxi" +from cuml.common.base import Base +from cuml.common.handle cimport cumlHandle + +cdef extern from "random_projection/rproj_c.h" namespace "ML": + + # Structure holding random projection hyperparameters + cdef struct paramsRPROJ: + int n_samples # number of samples + int n_features # number of features (original dimension) + int n_components # number of components (target dimension) + double eps # error tolerance according to Johnson-Lindenstrauss lemma + bool gaussian_method # toggle Gaussian or Sparse random projection methods + double density # ratio of non-zero component in the random projection matrix (used for sparse random projection) + bool dense_output # toggle random projection's transformation as a dense or sparse matrix + int random_state # seed used by random generator + + # Structure describing random matrix + cdef cppclass rand_mat[T]: + rand_mat() except + # random matrix structure constructor (set all to nullptr) + T *dense_data # dense random matrix data + int *indices # sparse CSC random matrix indices + int *indptr # sparse CSC random matrix indptr + T *sparse_data # sparse CSC random matrix data + size_t sparse_data_size # sparse CSC random matrix number of non-zero elements + + # Function used to fit the model + cdef void RPROJfit[T](cumlHandle& handle, rand_mat[T] *random_matrix, + paramsRPROJ* params) + + # Function used to apply data transformation + cdef void RPROJtransform[T](cumlHandle& handle, T *input, + rand_mat[T] *random_matrix, T *output, + paramsRPROJ* params) + + # Function used to compute the Johnson Lindenstrauss minimal distance + cdef size_t c_johnson_lindenstrauss_min_dim "ML::johnson_lindenstrauss_min_dim" (size_t n_samples, double eps) + + +def johnson_lindenstrauss_min_dim(n_samples, eps=0.1): + return c_johnson_lindenstrauss_min_dim(n_samples, eps) cdef class BaseRandomProjection(): """ @@ -58,7 
+97,7 @@ cdef class BaseRandomProjection(): dense_output : boolean (default = True) If set to True transformed matrix will be dense otherwise sparse. - random_state : int (default = 42) + random_state : int (default = None) Seed used to initilize random generator Attributes @@ -88,12 +127,13 @@ cdef class BaseRandomProjection(): del self.rand_matS del self.rand_matD - def __init__(self, n_components='auto', eps=0.1, dense_output=True, - random_state=42): + def __init__(self, n_components='auto', eps=0.1, + dense_output=True, random_state=None): self.params.n_components = n_components if n_components != 'auto' else -1 self.params.eps = eps self.params.dense_output = dense_output - self.params.random_state = random_state + if random_state is not None: + self.params.random_state = random_state self.params.gaussian_method = self.gaussian_method self.params.density = self.density @@ -138,13 +178,16 @@ cdef class BaseRandomProjection(): msg = "X matrix format not supported" raise TypeError(msg) + cdef cumlHandle* handle_ = self.handle.getHandle() self.params.n_samples = n_samples self.params.n_features = n_features if self.gdf_datatype.type == np.float32: - RPROJfit[float](self.rand_matS, &self.params) + RPROJfit[float](handle_[0], self.rand_matS, &self.params) else: - RPROJfit[double](self.rand_matD, &self.params) + RPROJfit[double](handle_[0], self.rand_matD, &self.params) + + self.handle.sync() return self @@ -190,24 +233,30 @@ cdef class BaseRandomProjection(): raise ValueError("n_features must be same as on fitting: %d" % self.params.n_features) + cdef cumlHandle* handle_ = self.handle.getHandle() + if self.gdf_datatype.type == np.float32: - RPROJtransform[float]( input_ptr, + RPROJtransform[float](handle_[0], + input_ptr, self.rand_matS, output_ptr, &self.params) else: - RPROJtransform[double]( input_ptr, + RPROJtransform[double](handle_[0], + input_ptr, self.rand_matD, output_ptr, &self.params) + self.handle.sync() + if (isinstance(X, cudf.DataFrame)): del(X_m) 
return X_new -class GaussianRandomProjection(BaseRandomProjection): +class GaussianRandomProjection(Base, BaseRandomProjection): """ Gaussian Random Projection method derivated from BaseRandomProjection class. @@ -227,7 +276,7 @@ class GaussianRandomProjection(BaseRandomProjection): from sklearn.svm import SVC # dataset generation - data, target = make_blobs(n_samples=800, centers=400, n_features=3000, random_state=0) + data, target = make_blobs(n_samples=800, centers=400, n_features=3000, random_state=42) # model fitting model = GaussianRandomProjection(n_components=5, random_state=42).fit(data) @@ -252,6 +301,9 @@ class GaussianRandomProjection(BaseRandomProjection): Parameters ---------- + handle : cuml.Handle + If it is None, a new one is created just for this class + n_components : int (default = 'auto') Dimensionality of the target projection space. If set to 'auto', the parameter is deducted thanks to Johnson–Lindenstrauss lemma. @@ -265,7 +317,7 @@ class GaussianRandomProjection(BaseRandomProjection): Error tolerance during projection. Used by Johnson–Lindenstrauss automatic deduction when n_components is set to 'auto'. - random_state : int (default = 42) + random_state : int (default = None) Seed used to initilize random generator Attributes @@ -280,18 +332,21 @@ class GaussianRandomProjection(BaseRandomProjection): """ - def __init__(self, n_components='auto', eps=0.1, random_state=42): + def __init__(self, handle=None, n_components='auto', eps=0.1, + random_state=None, verbose=False): + Base.__init__(self, handle, verbose) self.gaussian_method = True self.density = -1.0 # not used - super().__init__( + BaseRandomProjection.__init__( + self, n_components=n_components, eps=eps, dense_output=True, random_state=random_state) -class SparseRandomProjection(BaseRandomProjection): +class SparseRandomProjection(Base, BaseRandomProjection): """ Sparse Random Projection method derivated from BaseRandomProjection class. 
@@ -318,7 +373,7 @@ class SparseRandomProjection(BaseRandomProjection): from sklearn.svm import SVC # dataset generation - data, target = make_blobs(n_samples=800, centers=400, n_features=3000, random_state=0) + data, target = make_blobs(n_samples=800, centers=400, n_features=3000, random_state=42) # model fitting model = SparseRandomProjection(n_components=5, random_state=42).fit(data) @@ -343,6 +398,9 @@ class SparseRandomProjection(BaseRandomProjection): Parameters ---------- + handle : cuml.Handle + If it is None, a new one is created just for this class + n_components : int (default = 'auto') Dimensionality of the target projection space. If set to 'auto', the parameter is deducted thanks to Johnson–Lindenstrauss lemma. @@ -365,7 +423,7 @@ class SparseRandomProjection(BaseRandomProjection): dense_output : boolean (default = True) If set to True transformed matrix will be dense otherwise sparse. - random_state : int (default = 42) + random_state : int (default = None) Seed used to initilize random generator Attributes @@ -380,12 +438,14 @@ class SparseRandomProjection(BaseRandomProjection): """ - def __init__(self, n_components='auto', density='auto', eps=0.1, - dense_output=True, random_state=42): + def __init__(self, handle=None, n_components='auto', density='auto', + eps=0.1, dense_output=True, random_state=None, verbose=False): + Base.__init__(self, handle, verbose) self.gaussian_method = False self.density = density if density != 'auto' else -1.0 - super().__init__( + BaseRandomProjection.__init__( + self, n_components=n_components, eps=eps, dense_output=dense_output, From b92803b1d9519fbd3d9b31bf407c42f9ecc067c5 Mon Sep 17 00:00:00 2001 From: wxbn Date: Wed, 1 May 2019 21:21:49 +0000 Subject: [PATCH 015/156] Pytests (+ fix) --- ml-prims/src/random/rng.h | 2 +- python/cuml/__init__.py | 2 + python/cuml/random_projection/rproj.pyx | 25 ++++-- python/cuml/test/test_rproj.py | 106 ++++++++++++++++++++++++ 4 files changed, 126 insertions(+), 9 deletions(-) 
create mode 100644 python/cuml/test/test_rproj.py diff --git a/ml-prims/src/random/rng.h b/ml-prims/src/random/rng.h index 4abefa9351..0d1b9610c7 100644 --- a/ml-prims/src/random/rng.h +++ b/ml-prims/src/random/rng.h @@ -299,7 +299,7 @@ class Rng { static_assert(std::is_floating_point::value, "Type for 'uniform' can only be floating point type!"); randImpl(offset, ptr, len, - [=] __device__(Type val, LenType idx) { return val < prob ? -scale : scale; }, + [=] __device__(Type val, LenType idx) { return val > prob ? -scale : scale; }, NumThreads, nBlocks, type, stream); } diff --git a/python/cuml/__init__.py b/python/cuml/__init__.py index 2cad9952f1..f993ca1f0a 100644 --- a/python/cuml/__init__.py +++ b/python/cuml/__init__.py @@ -37,6 +37,8 @@ from cuml.manifold.umap import UMAP +from cuml.random_projection.rproj import GaussianRandomProjection, SparseRandomProjection, johnson_lindenstrauss_min_dim + from ._version import get_versions __version__ = get_versions()['version'] del get_versions diff --git a/python/cuml/random_projection/rproj.pyx b/python/cuml/random_projection/rproj.pyx index 001f071c01..f7047a8d00 100644 --- a/python/cuml/random_projection/rproj.pyx +++ b/python/cuml/random_projection/rproj.pyx @@ -165,12 +165,12 @@ cdef class BaseRandomProjection(): generated random matrix as attributes """ - if (isinstance(X, cudf.DataFrame)): + if isinstance(X, cudf.DataFrame): self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype) n_samples = len(X) n_features = len(X._cols) - elif (isinstance(X, np.ndarray)): + elif isinstance(X, np.ndarray): self.gdf_datatype = X.dtype n_samples, n_features = X.shape @@ -208,23 +208,25 @@ cdef class BaseRandomProjection(): Result of multiplication between input matrix and random matrix """ - if (isinstance(X, cudf.DataFrame)): + + if isinstance(X, cudf.DataFrame): self.gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype) X_m = X.as_gpu_matrix() n_samples = len(X) n_features = len(X._cols) - elif (isinstance(X, 
np.ndarray)): + elif isinstance(X, np.ndarray): self.gdf_datatype = X.dtype - X_m = cuda.to_device(X) + X_m = cuda.to_device(np.array(X, order='F')) n_samples, n_features = X.shape else: msg = "X matrix format not supported" raise TypeError(msg) - X_new = cuda.to_device(np.zeros((n_samples,self.params.n_components), - dtype=self.gdf_datatype)) + X_new = cuda.device_array((n_samples, self.params.n_components), + dtype=self.gdf_datatype, + order='F') cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m) cdef uintptr_t output_ptr = self._get_ctype_ptr(X_new) @@ -252,8 +254,15 @@ cdef class BaseRandomProjection(): if (isinstance(X, cudf.DataFrame)): del(X_m) + h_X_new = X_new.copy_to_host() + del(X_new) + gdf_X_new = cudf.DataFrame() + for i in range(0, h_X_new.shape[1]): + gdf_X_new[str(i)] = h_X_new[:,i] + return gdf_X_new - return X_new + else: + return X_new.copy_to_host() class GaussianRandomProjection(Base, BaseRandomProjection): diff --git a/python/cuml/test/test_rproj.py b/python/cuml/test/test_rproj.py new file mode 100644 index 0000000000..09f44594f6 --- /dev/null +++ b/python/cuml/test/test_rproj.py @@ -0,0 +1,106 @@ +# Copyright (c) 2018-2019, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import pytest +from cuml.random_projection import GaussianRandomProjection, SparseRandomProjection +from cuml.random_projection import johnson_lindenstrauss_min_dim as cuml_johnson_lindenstrauss_min_dim +from sklearn.random_projection import johnson_lindenstrauss_min_dim as sklearn_johnson_lindenstrauss_min_dim +from sklearn.datasets.samples_generator import make_blobs +from scipy.spatial.distance import pdist + +import cudf +import numpy as np + + +@pytest.mark.parametrize('datatype', [np.float32, np.float64]) +@pytest.mark.parametrize('input_type', ['dataframe', 'ndarray']) +@pytest.mark.parametrize('method', ['gaussian', 'sparse']) +def test_rproj_fit(datatype, input_type, method): + # dataset generation + data, target = make_blobs(n_samples=800, centers=400, n_features=3000) + + #conversion to input_type + data = data.astype(datatype) + target = target.astype(datatype) + + # creation of model + if method == 'gaussian': + model = GaussianRandomProjection(eps=0.2) + else: + model = SparseRandomProjection(eps=0.2) + + # fitting the model + if input_type == 'dataframe': + gdf = cudf.DataFrame() + for i in range(data.shape[1]): + gdf[str(i)] = np.asarray(data[:,i], dtype=datatype) + model.fit(gdf) + else: + model.fit(data) + + assert True # Did not crash + + +@pytest.mark.parametrize('datatype', [np.float32, np.float64]) +@pytest.mark.parametrize('input_type', ['dataframe', 'ndarray']) +@pytest.mark.parametrize('method', ['gaussian', 'sparse']) +def test_rproj_fit_transform(datatype, input_type, method): + eps = 0.2 + + # dataset generation + data, target = make_blobs(n_samples=800, centers=400, n_features=3000) + + #conversion to input_type + data = data.astype(datatype) + target = target.astype(datatype) + + # creation of model + if method == 'gaussian': + model = GaussianRandomProjection(eps=eps) + else: + model = SparseRandomProjection(eps=eps) + + # fitting the model + if input_type == 'dataframe': + gdf = cudf.DataFrame() + for i in range(data.shape[1]): 
+ gdf[str(i)] = np.asarray(data[:,i], dtype=datatype) + model.fit(gdf) + else: + model.fit(data) + + # applying transformation + if input_type == 'dataframe': + transformed_data = model.transform(gdf).as_matrix() + else: + transformed_data = model.transform(data) + + original_pdist = pdist(data) + embedded_pdist = pdist(transformed_data) + + # check JL lemma + assert np.all(((1.0 - eps) * original_pdist) <= embedded_pdist) and np.all(embedded_pdist <= ((1.0 + eps) * original_pdist)) + + +def test_johnson_lindenstrauss_min_dim(): + n_tests = 10000 + n_samples = np.random.randint(low=50, high=1e10, size=n_tests) + eps_values = np.random.rand(n_tests) + 1e-17 # range (0,1) + tests = zip(n_samples, eps_values) + + for n_samples, eps in tests: + cuml_value = cuml_johnson_lindenstrauss_min_dim(n_samples, eps) + sklearn_value = sklearn_johnson_lindenstrauss_min_dim(n_samples, eps) + assert cuml_value == sklearn_value \ No newline at end of file From 5422e740a8971d71b6d72baf61908950e1457d3d Mon Sep 17 00:00:00 2001 From: wxbn Date: Thu, 2 May 2019 01:26:02 +0000 Subject: [PATCH 016/156] C++ tests --- cuML/test/rproj_test.cu | 356 +++++++++++++++++++++++++--------------- 1 file changed, 222 insertions(+), 134 deletions(-) diff --git a/cuML/test/rproj_test.cu b/cuML/test/rproj_test.cu index 24cafa2ea1..c6a4b9b9a8 100644 --- a/cuML/test/rproj_test.cu +++ b/cuML/test/rproj_test.cu @@ -15,156 +15,244 @@ */ #include "random_projection/rproj_c.h" +#include "distance/distance.h" +#include "linalg/transpose.h" #include #include #include #include #include +#include namespace ML { using namespace MLCommon; -template +template class RPROJTest: public ::testing::Test { -protected: - void gaussianTest() { - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - const int N = 1000; - const int M = 100; - const int D = 10; - - params1 = new paramsRPROJ(); - *params1 = { - N, // number of samples - M, // number of features - D, // number of components - 0.1, // error tolerance - 
true, // gaussian or sparse method - -1.0, // auto density - false, // not used - 42 // random seed - }; - - std::vector v(N*M); - std::generate(v.begin(), v.end(), std::rand); - allocate(d_input1, v.size()); - updateDevice(d_input1, v.data(), v.size(), stream); - allocate(d_output1, N*D); - random_matrix1 = new rand_mat(); - RPROJfit(random_matrix1, params1); - RPROJtransform(d_input1, random_matrix1, d_output1, params1); - - CUDA_CHECK(cudaStreamDestroy(stream)); - } - - - void sparseTest() { - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - const int N = 500; - const int M = 700; - - params2 = new paramsRPROJ(); - *params2 = { - N, // number of samples - M, // number of features - -1, // number of components (-1: auto-deduction) - 0.3, // error tolerance - false, // gaussian or sparse method - -1.0, // auto density (-1: auto-deduction) - false, // not used - 42 // random seed - }; - - std::vector v(N*M); - std::generate(v.begin(), v.end(), std::rand); - allocate(d_input2, v.size()); - updateDevice(d_input2, v.data(), v.size(), stream); - random_matrix2 = new rand_mat(); - RPROJfit(random_matrix2, params2); - allocate(d_output2, N*params2->n_components); - RPROJtransform(d_input2, random_matrix2, d_output2, params2); - - CUDA_CHECK(cudaStreamDestroy(stream)); - } - - void SetUp() override { - gaussianTest(); - sparseTest(); - } - - void TearDown() override { - CUDA_CHECK(cudaFree(d_input1)); - CUDA_CHECK(cudaFree(d_output1)); - CUDA_CHECK(cudaFree(d_input2)); - CUDA_CHECK(cudaFree(d_output2)); - delete params1; - delete random_matrix1; - delete params2; - delete random_matrix2; - } - -protected: - paramsRPROJ* params1; - T* d_input1; - rand_mat *random_matrix1; - T* d_output1; - - paramsRPROJ* params2; - T* d_input2; - rand_mat *random_matrix2; - T* d_output2; + + protected: + + T* transpose(T* in, int n_rows, int n_cols) { + cudaStream_t stream = h.getStream(); + cublasHandle_t cublas_handle = h.getImpl().getCublasHandle(); + T* result; + 
allocate(result, n_rows * n_cols); + MLCommon::LinAlg::transpose(in, result, n_rows, n_cols, cublas_handle, stream); + CUDA_CHECK(cudaPeekAtLastError()); + CUDA_CHECK(cudaFree(in)); + return result; + } + + void generate_data() { + std::random_device rd; + std::mt19937 rng(rd()); + std::uniform_real_distribution dist(0, 1); + + h_input.resize(N*M); + for (auto& i : h_input) { + i = dist(rng); + } + allocate(d_input, h_input.size()); + updateDevice(d_input, h_input.data(), h_input.size(), NULL); + //d_input = transpose(d_input, N, M); + // From row major to column major (this operation is only useful for non-random datasets) + } + + void gaussianTest() { + params1 = new paramsRPROJ(); + *params1 = { + N, // number of samples + M, // number of features + -1, // number of components + epsilon, // error tolerance + true, // gaussian or sparse method + -1.0, // auto density + false, // not used + 42 // random seed + }; + + random_matrix1 = new rand_mat(); + RPROJfit(h, random_matrix1, params1); + allocate(d_output1, N*params1->n_components); + RPROJtransform(h, d_input, random_matrix1, d_output1, params1); + d_output1 = transpose(d_output1, N, params1->n_components); // From column major to row major + } + + void sparseTest() { + params2 = new paramsRPROJ(); + *params2 = { + N, // number of samples + M, // number of features + -1, // number of components (-1: auto-deduction) + epsilon, // error tolerance + false, // gaussian or sparse method + -1.0, // auto density (-1: auto-deduction) + false, // not used + 42 // random seed + }; + + random_matrix2 = new rand_mat(); + RPROJfit(h, random_matrix2, params2); + allocate(d_output2, N*params2->n_components); + RPROJtransform(h, d_input, random_matrix2, d_output2, params2); + d_output2 = transpose(d_output2, N, params2->n_components); // From column major to row major + } + + void SetUp() override { + epsilon = 0.2; + generate_data(); + gaussianTest(); + sparseTest(); + } + + void TearDown() override { + 
CUDA_CHECK(cudaFree(d_input)); + CUDA_CHECK(cudaFree(d_output1)); + CUDA_CHECK(cudaFree(d_output2)); + delete params1; + delete random_matrix1; + delete params2; + delete random_matrix2; + } + + void random_matrix_check() { + size_t D = johnson_lindenstrauss_min_dim(N, epsilon); + + ASSERT_TRUE(params1->n_components == D); + ASSERT_TRUE(random_matrix1->dense_data); + + ASSERT_TRUE(params2->n_components == D); + ASSERT_TRUE(params2->density == 1/sqrt(M)); + ASSERT_TRUE(random_matrix2->indices); + ASSERT_TRUE(random_matrix2->indptr); + ASSERT_TRUE(random_matrix2->sparse_data); + ASSERT_TRUE(random_matrix2->sparse_data_size = N * D); + } + + void epsilon_check() { + size_t D = johnson_lindenstrauss_min_dim(N, epsilon); + + constexpr auto distance_type = MLCommon::Distance::DistanceType::EucUnexpandedL2Sqrt; + size_t workspaceSize = 0; + typedef cutlass::Shape<8, 128, 128> OutputTile_t; + + T* d_pdist; + allocate(d_pdist, N*N); + + //d_input = transpose(d_input, M, N); + // Restoring row major (this operation is only useful for non-random datasets) + MLCommon::Distance::distance + (d_input, d_input, + d_pdist, + N, N, M, + (void*)nullptr, workspaceSize, + NULL + ); + CUDA_CHECK(cudaPeekAtLastError()); + + T* h_pdist = new T[N*N]; + updateHost(h_pdist, d_pdist, N*N, NULL); + CUDA_CHECK(cudaFree(d_pdist)); + + T* d_pdist1; + allocate(d_pdist1, N*N); + MLCommon::Distance::distance + (d_output1, d_output1, + d_pdist1, + N, N, D, + (void*)nullptr, workspaceSize, + NULL + ); + CUDA_CHECK(cudaPeekAtLastError()); + + T* h_pdist1 = new T[N*N]; + updateHost(h_pdist1, d_pdist1, N*N, NULL); + CUDA_CHECK(cudaFree(d_pdist1)); + + T* d_pdist2; + allocate(d_pdist2, N*N); + MLCommon::Distance::distance + (d_output2, d_output2, + d_pdist2, + N, N, D, + (void*)nullptr, workspaceSize, + NULL + ); + CUDA_CHECK(cudaPeekAtLastError()); + + T* h_pdist2 = new T[N*N]; + updateHost(h_pdist2, d_pdist2, N*N, NULL); + CUDA_CHECK(cudaFree(d_pdist2)); + + for (size_t i = 0; i < N; i++) + { + for 
(size_t j = 0; j <= i; j++) + { + T pdist = h_pdist[i*N + j]; + T pdist1 = h_pdist1[i*N + j]; + T pdist2 = h_pdist2[i*N + j]; + + T lower_bound = (1.0 - epsilon) * pdist; + T upper_bound = (1.0 + epsilon) * pdist; + + ASSERT_TRUE(lower_bound <= pdist1 && pdist1 <= upper_bound); + ASSERT_TRUE(lower_bound <= pdist2 && pdist2 <= upper_bound); + } + } + + delete[] h_pdist; + delete[] h_pdist1; + delete[] h_pdist2; + } + + protected: + ML::cumlHandle h; + paramsRPROJ* params1; + T epsilon; + + std::vector h_input; + T* d_input; + + rand_mat *random_matrix1; + T* d_output1; + + paramsRPROJ* params2; + rand_mat *random_matrix2; + T* d_output2; }; -typedef RPROJTest RPROJTestF; -TEST_F(RPROJTestF, Result) { - int M = params1->n_features; - int D = params1->n_components; - ASSERT_TRUE(D == 10); - ASSERT_TRUE(random_matrix1->dense_data); - - float* h_dense_data = new float[M*D]; - updateHost(h_dense_data, random_matrix1->dense_data, M*D, NULL); - ASSERT_TRUE(!std::any_of(h_dense_data, &h_dense_data[M*D-1], [](int i){return i < -1.0 || i > 1.0;})); - delete[] h_dense_data; - - int N = params2->n_samples; - M = params2->n_features; - D = params2->n_components; - ASSERT_TRUE(D == 690); - ASSERT_TRUE(params2->density == 1/sqrt(M)); - ASSERT_TRUE(random_matrix2->indices); - ASSERT_TRUE(random_matrix2->indptr); - ASSERT_TRUE(random_matrix2->sparse_data); - ASSERT_TRUE(random_matrix2->sparse_data_size = N*D); +typedef RPROJTest RPROJTestF1; +TEST_F(RPROJTestF1, RandomMatrixCheck) { + random_matrix_check(); +} +TEST_F(RPROJTestF1, EpsilonCheck) { + epsilon_check(); +} + +typedef RPROJTest RPROJTestD1; +TEST_F(RPROJTestD1, RandomMatrixCheck) { + random_matrix_check(); +} +TEST_F(RPROJTestD1, EpsilonCheck) { + epsilon_check(); } -typedef RPROJTest RPROJTestD; -TEST_F(RPROJTestD, Result) { - int M = params1->n_features; - int D = params1->n_components; - ASSERT_TRUE(D == 10); - ASSERT_TRUE(random_matrix1->dense_data); - - double* h_dense_data = new double[M*D]; - 
updateHost(h_dense_data, random_matrix1->dense_data, M*D, NULL); - ASSERT_TRUE(!std::any_of(h_dense_data, &h_dense_data[M*D-1], [](int i){return i < -1.0 || i > 1.0;})); - delete[] h_dense_data; - - int N = params2->n_samples; - M = params2->n_features; - D = params2->n_components; - ASSERT_TRUE(D == 690); - ASSERT_TRUE(params2->density == 1/sqrt(M)); - ASSERT_TRUE(random_matrix2->indices); - ASSERT_TRUE(random_matrix2->indptr); - ASSERT_TRUE(random_matrix2->sparse_data); - ASSERT_TRUE(random_matrix2->sparse_data_size = N*D); +typedef RPROJTest RPROJTestF2; +TEST_F(RPROJTestF2, RandomMatrixCheck) { + random_matrix_check(); +} +TEST_F(RPROJTestF2, EpsilonCheck) { + epsilon_check(); +} + +typedef RPROJTest RPROJTestD2; +TEST_F(RPROJTestD2, RandomMatrixCheck) { + random_matrix_check(); +} +TEST_F(RPROJTestD2, EpsilonCheck) { + epsilon_check(); } } // end namespace ML From 4706cea97a35351701901700a539c8aa81f18f25 Mon Sep 17 00:00:00 2001 From: wxbn Date: Thu, 2 May 2019 01:44:38 +0000 Subject: [PATCH 017/156] Fixed Python coding style --- python/cuml/test/test_rproj.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/python/cuml/test/test_rproj.py b/python/cuml/test/test_rproj.py index 09f44594f6..cf438947b7 100644 --- a/python/cuml/test/test_rproj.py +++ b/python/cuml/test/test_rproj.py @@ -14,9 +14,12 @@ # import pytest -from cuml.random_projection import GaussianRandomProjection, SparseRandomProjection -from cuml.random_projection import johnson_lindenstrauss_min_dim as cuml_johnson_lindenstrauss_min_dim -from sklearn.random_projection import johnson_lindenstrauss_min_dim as sklearn_johnson_lindenstrauss_min_dim +from cuml.random_projection import GaussianRandomProjection, \ + SparseRandomProjection +from cuml.random_projection import johnson_lindenstrauss_min_dim \ + as cuml_johnson_lindenstrauss_min_dim +from sklearn.random_projection import johnson_lindenstrauss_min_dim \ + as sklearn_johnson_lindenstrauss_min_dim 
from sklearn.datasets.samples_generator import make_blobs from scipy.spatial.distance import pdist @@ -31,7 +34,7 @@ def test_rproj_fit(datatype, input_type, method): # dataset generation data, target = make_blobs(n_samples=800, centers=400, n_features=3000) - #conversion to input_type + # conversion to input_type data = data.astype(datatype) target = target.astype(datatype) @@ -45,12 +48,12 @@ def test_rproj_fit(datatype, input_type, method): if input_type == 'dataframe': gdf = cudf.DataFrame() for i in range(data.shape[1]): - gdf[str(i)] = np.asarray(data[:,i], dtype=datatype) + gdf[str(i)] = np.asarray(data[:, i], dtype=datatype) model.fit(gdf) else: model.fit(data) - assert True # Did not crash + assert True # Did not crash @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @@ -62,7 +65,7 @@ def test_rproj_fit_transform(datatype, input_type, method): # dataset generation data, target = make_blobs(n_samples=800, centers=400, n_features=3000) - #conversion to input_type + # conversion to input_type data = data.astype(datatype) target = target.astype(datatype) @@ -76,7 +79,7 @@ def test_rproj_fit_transform(datatype, input_type, method): if input_type == 'dataframe': gdf = cudf.DataFrame() for i in range(data.shape[1]): - gdf[str(i)] = np.asarray(data[:,i], dtype=datatype) + gdf[str(i)] = np.asarray(data[:, i], dtype=datatype) model.fit(gdf) else: model.fit(data) @@ -91,16 +94,17 @@ def test_rproj_fit_transform(datatype, input_type, method): embedded_pdist = pdist(transformed_data) # check JL lemma - assert np.all(((1.0 - eps) * original_pdist) <= embedded_pdist) and np.all(embedded_pdist <= ((1.0 + eps) * original_pdist)) + assert (np.all(((1.0 - eps) * original_pdist) <= embedded_pdist) and + np.all(embedded_pdist <= ((1.0 + eps) * original_pdist))) def test_johnson_lindenstrauss_min_dim(): n_tests = 10000 n_samples = np.random.randint(low=50, high=1e10, size=n_tests) - eps_values = np.random.rand(n_tests) + 1e-17 # range (0,1) + eps_values = 
np.random.rand(n_tests) + 1e-17 # range (0,1) tests = zip(n_samples, eps_values) for n_samples, eps in tests: cuml_value = cuml_johnson_lindenstrauss_min_dim(n_samples, eps) sklearn_value = sklearn_johnson_lindenstrauss_min_dim(n_samples, eps) - assert cuml_value == sklearn_value \ No newline at end of file + assert cuml_value == sklearn_value From 269212da9f53398566cde1c050891daccdada439 Mon Sep 17 00:00:00 2001 From: wxbn Date: Fri, 3 May 2019 01:09:47 +0000 Subject: [PATCH 018/156] Pytests +miscellaneous --- CHANGELOG.md | 1 + cuML/CMakeLists.txt | 3 +- cuML/src/metrics/trustworthiness.cu | 205 ++++++++++++----------- cuML/src/metrics/trustworthiness.h | 10 +- python/cuml/metrics/__init__.py | 2 +- python/cuml/metrics/trustworthiness.pyx | 109 +++++++----- python/cuml/test/test_trustworthiness.py | 58 +++++++ 7 files changed, 250 insertions(+), 138 deletions(-) create mode 100644 python/cuml/test/test_trustworthiness.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b96512d637..de6436338b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ - PR #455: Remove default-stream arguement across ml-prims and cuML - PR #375: cuml cpp shared library renamed to libcuml++.so - PR #491: added doxygen build target for ml-prims +- PR #516: Added Trustworthiness score feature ## Improvements diff --git a/cuML/CMakeLists.txt b/cuML/CMakeLists.txt index 2927d66f47..d99ac35f73 100644 --- a/cuML/CMakeLists.txt +++ b/cuML/CMakeLists.txt @@ -216,7 +216,8 @@ add_library(${CUML_CPP_TARGET} SHARED src/common/cumlHandle.cpp src/common/cuml_api.cpp src/umap/umap.cu - src/solver/solver.cu) + src/solver/solver.cu + src/metrics/trustworthiness.cu) set(CUML_LINK_LIBRARIES ${CUDA_cublas_LIBRARY} diff --git a/cuML/src/metrics/trustworthiness.cu b/cuML/src/metrics/trustworthiness.cu index b04235393e..89e1f8e5c4 100644 --- a/cuML/src/metrics/trustworthiness.cu +++ b/cuML/src/metrics/trustworthiness.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,130 +18,145 @@ #include #include "distance/distance.h" #include +#include #include "../knn/knn.h" using namespace MLCommon; using namespace MLCommon::Selection; using namespace ML; -/** -* @brief Compute a kNN and returns the indexes of the nearest neighbors -* @input param input: Input matrix holding the dataset -* @input param n: Number of samples -* @input param d: Number of features -* @return Matrix holding the indexes of the nearest neighbors -*/ -template -long* get_knn(math_t* input, int n, int d, int n_neighbors) -{ - long* d_pred_I; - math_t* d_pred_D; - allocate(d_pred_I, n*n_neighbors); - allocate(d_pred_D, n*n_neighbors); - - kNNParams params = {input, n}; - kNN knn(d); - knn.fit(¶ms, 1); - knn.search(input, n, d_pred_I, d_pred_D, n_neighbors); - - long* h_pred_I = new long[n*n_neighbors]; - updateHost(h_pred_I, d_pred_I, n*n_neighbors); - - CUDA_CHECK(cudaFree(d_pred_I)); - CUDA_CHECK(cudaFree(d_pred_D)); - return h_pred_I; -} - namespace ML { /** - * @brief Compute the trustworthiness score - * @input param X: Data in original dimension - * @input param X_embedded: Data in target dimension (embedding) + * @brief Compute a kNN and returns the indexes of the nearest neighbors + * @input param input: Input matrix holding the dataset * @input param n: Number of samples - * @input param m: Number of features in high/original dimension - * @input param d: Number of features in low/embedded dimension - * @input param n_neighbors: Number of neighbors considered by trustworthiness score - * @return Trustworthiness score + * @input param d: Number of features + * @return Matrix holding the indexes of the nearest neighbors */ template - double cuml_trustworthiness(math_t* X, math_t* X_embedded, int n, int m, int d, int n_neighbors) + long* get_knn_indexes(const cumlHandle& 
h, math_t* input, int n, + int d, int n_neighbors) { - const int TMP_SIZE = MAX_BATCH_SIZE * n; + cudaStream_t stream = h.getStream(); + auto alloc = h.getHostAllocator(); + + long* d_pred_I; + math_t* d_pred_D; + allocate(d_pred_I, n * n_neighbors); + allocate(d_pred_D, n * n_neighbors); + + kNNParams params = {input, n}; + kNN knn(d); + knn.fit(¶ms, 1); + knn.search(input, n, d_pred_I, d_pred_D, n_neighbors); + + long* h_pred_I = (long*)alloc->allocate(n * n_neighbors * sizeof(long), stream); + updateHost(h_pred_I, d_pred_I, n * n_neighbors, stream); + + CUDA_CHECK(cudaFree(d_pred_I)); + CUDA_CHECK(cudaFree(d_pred_D)); + return h_pred_I; + } - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); + namespace Metrics { + + /** + * @brief Compute the trustworthiness score + * @input param X: Data in original dimension + * @input param X_embedded: Data in target dimension (embedding) + * @input param n: Number of samples + * @input param m: Number of features in high/original dimension + * @input param d: Number of features in low/embedded dimension + * @input param n_neighbors: Number of neighbors considered by trustworthiness score + * @return Trustworthiness score + */ + template + double trustworthiness_score(const cumlHandle& h, math_t* X, + math_t* X_embedded, int n, int m, int d, int n_neighbors) + { + const int TMP_SIZE = MAX_BATCH_SIZE * n; - constexpr auto distance_type = MLCommon::Distance::DistanceType::EucUnexpandedL2Sqrt; - size_t workspaceSize = 0; // EucUnexpandedL2Sqrt does not need any workspace - typedef cutlass::Shape<8, 128, 128> OutputTile_t; - bool bAllocWorkspace = false; + cudaStream_t stream = h.getStream(); + auto alloc = h.getHostAllocator(); - math_t* d_pdist_tmp; - allocate(d_pdist_tmp, TMP_SIZE); - int* d_ind_X_tmp; - allocate(d_ind_X_tmp, TMP_SIZE); - int* h_ind_X = new int[n*n]; + constexpr auto distance_type = MLCommon::Distance::DistanceType::EucUnexpandedL2Sqrt; + size_t workspaceSize = 0; // EucUnexpandedL2Sqrt does not 
need any workspace + typedef cutlass::Shape<8, 128, 128> OutputTile_t; + bool bAllocWorkspace = false; - int toDo = n; - while (toDo > 0) - { - int batchSize = min(toDo, MAX_BATCH_SIZE); // Takes at most MAX_BATCH_SIZE vectors at a time - - MLCommon::Distance::distance - (&X[(n - toDo) * m], X, - d_pdist_tmp, - batchSize, n, m, - (void*)nullptr, workspaceSize, - stream - ); - CUDA_CHECK(cudaPeekAtLastError()); - - sortColumnsPerRow(d_pdist_tmp, d_ind_X_tmp, - batchSize, n, - bAllocWorkspace, NULL, workspaceSize, - stream); - CUDA_CHECK(cudaPeekAtLastError()); - - updateHost(&h_ind_X[(n - toDo) * n], d_ind_X_tmp, batchSize * n, stream); - - toDo -= batchSize; - } + math_t* d_pdist_tmp; + allocate(d_pdist_tmp, TMP_SIZE); + int* d_ind_X_tmp; + allocate(d_ind_X_tmp, TMP_SIZE); + int* h_ind_X = (int*)alloc->allocate(n * n * sizeof(int), stream); - long* ind_X_embedded = get_knn(X_embedded, n, d, n_neighbors + 1); + int toDo = n; + while (toDo > 0) + { + int batchSize = min(toDo, MAX_BATCH_SIZE); + // Takes at most MAX_BATCH_SIZE vectors at a time + + MLCommon::Distance::distance + (&X[(n - toDo) * m], X, + d_pdist_tmp, + batchSize, n, m, + (void*)nullptr, workspaceSize, + stream + ); + CUDA_CHECK(cudaPeekAtLastError()); + + sortColumnsPerRow(d_pdist_tmp, d_ind_X_tmp, + batchSize, n, + bAllocWorkspace, NULL, workspaceSize, + stream); + CUDA_CHECK(cudaPeekAtLastError()); + + updateHost(&h_ind_X[(n - toDo) * n], d_ind_X_tmp, + batchSize * n, stream); + + toDo -= batchSize; + } - double t = 0.0; - for (size_t i = 0; i < n; i++) - { - int* sample_i = &h_ind_X[i * n + 1]; - for (size_t j = 1; j <= n_neighbors; j++) + long* ind_X_embedded = get_knn_indexes(h, X_embedded, + n, d, n_neighbors + 1); + + double t = 0.0; + for (size_t i = 0; i < n; i++) { - long idx = ind_X_embedded[i * (n_neighbors+1) + j]; - for (int r = 0; r < n-1; r++) + int* sample_i = &h_ind_X[i * n]; + for (size_t j = 1; j <= n_neighbors; j++) { - if (sample_i[r] == idx) + long idx = ind_X_embedded[i * 
(n_neighbors+1) + j]; + for (int r = 1; r < n; r++) { - t += max(0.0, double(r - n_neighbors)); - break; + if (sample_i[r] == idx) + { + int tmp = r - n_neighbors; + if (tmp > 0) + t += tmp; + break; + } } } } - } - - delete[] h_ind_X; - delete[] ind_X_embedded; - t = 1.0 - ((2.0 / ((n * n_neighbors) * ((2.0 * n) - (3.0 * n_neighbors) - 1.0))) * t); + alloc->deallocate(h_ind_X, n * n * sizeof(int), stream); + alloc->deallocate(ind_X_embedded, n * (n_neighbors + 1) * sizeof(long), stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + t = 1.0 - ((2.0 / ((n * n_neighbors) * ((2.0 * n) - (3.0 * n_neighbors) - 1.0))) * t); - return t; - } + return t; + } - template double cuml_trustworthiness(float* X, float* X_embedded, int n, int m, int d, int n_neighbors); - //template double cuml_trustworthiness(double* X, double* X_embedded, int n, int m, int d, int n_neighbors); - // Disabled for now as knn only takes floats + template double trustworthiness_score(const cumlHandle& h, + float* X, float* X_embedded, int n, int m, int d, int n_neighbors); + // template double trustworthiness_score(const cumlHandle& h, + // double* X, double* X_embedded, int n, int m, int d, int n_neighbors); + // Disabled for now as knn only takes floats + } } \ No newline at end of file diff --git a/cuML/src/metrics/trustworthiness.h b/cuML/src/metrics/trustworthiness.h index aaad5eae2a..c909f5f1b9 100644 --- a/cuML/src/metrics/trustworthiness.h +++ b/cuML/src/metrics/trustworthiness.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2018-2019, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,11 +16,15 @@ #pragma once +#include + #define MAX_BATCH_SIZE 512 namespace ML { + namespace Metrics { - template - double cuml_trustworthiness(math_t* X, math_t* X_embedded, int n, int m, int d, int n_neighbors); + template + double trustworthiness_score(const cumlHandle& h, math_t* X, math_t* X_embedded, int n, int m, int d, int n_neighbors); + } } \ No newline at end of file diff --git a/python/cuml/metrics/__init__.py b/python/cuml/metrics/__init__.py index 98d6a0fd8e..0304c9c6b6 100644 --- a/python/cuml/metrics/__init__.py +++ b/python/cuml/metrics/__init__.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/cuml/metrics/trustworthiness.pyx b/python/cuml/metrics/trustworthiness.pyx index 46023f07cd..9d3f72a58e 100644 --- a/python/cuml/metrics/trustworthiness.pyx +++ b/python/cuml/metrics/trustworthiness.pyx @@ -1,5 +1,5 @@ # -# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,49 +25,82 @@ import numpy as np from numba import cuda from libc.stdint cimport uintptr_t - -cdef extern from "metrics/trustworthiness.h" namespace "ML": - cdef double cuml_trustworthiness[T](T* X, T* X_embedded, int n, int m, int d, int n_neighbors) - - -""" -Expresses to what extent the local structure is retained in embedding. -The score is defined in the range [0, 1]. 
- -Parameters ----------- - X : cuDF DataFrame or Numpy array (n_samples, n_features) - Data in original dimension - - X : cuDF DataFrame or Numpy array (n_samples, n_components) - Data in target dimension (embedding) - - n_neighbors : int, optional (default: 5) - Number of neighbors considered - -Returns -------- - trustworthiness score : double - Trustworthiness of the low-dimensional embedding -""" -def trustworthiness(X, X_embedded, n_neighbors=5): - n, m = X.shape - d = X_embedded.shape[1] - - if X.dtype != X_embedded.dtype: +from cuml.common.handle cimport cumlHandle + +cdef extern from "metrics/trustworthiness.h" namespace "ML::Metrics": + cdef double trustworthiness_score[T](const cumlHandle& h, T* X, + T* X_embedded, int n, int m, int d, int n_neighbors) + +def trustworthiness(X, X_embedded, handle=None, n_neighbors=5): + """ + Expresses to what extent the local structure is retained in embedding. + The score is defined in the range [0, 1]. + + Parameters + ---------- + X : cuDF DataFrame or Numpy array (n_samples, n_features) + Data in original dimension + + X_embedded : cuDF DataFrame or Numpy array (n_samples, n_components) + Data in target dimension (embedding) + + n_neighbors : int, optional (default: 5) + Number of neighbors considered + + Returns + ------- + trustworthiness score : double + Trustworthiness of the low-dimensional embedding + """ + + if isinstance(X, cudf.DataFrame) and isinstance(X_embedded, cudf.DataFrame): + datatype1 = np.dtype(X[X.columns[0]]._column.dtype) + datatype2 = np.dtype(X_embedded[X_embedded.columns[0]]._column.dtype) + n_samples = len(X) + n_features = len(X._cols) + n_components = len(X_embedded._cols) + elif isinstance(X, np.ndarray) and isinstance(X_embedded, np.ndarray): + datatype1 = X.dtype + datatype2 = X_embedded.dtype + n_samples, n_features = X.shape + n_components = X_embedded.shape[1] + else: + raise TypeError("X and X_embedded parameters must both be cuDF Dataframes or Numpy ndarray") + + if datatype1 != 
datatype2: raise TypeError("X and X_embedded parameters must be of same type") - if X.dtype != np.float32 or X_embedded.dtype != np.float32: # currently only float32 is available + if datatype1 != np.float32 or datatype2 != np.float32: # currently only float32 is available return TypeError("X and X_embedded parameters must be of type float32") - cdef uintptr_t d_X = get_ctype_ptr(cuda.to_device(X)) - cdef uintptr_t d_X_embedded = get_ctype_ptr(cuda.to_device(X_embedded)) - - if X.dtype == np.float32: - return cuml_trustworthiness[float](d_X, d_X_embedded, n, m, d, n_neighbors) + if isinstance(X, cudf.DataFrame): + d_X = X.as_gpu_matrix(order='C') + d_X_embedded = X_embedded.as_gpu_matrix(order='C') + elif isinstance(X, np.ndarray): + d_X = cuda.to_device(X) + d_X_embedded = cuda.to_device(X_embedded) + + cdef uintptr_t d_X_ptr = get_ctype_ptr(d_X) + cdef uintptr_t d_X_embedded_ptr = get_ctype_ptr(d_X_embedded) + + cdef cumlHandle* handle_ = 0 + if handle is None: + handle_ = (new cumlHandle()) + else: + handle_ = handle.getHandle() + + if datatype1 == np.float32: + res = trustworthiness_score[float](handle_[0], d_X_ptr, + d_X_embedded_ptr, n_samples, n_features, + n_components, n_neighbors) #else: - # return cuml_trustworthiness(d_X, d_X_embedded, n, m, d, n_neighbors) + # res = trustworthiness_score[double](handle_[0], d_X_ptr, + # d_X_embedded_ptr, n_samples, n_features, + # n_components, n_neighbors) + if handle is None: + del handle_ + return res def get_ctype_ptr(obj): diff --git a/python/cuml/test/test_trustworthiness.py b/python/cuml/test/test_trustworthiness.py new file mode 100644 index 0000000000..7b001eeee5 --- /dev/null +++ b/python/cuml/test/test_trustworthiness.py @@ -0,0 +1,58 @@ +# Copyright (c) 2018-2019, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from sklearn.manifold.t_sne import trustworthiness as sklearn_trustworthiness +from cuml.metrics import trustworthiness as cuml_trustworthiness + +from sklearn.datasets.samples_generator import make_blobs +from cuml.manifold import UMAP + +import cudf +import numpy as np + + +@pytest.mark.parametrize('input_type', ['dataframe', 'ndarray']) +@pytest.mark.parametrize('n_samples', [1000, 2500]) +@pytest.mark.parametrize('n_features', [500, 1000]) +@pytest.mark.parametrize('n_components', [100, 200]) +def test_trustworthiness(input_type, n_samples, n_features, n_components): + centers = round(n_samples*0.4) + X, y = make_blobs(n_samples=n_samples, centers=centers, + n_features=n_features) + + X_embedded = UMAP(n_components=n_components).fit_transform(X) + X = X.astype(np.float32) + X_embedded = X_embedded.astype(np.float32) + + if input_type == 'dataframe': + gdf = cudf.DataFrame() + for i in range(X.shape[1]): + gdf[str(i)] = np.asarray(X[:, i], dtype=np.float32) + + gdf_embedded = cudf.DataFrame() + for i in range(X_embedded.shape[1]): + gdf_embedded[str(i)] = np.asarray(X_embedded[:, i], + dtype=np.float32) + + cu_score = cuml_trustworthiness(gdf, gdf_embedded) + else: + cu_score = cuml_trustworthiness(X, X_embedded) + + sk_score = sklearn_trustworthiness(X, X_embedded) + + eps = 0.0001 + assert (sk_score * (1 - eps) <= cu_score and + cu_score <= sk_score * (1 + eps)) + # assert cu_score == sk_score ideally From 2daa42c7194a545e57d81ebb846aa0b0f0f0d98c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 6 May 2019 16:20:03 -0400 Subject: [PATCH 019/156] Pulling the distance function from vertex degree into an epsilon_neighborhood prim --- cuML/src/dbscan/vertexdeg/algo.h | 52 +++++++++++++++++--------------- ml-prims/src/distance/distance.h | 38 +++++++++++++++++++++++ 2 files changed, 66 insertions(+), 24 deletions(-) diff --git a/cuML/src/dbscan/vertexdeg/algo.h b/cuML/src/dbscan/vertexdeg/algo.h index 33f7cb3a74..0c447f42f0 100644 --- a/cuML/src/dbscan/vertexdeg/algo.h +++ b/cuML/src/dbscan/vertexdeg/algo.h @@ -29,6 +29,17 @@ namespace VertexDeg { namespace Algo { +/** + * Calculates both the vertex degree array and the epsilon neighborhood in a single kernel. + * + * Proposed API for this should be an epsilon neighborhood primitive that accepts a lambda and + * executes the lambda with [n, acc, vertex]. + * + * template + * void epsilon_neighborhood(T *a, T *b, bool *adj, m, n, k, T eps, + * workspaceData, workspaceSize, fused_op, stream) + * + */ template void launcher(const ML::cumlHandle_impl& handle, Pack data, int startVertexId, int batchSize, cudaStream_t stream) { data.resetArray(stream, batchSize+1); @@ -39,48 +50,41 @@ void launcher(const ML::cumlHandle_impl& handle, Pack data, int startVe int n = min(data.N - startVertexId, batchSize); int k = data.D; - MLCommon::device_buffer workspace(handle.getDeviceAllocator(), stream); - size_t workspaceSize = 0; + int* vd = data.vd; + bool* adj = data.adj; value_t eps2 = data.eps * data.eps; - int* vd = data.vd; - bool* adj = data.adj; + MLCommon::device_buffer workspace(handle.getDeviceAllocator(), stream); + size_t workspaceSize = 0; + /** * Epilogue operator to fuse the construction of boolean eps neighborhood adjacency matrix, vertex degree array, * and the final distance matrix into a single kernel. 
*/ - auto dbscan_op = [n, eps2, vd, adj] __device__ - (value_t val, // current value in gemm matrix - int global_c_idx) { // index of output in global memory - int acc = val <= eps2; - int vd_offset = global_c_idx - (n * (global_c_idx / n)); // bucket offset for the vertex degrees - - atomicAdd(vd+vd_offset, acc); - atomicAdd(vd+n, acc); - return bool(acc); + auto vertex_degree_op = [vd, n] __device__ (int global_c_idx, bool in_neigh) { + int batch_vertex = global_c_idx - (n * (global_c_idx / n)); + + atomicAdd(vd+batch_vertex, in_neigh); + atomicAdd(vd+n, in_neigh); }; constexpr auto distance_type = MLCommon::Distance::DistanceType::EucUnexpandedL2; workspaceSize = MLCommon::Distance::getWorkspaceSize - (data.x, data.x+startVertexId*k, // x & y inputs - m, n, k // Cutlass block params - ); - - CUDA_CHECK(cudaPeekAtLastError()); + (data.x, data.x+startVertexId*k, m, n, k); - if (workspaceSize != 0) { + if (workspaceSize != 0) workspace.resize(workspaceSize, stream); - } - MLCommon::Distance::distance + MLCommon::Distance::epsilon_neighborhood (data.x, data.x+startVertexId*k, // x & y inputs adj, - m, n, k, // Cutlass block params - (void*)workspace.data(), workspaceSize, // workspace params - dbscan_op, // epilogue operator + m, n, k, + eps2, + (void*)workspace.data(), workspaceSize, // workspace params + vertex_degree_op, // epilogue operator stream // cuda stream ); diff --git a/ml-prims/src/distance/distance.h b/ml-prims/src/distance/distance.h index 6b6f531067..ec1e66fd94 100644 --- a/ml-prims/src/distance/distance.h +++ b/ml-prims/src/distance/distance.h @@ -210,5 +210,43 @@ void distance(InType *x, InType *y, OutType *dist, int m, int n, int k, x, y, dist, m, n, k, workspace, worksize, default_fin_op, stream); } + +/** + * @brief Constructs an epsilon neighborhood adjacency matrix by + * filtering the final distance by some epsilon. 
+ * @tparam distanceType: distance metric to compute between a and b matrices + * @tparam T: the type of input matrices a and b + * @param a: row-major input matrix a + * @param b: row-major input matrix b + * @param adj: a boolean output adjacency matrix + * @param m: number of points in a + * @param n: number of points in b + * @param k: dimensionality + * @param eps: the epsilon value to use as a filter for neighborhood construction. + * it is important to note that if the distance type returns a squared + * variant for efficiency, the epsilon will need to be squared as well. + * @param workspace: temporary workspace needed for computations + * @param worksize: number of bytes of the workspace + * @param fused_op: a 2-argument lambda function taking the output index into c + * and a boolean denoting whether or not the inputs are part of + * the epsilon neighborhood. + * + * @param stream cuda stream + */ +template +size_t epsilon_neighborhood(T *a, T *b, bool *adj, int m, int n, int k, T eps, + void *workspace, size_t worksize, Lambda fused_op, cudaStream_t stream) { + auto epsilon_op = [n, eps, fused_op] __device__ (T val, int global_c_idx) { + int acc = val <= eps; + fused_op(global_c_idx, acc); + return bool(acc); + }; + + distance + (a, b, adj, m, n, k, (void*)workspace, worksize, epsilon_op, stream); + + return worksize; +} + }; // end namespace Distance }; // end namespace MLCommon From e47188d1fde57b1959282b4d51c896e5752c4bc3 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 6 May 2019 22:01:08 -0400 Subject: [PATCH 020/156] Pulled adjacency/vertex degree and adjacency graph computations into sparse prims lib --- cuML/src/dbscan/adjgraph/algo.h | 48 ++++++++++++------------- cuML/src/dbscan/adjgraph/pack.h | 2 +- cuML/src/dbscan/adjgraph/runner.h | 2 +- cuML/src/dbscan/runner.h | 3 ++ cuML/src/dbscan/vertexdeg/algo.h | 36 +++++++++---------- cuML/src/dbscan/vertexdeg/runner.h | 1 + ml-prims/src/sparse/csr.h | 57 ++++++++++++++++++++++++++++++ 7 files changed, 101 insertions(+), 48 deletions(-) diff --git a/cuML/src/dbscan/adjgraph/algo.h b/cuML/src/dbscan/adjgraph/algo.h index 45a8aaf260..b648bcb797 100644 --- a/cuML/src/dbscan/adjgraph/algo.h +++ b/cuML/src/dbscan/adjgraph/algo.h @@ -24,6 +24,8 @@ #include #include +#include "sparse/csr.h" + using namespace thrust; namespace Dbscan { @@ -32,39 +34,33 @@ namespace Algo { using namespace MLCommon; -template -__global__ void adj_graph_kernel(Pack data, int batchSize) { - int row = blockIdx.x*TPB_X + threadIdx.x; - int N = data.N; - if(row < batchSize) { - int k = 0; - data.core_pts[row] = (data.vd[row] >= data.minPts); - Type scan_id = data.ex_scan[row]; - for(int i=0; i -void launcher(const ML::cumlHandle_impl& handle, Pack data, int batchSize, cudaStream_t stream) { - dim3 blocks(ceildiv(batchSize, TPB_X)); - dim3 threads(TPB_X); +void launcher(const ML::cumlHandle_impl& handle, Pack data, Type batchSize, cudaStream_t stream) { + device_ptr dev_vd = device_pointer_cast(data.vd); device_ptr dev_ex_scan = device_pointer_cast(data.ex_scan); ML::thrustAllocatorAdapter alloc( handle.getDeviceAllocator(), stream ); - auto execution_policy = thrust::cuda::par(alloc).on(stream); - exclusive_scan(execution_policy, dev_vd, dev_vd + batchSize, dev_ex_scan); - adj_graph_kernel<<>>(data, batchSize); + exclusive_scan(thrust::cuda::par(alloc).on(stream), + dev_vd, dev_vd + batchSize, dev_ex_scan); + + bool *core_pts = data.core_pts; + int minPts = data.minPts; + int *vd = data.vd; + 
+ MLCommon::Sparse::csr_adj_graph_batched(data.ex_scan, data.N, batchSize, + data.adj, data.adj_graph, + [core_pts, minPts, vd] __device__ (Type row, Type start_idx) { + core_pts[row] = (vd[row] >= minPts); + }, stream); + CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cuML/src/dbscan/adjgraph/pack.h b/cuML/src/dbscan/adjgraph/pack.h index 2a9a7380eb..cab9f07e3d 100644 --- a/cuML/src/dbscan/adjgraph/pack.h +++ b/cuML/src/dbscan/adjgraph/pack.h @@ -36,7 +36,7 @@ struct Pack { /** array to store whether a vertex is core poType or not */ bool *core_pts; /** number of poTypes in the dataset */ - Type N; + Type N; /** Minpts for classifying core pts */ Type minPts; }; diff --git a/cuML/src/dbscan/adjgraph/runner.h b/cuML/src/dbscan/adjgraph/runner.h index 0ce1044648..8c184a5061 100644 --- a/cuML/src/dbscan/adjgraph/runner.h +++ b/cuML/src/dbscan/adjgraph/runner.h @@ -26,7 +26,7 @@ namespace AdjGraph { template void run(const ML::cumlHandle_impl& handle, bool* adj, int* vd, Type* adj_graph, Type* ex_scan, Type N, - Type minpts, bool* core_pts, int algo, int batchSize, cudaStream_t stream) { + Type minpts, bool* core_pts, int algo, Type batchSize, cudaStream_t stream) { Pack data = {vd, adj, adj_graph, ex_scan, core_pts, N, minpts}; switch(algo) { case 0: diff --git a/cuML/src/dbscan/runner.h b/cuML/src/dbscan/runner.h index 24316073cb..b39482f2fa 100644 --- a/cuML/src/dbscan/runner.h +++ b/cuML/src/dbscan/runner.h @@ -108,6 +108,9 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Type N, Type D, Type_f AdjGraph::run(handle, adj, vd, adj_graph.data(), ex_scan, N, minPts, core_pts, algoAdj, nPoints, stream); + std::cout << MLCommon::arr2Str(adj, batchSize*N, "adj", stream) << std::endl; + std::cout << MLCommon::arr2Str(adj_graph.data(), adjlen, "adj_graph", stream) << std::endl; + // Running Labelling Label::run(handle, adj, vd, adj_graph.data(), ex_scan, N, minPts, core_pts, visited, labels, xa, fa, m, map_id, algoCcl, startVertexId, diff --git 
a/cuML/src/dbscan/vertexdeg/algo.h b/cuML/src/dbscan/vertexdeg/algo.h index 0c447f42f0..3bca10c213 100644 --- a/cuML/src/dbscan/vertexdeg/algo.h +++ b/cuML/src/dbscan/vertexdeg/algo.h @@ -15,6 +15,7 @@ */ #pragma once + #include "cuda_runtime.h" #include "distance/distance.h" #include @@ -51,7 +52,7 @@ void launcher(const ML::cumlHandle_impl& handle, Pack data, int startVe int k = data.D; int* vd = data.vd; - bool* adj = data.adj; +// bool* adj = data.adj; value_t eps2 = data.eps * data.eps; @@ -63,13 +64,6 @@ void launcher(const ML::cumlHandle_impl& handle, Pack data, int startVe * Epilogue operator to fuse the construction of boolean eps neighborhood adjacency matrix, vertex degree array, * and the final distance matrix into a single kernel. */ - auto vertex_degree_op = [vd, n] __device__ (int global_c_idx, bool in_neigh) { - int batch_vertex = global_c_idx - (n * (global_c_idx / n)); - - atomicAdd(vd+batch_vertex, in_neigh); - atomicAdd(vd+n, in_neigh); - }; - constexpr auto distance_type = MLCommon::Distance::DistanceType::EucUnexpandedL2; workspaceSize = MLCommon::Distance::getWorkspaceSize @@ -79,21 +73,23 @@ void launcher(const ML::cumlHandle_impl& handle, Pack data, int startVe workspace.resize(workspaceSize, stream); MLCommon::Distance::epsilon_neighborhood - (data.x, data.x+startVertexId*k, // x & y inputs - adj, - m, n, k, - eps2, - (void*)workspace.data(), workspaceSize, // workspace params - vertex_degree_op, // epilogue operator - stream // cuda stream - ); + (data.x, data.x+startVertexId*k, // x & y inputs + data.adj, + m, n, k, + eps2, + (void*)workspace.data(), workspaceSize, // workspace params + [vd, n] __device__ (int global_c_idx, bool in_neigh) { + int batch_vertex = global_c_idx - (n * (global_c_idx / n)); + atomicAdd(vd+batch_vertex, in_neigh); + atomicAdd(vd+n, in_neigh); + }, + stream // cuda stream + ); CUDA_CHECK(cudaPeekAtLastError()); -} - - - + std::cout << MLCommon::arr2Str(vd, batchSize, "vd", stream) << std::endl; +} } // end 
namespace Algo6 } // end namespace VertexDeg }; // end namespace Dbscan diff --git a/cuML/src/dbscan/vertexdeg/runner.h b/cuML/src/dbscan/vertexdeg/runner.h index 1951f0aab6..5360f8092c 100644 --- a/cuML/src/dbscan/vertexdeg/runner.h +++ b/cuML/src/dbscan/vertexdeg/runner.h @@ -15,6 +15,7 @@ */ #pragma once + #include "naive.h" #include "pack.h" #include "algo.h" diff --git a/ml-prims/src/sparse/csr.h b/ml-prims/src/sparse/csr.h index c8057d94f2..3e1a6a126f 100644 --- a/ml-prims/src/sparse/csr.h +++ b/ml-prims/src/sparse/csr.h @@ -558,5 +558,62 @@ void csr_add_finalize( ); CUDA_CHECK(cudaPeekAtLastError()); } + +template +__global__ void csr_row_op_batched_kernel(T *row_ind, T total_rows, + T batchSize, Lambda op) { + T row = blockIdx.x*TPB_X + threadIdx.x; + if(row < batchSize) { + T start_idx = row_ind[row]; + op(row, start_idx); + } +} + +/** + * Performs a batched row operation on the rows of a CSR matrix. + */ +template +void csr_row_op_batched(T *row_ind, T total_rows, T batchSize, + Lambda op, cudaStream_t stream) { + + dim3 grid(MLCommon::ceildiv(batchSize, TPB_X), 1, 1); + dim3 blk(TPB_X, 1, 1); + + csr_row_op_batched_kernel<<>> + (row_ind, total_rows, batchSize, op); +} + +template +void csr_row_op(T *row_ind, T n_rows, Lambda op, cudaStream_t stream) { + csr_row_op_batched(row_ind, n_rows, n_rows, op, stream); +} + +template +void csr_adj_graph_batched(T *row_ind, T total_rows, T batchSize, + bool *adj, T *row_ind_ptr, Lambda fused_op, cudaStream_t stream) { + + csr_row_op_batched(row_ind, total_rows, batchSize, + [fused_op, adj, total_rows, row_ind_ptr, batchSize] __device__ + (T row, T start_idx) { + + fused_op(row, start_idx); + int k = 0; + for(T i=0; i +void csr_adj_graph(T *row_ind, T n_rows, + bool *adj, T *row_ind_ptr, cudaStream_t stream) { + csr_adj_graph_batched(row_ind, n_rows, n_rows, adj, row_ind_ptr, stream); +} + }; }; From 5b159d2447c2fa167f0147c29baee4773e08e36c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 6 May 2019 22:06:26 -0400 Subject: [PATCH 021/156] Cleaning up --- cuML/src/dbscan/adjgraph/algo.h | 1 + cuML/src/dbscan/vertexdeg/algo.h | 19 +++++-------------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/cuML/src/dbscan/adjgraph/algo.h b/cuML/src/dbscan/adjgraph/algo.h index b648bcb797..e71697b188 100644 --- a/cuML/src/dbscan/adjgraph/algo.h +++ b/cuML/src/dbscan/adjgraph/algo.h @@ -58,6 +58,7 @@ void launcher(const ML::cumlHandle_impl& handle, Pack data, Type batchSize MLCommon::Sparse::csr_adj_graph_batched(data.ex_scan, data.N, batchSize, data.adj, data.adj_graph, [core_pts, minPts, vd] __device__ (Type row, Type start_idx) { + // fuse the operation of core points construction core_pts[row] = (vd[row] >= minPts); }, stream); diff --git a/cuML/src/dbscan/vertexdeg/algo.h b/cuML/src/dbscan/vertexdeg/algo.h index 3bca10c213..20468b61fb 100644 --- a/cuML/src/dbscan/vertexdeg/algo.h +++ b/cuML/src/dbscan/vertexdeg/algo.h @@ -52,18 +52,12 @@ void launcher(const ML::cumlHandle_impl& handle, Pack data, int startVe int k = data.D; int* vd = data.vd; -// bool* adj = data.adj; value_t eps2 = data.eps * data.eps; MLCommon::device_buffer workspace(handle.getDeviceAllocator(), stream); size_t workspaceSize = 0; - - /** - * Epilogue operator to fuse the construction of boolean eps neighborhood adjacency matrix, vertex degree array, - * and the final distance matrix into a single kernel. 
- */ constexpr auto distance_type = MLCommon::Distance::DistanceType::EucUnexpandedL2; workspaceSize = MLCommon::Distance::getWorkspaceSize @@ -73,22 +67,19 @@ void launcher(const ML::cumlHandle_impl& handle, Pack data, int startVe workspace.resize(workspaceSize, stream); MLCommon::Distance::epsilon_neighborhood - (data.x, data.x+startVertexId*k, // x & y inputs - data.adj, - m, n, k, - eps2, - (void*)workspace.data(), workspaceSize, // workspace params + (data.x, data.x+startVertexId*k, data.adj, m, n, k, eps2, + (void*)workspace.data(), workspaceSize, + [vd, n] __device__ (int global_c_idx, bool in_neigh) { + // fused construction of vertex degree int batch_vertex = global_c_idx - (n * (global_c_idx / n)); atomicAdd(vd+batch_vertex, in_neigh); atomicAdd(vd+n, in_neigh); }, - stream // cuda stream + stream ); CUDA_CHECK(cudaPeekAtLastError()); - - std::cout << MLCommon::arr2Str(vd, batchSize, "vd", stream) << std::endl; } } // end namespace Algo6 } // end namespace VertexDeg From d1b1f155a181705b7a7992fa878eb2078ee5adce Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 7 May 2019 08:31:53 -0400 Subject: [PATCH 022/156] Commening out labeling for now --- cuML/src/dbscan/labelling/algo2.h | 2 +- ml-prims/src/sparse/csr.h | 29 ++++++++++++++++++++++++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/cuML/src/dbscan/labelling/algo2.h b/cuML/src/dbscan/labelling/algo2.h index d1c3a7384e..54c3f73714 100644 --- a/cuML/src/dbscan/labelling/algo2.h +++ b/cuML/src/dbscan/labelling/algo2.h @@ -55,7 +55,7 @@ __global__ void label_device(Pack data, int startVertexId, int batchSize) Type ci, cj; bool ci_mod = false; ci = data.db_cluster[tid + startVertexId]; - for(int j=0; j< int(data.vd[tid]); j++) { + for(int j=0; j< int(data.vd[tid]); j++) { // TODO: Can't this be calculated from the ex_scan? 
cj = data.db_cluster[data.adj_graph[start + j]]; if(ci(row_ind, n_rows, n_rows, adj, row_ind_ptr, stream); } - +// +//template +//class WeaklyCCState { +// protected: +// bool *xa; +// bool *fa; +// bool *m; +// T *map_id; +// +// public: +// WeaklyCCState(T n) { +// // allocate +// } +// +// ~WeaklyCCState() { +// // free +// } +//}; +// +// +//template +//void weakly_cc_batched( +// Type *labels, Type *row_ind, Type *row_ind_ptr, Type N, +// Type startVertexId, Type batchSize, Lambda filter_op, +// WeaklyCCState *state, cudaStream_t stream) { +// +// +//} }; }; From a14018fe51af6e838729ff1c936a66a2b28888c3 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 7 May 2019 13:21:14 -0400 Subject: [PATCH 023/156] Beginning to add weak cc --- ml-prims/src/sparse/csr.h | 111 ++++++++++++++++++++++++++++---------- 1 file changed, 83 insertions(+), 28 deletions(-) diff --git a/ml-prims/src/sparse/csr.h b/ml-prims/src/sparse/csr.h index ed97ca18af..8cb10b4aa6 100644 --- a/ml-prims/src/sparse/csr.h +++ b/ml-prims/src/sparse/csr.h @@ -614,33 +614,88 @@ void csr_adj_graph(T *row_ind, T n_rows, bool *adj, T *row_ind_ptr, cudaStream_t stream) { csr_adj_graph_batched(row_ind, n_rows, n_rows, adj, row_ind_ptr, stream); } -// -//template -//class WeaklyCCState { -// protected: -// bool *xa; -// bool *fa; -// bool *m; -// T *map_id; -// -// public: -// WeaklyCCState(T n) { -// // allocate -// } -// -// ~WeaklyCCState() { -// // free -// } -//}; -// -// -//template -//void weakly_cc_batched( -// Type *labels, Type *row_ind, Type *row_ind_ptr, Type N, -// Type startVertexId, Type batchSize, Lambda filter_op, -// WeaklyCCState *state, cudaStream_t stream) { -// -// -//} + +template +class WeaklyCCState { + protected: + bool *xa; + bool *fa; + bool *m; + T *map_id; + + public: + WeaklyCCState(T n) { + // allocate + } + + ~WeaklyCCState() { + // free + } +}; + + +template +__global__ void weak_cc_init_label_kernel(Pack data, int startVertexId, int batchSize, Type MAX_LABEL) { + 
/** F1 and F2 in the paper correspond to fa and xa */ + /** Cd in paper corresponds to db_cluster */ + int tid = threadIdx.x + blockIdx.x*TPB_X; + if(tid +__global__ void weak_cc_init_all_kernel(Type *labels, bool *fa, bool *xa, Type MAX_LABEL) { + int tid = threadIdx.x + blockIdx.x*TPB_X; + if(tid +void weak_cc_label_batched(const ML::cumlHandle_impl& handle, WeaklyCCState *state, int startVertexId, int batchSize, cudaStream_t stream) { + size_t N = data.N; + bool host_m; + MLCommon::host_buffer host_fa(handle.getHostAllocator(), stream, N); + MLCommon::host_buffer host_xa(handle.getHostAllocator(), stream, N); + + dim3 blocks(ceildiv(batchSize, TPB_X)); + dim3 threads(TPB_X); + Type MAX_LABEL = std::numeric_limits::max(); + + weak_cc_init_label_kernel<<>>(data, startVertexId, batchSize, MAX_LABEL); + do { + CUDA_CHECK( cudaMemsetAsync(data.m, false, sizeof(bool), stream) ); + label_device<<>>(data, startVertexId, batchSize); + //** swapping F1 and F2 + MLCommon::updateHost(host_fa.data(), data.fa, N, stream); + MLCommon::updateHost(host_xa.data(), data.xa, N, stream); + MLCommon::updateDevice(data.fa, host_xa.data(), N, stream); + MLCommon::updateDevice(data.xa, host_fa.data(), N, stream); + //** Updating m * + MLCommon::updateHost(&host_m, data.m, 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } while(host_m); +} + +template +void weak_cc_batched( + Type *labels, Type *row_ind, Type *row_ind_ptr, Type N, + Type startVertexId, Type batchSize, Lambda filter_op, + WeaklyCCState *state, cudaStream_t stream) { + + dim3 blocks(ceildiv(N, TPB_X)); + dim3 threads(TPB_X); + + Type MAX_LABEL = std::numeric_limits::max(); + if(startVertexId == 0) + weak_cc_init_all_kernel<<>> + (labels, state->fa, state->xa, MAX_LABEL); + label(handle, data, startVertexId, batchSize, stream); +} }; }; From c9fc27101885a4f2c3f5261b8940b2a79c4320dd Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 7 May 2019 13:55:31 -0400 Subject: [PATCH 024/156] Integrating final labeling for weakly cc into sparse csr prims --- ml-prims/src/sparse/csr.h | 92 +++++++++++++++++++++++++++++++++------ 1 file changed, 78 insertions(+), 14 deletions(-) diff --git a/ml-prims/src/sparse/csr.h b/ml-prims/src/sparse/csr.h index 8cb10b4aa6..384433a467 100644 --- a/ml-prims/src/sparse/csr.h +++ b/ml-prims/src/sparse/csr.h @@ -633,16 +633,49 @@ class WeaklyCCState { } }; - template -__global__ void weak_cc_init_label_kernel(Pack data, int startVertexId, int batchSize, Type MAX_LABEL) { +__global__ void weak_cc_label_device(Type *labels, + Type *row_ind, Type *row_ind_ptr, Type *vd, + bool *fa, bool *xa, bool *m, + int startVertexId, int batchSize) { + int tid = threadIdx.x + blockIdx.x*TPB_X; + if(tidcj) { + ci = cj; + ci_mod = true; + } + } + if(ci_mod) { + atomicMin(labels + startVertexId + tid, ci); + xa[startVertexId + tid] = true; + m[0] = true; + } + } + } +} + + +template +__global__ void weak_cc_init_label_kernel(Type *labels, int startVertexId, int batchSize, Type MAX_LABEL, Lambda filter_op) { /** F1 and F2 in the paper correspond to fa and xa */ /** Cd in paper corresponds to db_cluster */ int tid = threadIdx.x + blockIdx.x*TPB_X; if(tid -void weak_cc_label_batched(const ML::cumlHandle_impl& handle, WeaklyCCState *state, int startVertexId, int batchSize, cudaStream_t stream) { +template +void weak_cc_label_batched(const ML::cumlHandle_impl& handle, Type *labels, + Type *row_ind, Type *row_ind_ptr, Type *vd, + WeaklyCCState *state, + int startVertexId, int batchSize, cudaStream_t stream, Lambda filter_op) { size_t N = data.N; bool host_m; MLCommon::host_buffer host_fa(handle.getHostAllocator(), stream, N); @@ -667,17 +703,21 @@ void weak_cc_label_batched(const ML::cumlHandle_impl& handle, WeaklyCCState *sta dim3 threads(TPB_X); Type MAX_LABEL = std::numeric_limits::max(); - weak_cc_init_label_kernel<<>>(data, startVertexId, batchSize, MAX_LABEL); + 
weak_cc_init_label_kernel<<>>(labels, + startVertexId, batchSize, MAX_LABEL, filter_op); do { - CUDA_CHECK( cudaMemsetAsync(data.m, false, sizeof(bool), stream) ); - label_device<<>>(data, startVertexId, batchSize); + CUDA_CHECK( cudaMemsetAsync(state->m, false, sizeof(bool), stream) ); + weak_cc_label_device<<>>(labels, + row_ind, row_ind_ptr, vs, + state->fa, state->xa, state->m, + startVertexId, batchSize); //** swapping F1 and F2 - MLCommon::updateHost(host_fa.data(), data.fa, N, stream); - MLCommon::updateHost(host_xa.data(), data.xa, N, stream); + MLCommon::updateHost(host_fa.data(), state->fa, N, stream); + MLCommon::updateHost(host_xa.data(), state->xa, N, stream); MLCommon::updateDevice(data.fa, host_xa.data(), N, stream); MLCommon::updateDevice(data.xa, host_fa.data(), N, stream); //** Updating m * - MLCommon::updateHost(&host_m, data.m, 1, stream); + MLCommon::updateHost(&host_m, state->m, 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); } while(host_m); } @@ -695,7 +735,31 @@ void weak_cc_batched( if(startVertexId == 0) weak_cc_init_all_kernel<<>> (labels, state->fa, state->xa, MAX_LABEL); - label(handle, data, startVertexId, batchSize, stream); + weak_cc_label_batched(handle, data, startVertexId, batchSize, filter_op, stream); } + +template +void weak_cc_finalize_labels(const ML::cumlHandle_impl& handle, Type *labels, + WeaklyCCState *state, size_t N, cudaStream_t stream) { + dim3 blocks(ceildiv(data.N, TPB_X)); + dim3 threads(TPB_X); + Type MAX_LABEL = std::numeric_limits::max(); + MLCommon::host_buffer host_db_cluster(handle.getHostAllocator(), stream, N); + MLCommon::host_buffer host_map_id(handle.getHostAllocator(), stream, N); + + memset(host_map_id.data(), 0, N*sizeof(Type)); + MLCommon::updateHost(host_db_cluster.data(), labels, N, stream); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + sort(host, host_db_cluster.data(), host_db_cluster.data() + N); + Type *uid = unique(host, host_db_cluster.data(), host_db_cluster.data() + N, 
equal_to()); + Type num_clusters = uid - host_db_cluster.data(); + for(int i=0; imap_id, host_map_id.data(), N, stream); + map_label<<>>(data, MAX_LABEL); +} + }; }; From eec07b58d5b86e7de7298350ee7bd24815777b11 Mon Sep 17 00:00:00 2001 From: Chirayu Date: Tue, 7 May 2019 15:39:10 -0700 Subject: [PATCH 025/156] Minor changes for anticipated review comments --- ml-prims/src/metrics/contingencyMatrix.h | 11 ++++------- ml-prims/test/CMakeLists.txt | 2 +- ml-prims/test/contingencyMatrix.cu | 12 ++++++------ 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/ml-prims/src/metrics/contingencyMatrix.h b/ml-prims/src/metrics/contingencyMatrix.h index 19387db429..5c08e62c78 100644 --- a/ml-prims/src/metrics/contingencyMatrix.h +++ b/ml-prims/src/metrics/contingencyMatrix.h @@ -22,9 +22,6 @@ #include #include -#define ALIGN_BYTE 256 -#define ALIGN_MEMORY(x) (x + ALIGN_BYTE - 1) & ~(ALIGN_BYTE - 1) - namespace MLCommon { namespace Metrics { @@ -117,7 +114,7 @@ void contingencyMatrixWSort(T *groundTruth, T *predictedLabel, int nSamples, size_t workspaceSize, cudaStream_t stream) { T *outKeys = reinterpret_cast(workspace); - size_t alignedBufferSz = ALIGN_MEMORY((size_t)nSamples * sizeof(T)); + size_t alignedBufferSz = alignTo((size_t)nSamples * sizeof(T), (size_t)256); T *outValue = reinterpret_cast((size_t)workspace + alignedBufferSz); void *pWorkspaceCub = reinterpret_cast((size_t)workspace + 2*alignedBufferSz); @@ -206,7 +203,7 @@ size_t getCMatrixWorkspaceSize(int nSamples, T* groundTruth, cudaStream_t stream ContingencyMatrixImplType implVersion = getImplVersion(outDimN); if (implVersion == SORT_AND_GATOMICS) { - void *pWorkspaceCub = NULL; + void *pWorkspaceCub = nullptr; size_t tmpStorageBytes = 0; // bunch of no-op pointers to get workspace size T *pTmpKey, *pTmpValue, *pTmpKeyOut, *pTmpValueOut; @@ -214,7 +211,7 @@ size_t getCMatrixWorkspaceSize(int nSamples, T* groundTruth, cudaStream_t stream CUDA_CHECK(cub::DeviceRadixSort::SortPairs(pWorkspaceCub, 
tmpStorageBytes, pTmpKey, pTmpValue, pTmpKeyOut, pTmpValueOut, nSamples)); - size_t tmpStagingMemorySize = ALIGN_MEMORY(nSamples * sizeof(T)); + size_t tmpStagingMemorySize = alignTo(nSamples * sizeof(T), (size_t)256); tmpStagingMemorySize *= 2; workspaceSize = tmpStagingMemorySize + tmpStorageBytes; } @@ -237,7 +234,7 @@ size_t getCMatrixWorkspaceSize(int nSamples, T* groundTruth, cudaStream_t stream */ template void contingencyMatrix(T *groundTruth, T *predictedLabel, int nSamples, int *outMat, - cudaStream_t stream, void *workspace=NULL, size_t workspaceSize=0, + cudaStream_t stream, void *workspace=nullptr, size_t workspaceSize=0, T minLabel=std::numeric_limits::max(), T maxLabel=std::numeric_limits::max()) { // assumptions: diff --git a/ml-prims/test/CMakeLists.txt b/ml-prims/test/CMakeLists.txt index d69cf482ca..a903f34f1b 100644 --- a/ml-prims/test/CMakeLists.txt +++ b/ml-prims/test/CMakeLists.txt @@ -26,6 +26,7 @@ add_executable(mlcommon_test ternary_op.cu coalesced_reduction.cu cuda_utils.cu + contingencyMatrix.cu coo.cu cov.cu csr.cu @@ -75,7 +76,6 @@ add_executable(mlcommon_test penalty.cu sigmoid.cu weighted_mean.cu - contingencyMatrix.cu ) target_link_libraries(mlcommon_test diff --git a/ml-prims/test/contingencyMatrix.cu b/ml-prims/test/contingencyMatrix.cu index 39e8b0b853..41ffffc932 100644 --- a/ml-prims/test/contingencyMatrix.cu +++ b/ml-prims/test/contingencyMatrix.cu @@ -109,12 +109,12 @@ protected: contingencyMatrixParam params; int numUniqueClasses = -1; - T* dY=NULL; - T* dYHat=NULL; - int *dComputedOutput = NULL; - int *dGoldenOutput = NULL; - int *hGoldenOutput = NULL; - char *pWorkspace = NULL; + T* dY=nullptr; + T* dYHat=nullptr; + int *dComputedOutput = nullptr; + int *dGoldenOutput = nullptr; + int *hGoldenOutput = nullptr; + char *pWorkspace = nullptr; cudaStream_t stream; }; From 17d3ba5f7a0d313616b36f21db1416935b5d13ce Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Tue, 7 May 2019 19:04:15 -0400 Subject: [PATCH 026/156] Added 
cumlhandle to coordinate descent and stochastic gradient descent --- cuML/src/solver/cd.h | 72 +++-- cuML/src/solver/sgd.h | 32 +-- cuML/src/solver/solver.cu | 304 +++++++-------------- cuML/src/solver/{solver_c.h => solver.hpp} | 36 ++- cuML/test/cd_test.cu | 27 +- cuML/test/sgd.cu | 67 ++--- python/cuml/solvers/cd.pyx | 119 ++++---- python/cuml/solvers/sgd.pyx | 151 +++++----- 8 files changed, 349 insertions(+), 459 deletions(-) rename cuML/src/solver/{solver_c.h => solver.hpp} (73%) diff --git a/cuML/src/solver/cd.h b/cuML/src/solver/cd.h index a0d6d2428d..c9d50d5477 100644 --- a/cuML/src/solver/cd.h +++ b/cuML/src/solver/cd.h @@ -75,25 +75,12 @@ using namespace MLCommon; * cublas handle * @param cusolver_handle * cusolver handle -*/ + */ template -void cdFit(math_t *input, - int n_rows, - int n_cols, - math_t *labels, - math_t *coef, - math_t *intercept, - bool fit_intercept, - bool normalize, - int epochs, - ML::loss_funct loss, - math_t alpha, - math_t l1_ratio, - bool shuffle, - math_t tol, - cudaStream_t stream, - cublasHandle_t cublas_handle, - cusolverDnHandle_t cusolver_handle) { +void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, + math_t *labels, math_t *coef, math_t *intercept, bool fit_intercept, + bool normalize, int epochs, ML::loss_funct loss, math_t alpha, + math_t l1_ratio, bool shuffle, math_t tol, cudaStream_t stream) { ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); @@ -102,6 +89,9 @@ void cdFit(math_t *input, ASSERT(loss == ML::loss_funct::SQRD_LOSS, "Parameter loss: Only SQRT_LOSS function is supported for now"); + cublasHandle_t cublas_handle = handle.getCublasHandle(); + cusolverDnHandle_t cusolver_handle = handle.getcusolverDnHandle(); + math_t *mu_input = nullptr; math_t *mu_labels = nullptr; math_t *norm2_input = nullptr; @@ -110,6 +100,9 @@ void cdFit(math_t *input, math_t *squared = nullptr; math_t *loss_value = nullptr; + //auto allocator = handle.getDeviceAllocator(); 
+ //device_buffer components_all(allocator, stream, len); + allocate(loss_value, 1); allocate(pred, n_rows, true); allocate(residual, n_rows, true); @@ -124,13 +117,9 @@ void cdFit(math_t *input, allocate(norm2_input, n_cols); } - ///@todo: remove this cumlHandle and use the cumlHandle_impl - /// passed to this method instead!! - cumlHandle handle; - handle.setStream(stream); - GLM::preProcessData(handle.getImpl(), input, n_rows, n_cols, labels, intercept, - mu_input, mu_labels, norm2_input, fit_intercept, normalize, - stream); + GLM::preProcessData(handle, input, n_rows, n_cols, labels, + intercept, mu_input, mu_labels, norm2_input, fit_intercept, + normalize, stream); } std::vector ri(n_cols); @@ -166,15 +155,17 @@ void cdFit(math_t *input, math_t *squared_loc = squared + ci; math_t *input_col_loc = input + (ci * n_rows); - LinAlg::multiplyScalar(pred, input_col_loc, h_coef[ci], n_rows, stream); + LinAlg::multiplyScalar(pred, input_col_loc, h_coef[ci], n_rows, + stream); LinAlg::add(residual, residual, pred, n_rows, stream); LinAlg::gemm(input_col_loc, n_rows, 1, residual, coef_loc, 1, 1, - CUBLAS_OP_T, CUBLAS_OP_N, cublas_handle, stream); + CUBLAS_OP_T, CUBLAS_OP_N, cublas_handle, stream); if (l1_ratio > math_t(0.0)) Functions::softThres(coef_loc, coef_loc, alpha, 1, stream); - LinAlg::eltwiseDivideCheckZero(coef_loc, coef_loc, squared_loc, 1, stream); + LinAlg::eltwiseDivideCheckZero(coef_loc, coef_loc, squared_loc, 1, + stream); coef_prev = h_coef[ci]; updateHost(&(h_coef[ci]), coef_loc, 1, stream); @@ -186,7 +177,8 @@ void cdFit(math_t *input, if (abs(h_coef[ci]) > coef_max) coef_max = abs(h_coef[ci]); - LinAlg::multiplyScalar(pred, input_col_loc, h_coef[ci], n_rows, stream); + LinAlg::multiplyScalar(pred, input_col_loc, h_coef[ci], n_rows, + stream); LinAlg::subtract(residual, residual, pred, n_rows, stream); } @@ -205,13 +197,9 @@ void cdFit(math_t *input, } if (fit_intercept) { - ///@todo: remove this cumlHandle and use the cumlHandle_impl - /// passed to 
this method instead!! - cumlHandle handle; - handle.setStream(stream); - GLM::postProcessData(handle.getImpl(), input, n_rows, n_cols, labels, coef, - intercept, mu_input, mu_labels, norm2_input, - fit_intercept, normalize, stream); + GLM::postProcessData(handle, input, n_rows, n_cols, labels, + coef, intercept, mu_input, mu_labels, norm2_input, + fit_intercept, normalize, stream); if (mu_input != nullptr) CUDA_CHECK(cudaFree(mu_input)); @@ -259,11 +247,11 @@ void cdFit(math_t *input, * cuda stream * @param cublas_handle * cublas handle -*/ + */ template -void cdPredict(const math_t *input, int n_rows, int n_cols, const math_t *coef, - math_t intercept, math_t *preds, ML::loss_funct loss, cudaStream_t stream, - cublasHandle_t cublas_handle) { +void cdPredict(const cumlHandle_impl& handle, const math_t *input, int n_rows, + int n_cols, const math_t *coef, math_t intercept, math_t *preds, + ML::loss_funct loss, cudaStream_t stream) { ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); @@ -272,7 +260,9 @@ void cdPredict(const math_t *input, int n_rows, int n_cols, const math_t *coef, ASSERT(loss == ML::loss_funct::SQRD_LOSS, "Parameter loss: Only SQRT_LOSS function is supported for now"); - Functions::linearRegH(input, n_rows, n_cols, coef, preds, intercept, cublas_handle, stream); + cublasHandle_t cublas_handle = handle.getCublasHandle(); + Functions::linearRegH(input, n_rows, n_cols, coef, preds, intercept, + cublas_handle, stream); } diff --git a/cuML/src/solver/sgd.h b/cuML/src/solver/sgd.h index 97670556e1..2d5fea507c 100644 --- a/cuML/src/solver/sgd.h +++ b/cuML/src/solver/sgd.h @@ -35,7 +35,7 @@ #include #include #include "learning_rate.h" -#include "cuML.hpp" +#include "common/cumlHandle.hpp" namespace ML { namespace Solver { @@ -43,7 +43,8 @@ namespace Solver { using namespace MLCommon; template -void sgdFit(math_t *input, +void sgdFit(const cumlHandle_impl& handle, + math_t *input, int n_rows, int n_cols, math_t *labels, @@ 
-62,8 +63,6 @@ void sgdFit(math_t *input, bool shuffle, math_t tol, int n_iter_no_change, - cublasHandle_t cublas_handle, - cusolverDnHandle_t cusolver_handle, cudaStream_t stream) { ASSERT(n_cols > 0, @@ -75,14 +74,13 @@ void sgdFit(math_t *input, math_t *mu_labels = NULL; math_t *norm2_input = NULL; - ///@todo: the below line should go away once we expose - /// cumlHandle in the interface of sgd - cumlHandle handle; + cublasHandle_t cublas_handle = handle.getCublasHandle(); + if (fit_intercept) { allocate(mu_input, n_cols); allocate(mu_labels, 1); - GLM::preProcessData(handle.getImpl(), input, n_rows, n_cols, labels, intercept, mu_input, + GLM::preProcessData(handle, input, n_rows, n_cols, labels, intercept, mu_input, mu_labels, norm2_input, fit_intercept, false, stream); } @@ -208,9 +206,9 @@ void sgdFit(math_t *input, CUDA_CHECK(cudaFree(loss_value)); if (fit_intercept) { - GLM::postProcessData(handle.getImpl(), input, n_rows, n_cols, labels, coef, intercept, - mu_input, mu_labels, norm2_input, fit_intercept, false, - stream); + GLM::postProcessData(handle, input, n_rows, n_cols, labels, coef, intercept, + mu_input, mu_labels, norm2_input, fit_intercept, false, + stream); if (mu_input != NULL) CUDA_CHECK(cudaFree(mu_input)); @@ -223,8 +221,8 @@ void sgdFit(math_t *input, } template -void sgdPredict(const math_t *input, int n_rows, int n_cols, const math_t *coef, - math_t intercept, math_t *preds, ML::loss_funct loss, cublasHandle_t cublas_handle, +void sgdPredict(const cumlHandle_impl& handle, const math_t *input, int n_rows, + int n_cols, const math_t *coef, math_t intercept, math_t *preds, ML::loss_funct loss, cudaStream_t stream) { ASSERT(n_cols > 0, @@ -232,6 +230,8 @@ void sgdPredict(const math_t *input, int n_rows, int n_cols, const math_t *coef, ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two"); + cublasHandle_t cublas_handle = handle.getCublasHandle(); + if (loss == ML::loss_funct::SQRD_LOSS) { Functions::linearRegH(input, 
n_rows, n_cols, coef, preds, intercept, cublas_handle, stream); } else if (loss == ML::loss_funct::LOG) { @@ -242,10 +242,10 @@ void sgdPredict(const math_t *input, int n_rows, int n_cols, const math_t *coef, } template -void sgdPredictBinaryClass(const math_t *input, int n_rows, int n_cols, const math_t *coef, - math_t intercept, math_t *preds, ML::loss_funct loss, cublasHandle_t cublas_handle, cudaStream_t stream) { +void sgdPredictBinaryClass(const cumlHandle_impl& handle, const math_t *input, int n_rows, int n_cols, const math_t *coef, + math_t intercept, math_t *preds, ML::loss_funct loss, cudaStream_t stream) { - sgdPredict(input, n_rows, n_cols, coef, intercept, preds, loss, cublas_handle, stream); + sgdPredict(handle, input, n_rows, n_cols, coef, intercept, preds, loss, stream); math_t scalar = math_t(1); if (loss == ML::loss_funct::SQRD_LOSS || loss == ML::loss_funct::LOG) { diff --git a/cuML/src/solver/solver.cu b/cuML/src/solver/solver.cu index 3d7298a696..ba270ffc74 100644 --- a/cuML/src/solver/solver.cu +++ b/cuML/src/solver/solver.cu @@ -16,17 +16,16 @@ #include "sgd.h" #include "cd.h" -#include "solver_c.h" +#include "solver.hpp" #include "ml_utils.h" -#include -#include namespace ML { namespace Solver { using namespace ML; -void sgdFit(float *input, +void sgdFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float *labels, @@ -88,45 +87,32 @@ void sgdFit(float *input, "glm.cu: this learning rate type is not supported."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - sgdFit(input, - n_rows, - n_cols, - labels, - coef, - intercept, - fit_intercept, - batch_size, - epochs, - learning_rate_type, - eta0, - power_t, - loss_funct, - pen, - alpha, - l1_ratio, - shuffle, - tol, - n_iter_no_change, - cublas_handle, - cusolver_handle, - stream); 
- - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + sgdFit(handle.getImpl(), + input, + n_rows, + n_cols, + labels, + coef, + intercept, + fit_intercept, + batch_size, + epochs, + learning_rate_type, + eta0, + power_t, + loss_funct, + pen, + alpha, + l1_ratio, + shuffle, + tol, + n_iter_no_change, + handle.getStream()); } -void sgdFit(double *input, +void sgdFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, @@ -186,45 +172,31 @@ void sgdFit(double *input, "glm.cu: this learning rate type is not supported."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - sgdFit(input, - n_rows, - n_cols, - labels, - coef, - intercept, - fit_intercept, - batch_size, - epochs, - learning_rate_type, - eta0, - power_t, - loss_funct, - pen, - alpha, - l1_ratio, - shuffle, - tol, - n_iter_no_change, - cublas_handle, - cusolver_handle, - stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + sgdFit(handle.getImpl(), + input, + n_rows, + n_cols, + labels, + coef, + intercept, + fit_intercept, + batch_size, + epochs, + learning_rate_type, + eta0, + power_t, + loss_funct, + pen, + alpha, + l1_ratio, + shuffle, + tol, + n_iter_no_change, + handle.getStream()); } -void sgdPredict(const float *input, int n_rows, int n_cols, const float *coef, +void sgdPredict(cumlHandle& handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds, int loss) { ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; @@ -239,20 +211,11 @@ void sgdPredict(const float *input, int n_rows, int n_cols, const float *coef, "glm.cu: other functions are not 
supported yet."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - sgdPredict(input, n_rows, n_cols, coef, intercept, preds, loss_funct, cublas_handle, stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + sgdPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, loss_funct, handle.getStream()); } -void sgdPredict(const double *input, int n_rows, int n_cols, +void sgdPredict(cumlHandle& handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds, int loss) { ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; @@ -267,20 +230,11 @@ void sgdPredict(const double *input, int n_rows, int n_cols, "glm.cu: other functions are not supported yet."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - sgdPredict(input, n_rows, n_cols, coef, intercept, preds, loss_funct, cublas_handle, stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + sgdPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, loss_funct, handle.getStream()); } -void sgdPredictBinaryClass(const float *input, int n_rows, int n_cols, const float *coef, +void sgdPredictBinaryClass(cumlHandle& handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds, int loss) { ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; @@ -295,20 +249,11 @@ void sgdPredictBinaryClass(const float *input, int n_rows, int n_cols, const flo "glm.cu: other functions are not supported yet."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - sgdPredictBinaryClass(input, n_rows, n_cols, coef, intercept, preds, loss_funct, 
cublas_handle, stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + sgdPredictBinaryClass(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, loss_funct, handle.getStream()); } -void sgdPredictBinaryClass(const double *input, int n_rows, int n_cols, +void sgdPredictBinaryClass(cumlHandle& handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds, int loss) { ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; @@ -323,22 +268,12 @@ void sgdPredictBinaryClass(const double *input, int n_rows, int n_cols, "glm.cu: other functions are not supported yet."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - sgdPredictBinaryClass(input, n_rows, n_cols, coef, intercept, preds, loss_funct, cublas_handle, stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - - // should probably do a stream sync before destroy - CUDA_CHECK(cudaStreamDestroy(stream)); + sgdPredictBinaryClass(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, loss_funct, handle.getStream()); } -void cdFit(float *input, +void cdFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float *labels, @@ -358,40 +293,26 @@ void cdFit(float *input, ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - cdFit(input, - n_rows, - n_cols, - labels, - coef, - intercept, - fit_intercept, - normalize, - epochs, - loss_funct, - alpha, - l1_ratio, - shuffle, - tol, - stream, - cublas_handle, - cusolver_handle); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - 
CUDA_CHECK(cudaStreamDestroy(stream)); - + cdFit(handle.getImpl(), + input, + n_rows, + n_cols, + labels, + coef, + intercept, + fit_intercept, + normalize, + epochs, + loss_funct, + alpha, + l1_ratio, + shuffle, + tol, + handle.getStream()); } -void cdFit(double *input, +void cdFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, @@ -407,44 +328,30 @@ void cdFit(double *input, double tol) { ASSERT(loss == 0, - "Parameter loss: Only SQRT_LOSS function is supported for now"); + "Parameter loss: Only SQRT_LOSS function is supported for now"); ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - cdFit(input, - n_rows, - n_cols, - labels, - coef, - intercept, - fit_intercept, - normalize, - epochs, - loss_funct, - alpha, - l1_ratio, - shuffle, - tol, - stream, - cublas_handle, - cusolver_handle); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + cdFit(handle.getImpl(), + input, + n_rows, + n_cols, + labels, + coef, + intercept, + fit_intercept, + normalize, + epochs, + loss_funct, + alpha, + l1_ratio, + shuffle, + tol, + handle.getStream()); } -void cdPredict(const float *input, int n_rows, int n_cols, const float *coef, +void cdPredict(cumlHandle& handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds, int loss) { ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; @@ -455,20 +362,10 @@ void cdPredict(const float *input, int n_rows, int n_cols, const float *coef, "glm.cu: other functions are not supported yet."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - 
CUDA_CHECK(cudaStreamCreate(&stream)); - - cdPredict(input, n_rows, n_cols, coef, intercept, preds, loss_funct, stream, cublas_handle); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); - + cdPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, loss_funct, handle.getStream()); } -void cdPredict(const double *input, int n_rows, int n_cols, +void cdPredict(cumlHandle& handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds, int loss) { ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; @@ -479,16 +376,7 @@ void cdPredict(const double *input, int n_rows, int n_cols, "glm.cu: other functions are not supported yet."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - cdPredict(input, n_rows, n_cols, coef, intercept, preds, loss_funct, stream, cublas_handle); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + cdPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, loss_funct, handle.getStream()); } } diff --git a/cuML/src/solver/solver_c.h b/cuML/src/solver/solver.hpp similarity index 73% rename from cuML/src/solver/solver_c.h rename to cuML/src/solver/solver.hpp index 750e36db23..b2cd32804f 100644 --- a/cuML/src/solver/solver_c.h +++ b/cuML/src/solver/solver.hpp @@ -14,11 +14,17 @@ * limitations under the License. 
*/ +#pragma once + +#include "ml_utils.h" +#include "cuML.hpp" + + namespace ML { namespace Solver { - -void sgdFit(float *input, +void sgdFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float *labels, @@ -38,7 +44,8 @@ void sgdFit(float *input, float tol, int n_iter_no_change); -void sgdFit(double *input, +void sgdFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, @@ -58,20 +65,20 @@ void sgdFit(double *input, double tol, int n_iter_no_change); -void sgdPredict(const float *input, int n_rows, int n_cols, const float *coef, +void sgdPredict(cumlHandle& handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds, int loss); -void sgdPredict(const double *input, int n_rows, int n_cols, +void sgdPredict(cumlHandle& handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds, int loss); -void sgdPredictBinaryClass(const float *input, int n_rows, int n_cols, const float *coef, +void sgdPredictBinaryClass(cumlHandle& handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds, int loss); -void sgdPredictBinaryClass(const double *input, int n_rows, int n_cols, +void sgdPredictBinaryClass(cumlHandle& handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds, int loss); - -void cdFit(float *input, +void cdFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float *labels, @@ -86,7 +93,8 @@ void cdFit(float *input, bool shuffle, float tol); -void cdFit(double *input, +void cdFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, @@ -101,11 +109,11 @@ void cdFit(double *input, bool shuffle, double tol); -void cdPredict(const float *input, int n_rows, int n_cols, const float *coef, +void cdPredict(cumlHandle& handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds, int loss); 
-void cdPredict(const double *input, int n_rows, int n_cols, +void cdPredict(cumlHandle& handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds, int loss); -} -} +}; +}; // end namespace ML diff --git a/cuML/test/cd_test.cu b/cuML/test/cd_test.cu index 33743b45f4..4c655173d0 100644 --- a/cuML/test/cd_test.cu +++ b/cuML/test/cd_test.cu @@ -42,12 +42,6 @@ protected: params = ::testing::TestWithParam>::GetParam(); int len = params.n_row * params.n_col; - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - allocate(data, len); allocate(labels, params.n_row); allocate(coef, params.n_col, true); @@ -87,40 +81,39 @@ protected: ML::loss_funct loss = ML::loss_funct::SQRD_LOSS; intercept = T(0); - cdFit(data, params.n_row, params.n_col, labels, coef, &intercept, + cdFit(handle.getImpl(), data, params.n_row, params.n_col, labels, coef, &intercept, fit_intercept, normalize, epochs, loss, alpha, l1_ratio, shuffle, - tol, stream, cublas_handle, cusolver_handle); + tol, stream); fit_intercept = true; intercept2 = T(0); - cdFit(data, params.n_row, params.n_col, labels, coef2, &intercept2, + cdFit(handle.getImpl(), data, params.n_row, params.n_col, labels, coef2, &intercept2, fit_intercept, normalize, epochs, loss, alpha, l1_ratio, shuffle, - tol, stream, cublas_handle, cusolver_handle); + tol, stream); alpha = T(1.0); l1_ratio = T(0.5); fit_intercept = false; intercept = T(0); - cdFit(data, params.n_row, params.n_col, labels, coef3, &intercept, + cdFit(handle.getImpl(), data, params.n_row, params.n_col, labels, coef3, &intercept, fit_intercept, normalize, epochs, loss, alpha, l1_ratio, shuffle, - tol, stream, cublas_handle, cusolver_handle); + tol, stream); fit_intercept = true; normalize = true; intercept2 = T(0); - cdFit(data, params.n_row, params.n_col, labels, coef4, &intercept2, + 
cdFit(handle.getImpl(), data, params.n_row, params.n_col, labels, coef4, &intercept2, fit_intercept, normalize, epochs, loss, alpha, l1_ratio, shuffle, - tol, stream, cublas_handle, cusolver_handle); + tol, stream); - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); } void SetUp() override { CUDA_CHECK(cudaStreamCreate(&stream)); + handle.setStream(stream); lasso(); } @@ -146,7 +139,7 @@ protected: T *coef4, *coef4_ref; T intercept, intercept2; cudaStream_t stream; - + cumlHandle handle; }; const std::vector > inputsf2 = { { 0.01f, 4, 2 } }; diff --git a/cuML/test/sgd.cu b/cuML/test/sgd.cu index 34ae6605de..3ac3fd326d 100644 --- a/cuML/test/sgd.cu +++ b/cuML/test/sgd.cu @@ -29,15 +29,6 @@ protected: params = ::testing::TestWithParam>::GetParam(); int len = params.n_row * params.n_col; - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(data, len); allocate(labels, params.n_row); allocate(coef, params.n_col, true); @@ -71,21 +62,17 @@ protected: MLCommon::Functions::penalty pen = MLCommon::Functions::penalty::NONE; int n_iter_no_change = 10; - sgdFit(data, params.n_row, params.n_col, labels, coef, &intercept, + sgdFit(handle.getImpl(), data, params.n_row, params.n_col, labels, coef, &intercept, fit_intercept, params.batch_size, epochs, lr_type, lr, power_t, loss, pen, alpha, l1_ratio, shuffle, tol, n_iter_no_change, - cublas_handle, cusolver_handle, stream); + stream); fit_intercept = true; intercept2 = T(0); - sgdFit(data, params.n_row, params.n_col, labels, coef2, &intercept2, + sgdFit(handle.getImpl(), data, params.n_row, params.n_col, labels, coef2, &intercept2, fit_intercept, params.batch_size, epochs, ML::lr_type::CONSTANT, lr, power_t, loss, pen, alpha, l1_ratio, shuffle, tol, - n_iter_no_change, cublas_handle, 
cusolver_handle, stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + n_iter_no_change, stream); } @@ -93,15 +80,6 @@ protected: params = ::testing::TestWithParam>::GetParam(); int len = params.n_row2 * params.n_col2; - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - T *coef_class; allocate(data_logreg, len); allocate(data_logreg_test, len); @@ -138,35 +116,22 @@ protected: MLCommon::Functions::penalty pen = MLCommon::Functions::penalty::NONE; int n_iter_no_change = 10; - sgdFit(data_logreg, params.n_row2, params.n_col2, labels_logreg, + sgdFit(handle.getImpl(), data_logreg, params.n_row2, params.n_col2, labels_logreg, coef_class, &intercept_class, fit_intercept, params.batch_size, epochs, lr_type, lr, power_t, loss, pen, alpha, l1_ratio, shuffle, tol, - n_iter_no_change, cublas_handle, cusolver_handle, stream); + n_iter_no_change, stream); - sgdPredictBinaryClass(data_logreg_test, params.n_row2, params.n_col2, - coef_class, intercept_class, pred_log, loss, cublas_handle, stream); + sgdPredictBinaryClass(handle.getImpl(), data_logreg_test, params.n_row2, params.n_col2, + coef_class, intercept_class, pred_log, loss, stream); CUDA_CHECK(cudaFree(coef_class)); - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); - } void svmTest() { params = ::testing::TestWithParam>::GetParam(); int len = params.n_row2 * params.n_col2; - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - T *coef_class; 
allocate(data_svmreg, len); allocate(data_svmreg_test, len); @@ -203,23 +168,21 @@ protected: MLCommon::Functions::penalty pen = MLCommon::Functions::penalty::L2; int n_iter_no_change = 10; - sgdFit(data_svmreg, params.n_row2, params.n_col2, labels_svmreg, + sgdFit(handle.getImpl(), data_svmreg, params.n_row2, params.n_col2, labels_svmreg, coef_class, &intercept_class, fit_intercept, params.batch_size, epochs, lr_type, lr, power_t, loss, pen, alpha, l1_ratio, shuffle, tol, - n_iter_no_change, cublas_handle, cusolver_handle, stream); + n_iter_no_change, stream); - sgdPredictBinaryClass(data_svmreg_test, params.n_row2, params.n_col2, - coef_class, intercept_class, pred_svm, loss, cublas_handle, stream); + sgdPredictBinaryClass(handle.getImpl(), data_svmreg_test, params.n_row2, params.n_col2, + coef_class, intercept_class, pred_svm, loss, stream); CUDA_CHECK(cudaFree(coef_class)); - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); - } void SetUp() override { + CUDA_CHECK(cudaStreamCreate(&stream)); + handle.setStream(stream); linearRegressionTest(); logisticRegressionTest(); svmTest(); @@ -252,6 +215,8 @@ protected: T *data_svmreg, *data_svmreg_test, *labels_svmreg; T *pred_svm, *pred_svm_ref, *pred_log, *pred_log_ref; T intercept, intercept2; + cudaStream_t stream; + cumlHandle handle; }; diff --git a/python/cuml/solvers/cd.pyx b/python/cuml/solvers/cd.pyx index 8b0fa52261..7c2e39498d 100644 --- a/python/cuml/solvers/cd.pyx +++ b/python/cuml/solvers/cd.pyx @@ -24,9 +24,13 @@ from libcpp cimport bool from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free -cdef extern from "solver/solver_c.h" namespace "ML::Solver": +import cuml +from cuml.common.handle cimport cumlHandle - cdef void cdFit(float *input, +cdef extern from "solver/solver.hpp" namespace "ML::Solver": + + cdef void cdFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float 
*labels, @@ -42,7 +46,8 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": float tol) - cdef void cdFit(double *input, + cdef void cdFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, @@ -57,7 +62,8 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": bool shuffle, double tol) - cdef void cdPredict(const float *input, + cdef void cdPredict(cumlHandle& handle, + const float *input, int n_rows, int n_cols, const float *coef, @@ -65,7 +71,8 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": float *preds, int loss) - cdef void cdPredict(const double *input, + cdef void cdPredict(cumlHandle& handle, + const double *input, int n_rows, int n_cols, const double *coef, @@ -73,7 +80,7 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": double *preds, int loss) -class CD: +class CD(cuml.Base): """ Coordinate Descent (CD) is a very common optimization algorithm that minimizes along coordinate directions to find the minimum of a function. 
@@ -153,7 +160,8 @@ class CD: """ def __init__(self, loss='squared_loss', alpha=0.0001, l1_ratio=0.15, - fit_intercept=True, normalize=False, max_iter=1000, tol=1e-3, shuffle=True): + fit_intercept=True, normalize=False, max_iter=1000, tol=1e-3, shuffle=True, + handle=None): if loss in ['squared_loss']: self.loss = self._get_loss_int(loss) @@ -161,6 +169,7 @@ class CD: msg = "loss {!r} is not supported" raise NotImplementedError(msg.format(loss)) + super(CD, self).__init__(handle=handle, verbose=False) self.alpha = alpha self.l1_ratio = l1_ratio self.fit_intercept = fit_intercept @@ -242,42 +251,47 @@ class CD: cdef float c_intercept1 cdef double c_intercept2 - + cdef cumlHandle* handle_ = self.handle.getHandle() + if self.gdf_datatype.type == np.float32: - cdFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept1, - self.fit_intercept, - self.normalize, - self.max_iter, - self.loss, - self.alpha, - self.l1_ratio, - self.shuffle, - self.tol) + cdFit(handle_[0], + X_ptr, + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept1, + self.fit_intercept, + self.normalize, + self.max_iter, + self.loss, + self.alpha, + self.l1_ratio, + self.shuffle, + self.tol) self.intercept_ = c_intercept1 else: - cdFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept2, - self.fit_intercept, - self.normalize, - self.max_iter, - self.loss, - self.alpha, - self.l1_ratio, - self.shuffle, - self.tol) + cdFit(handle_[0], + X_ptr, + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept2, + self.fit_intercept, + self.normalize, + self.max_iter, + self.loss, + self.alpha, + self.l1_ratio, + self.shuffle, + self.tol) self.intercept_ = c_intercept2 + self.handle.sync() + return self def predict(self, X): @@ -318,23 +332,28 @@ class CD: cdef uintptr_t coef_ptr = self._get_column_ptr(self.coef_) preds = cudf.Series(np.zeros(n_rows, dtype=pred_datatype)) cdef uintptr_t preds_ptr = self._get_column_ptr(preds) + cdef cumlHandle* handle_ 
= self.handle.getHandle() if pred_datatype.type == np.float32: - cdPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + cdPredict(handle_[0], + X_ptr, + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) else: - cdPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + cdPredict(handle_[0], + X_ptr, + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) + + self.handle.sync() del(X_m) diff --git a/python/cuml/solvers/sgd.pyx b/python/cuml/solvers/sgd.pyx index e3ac104cdd..7a486f3126 100644 --- a/python/cuml/solvers/sgd.pyx +++ b/python/cuml/solvers/sgd.pyx @@ -24,9 +24,13 @@ from libcpp cimport bool from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free -cdef extern from "solver/solver_c.h" namespace "ML::Solver": +import cuml +from cuml.common.handle cimport cumlHandle - cdef void sgdFit(float *input, +cdef extern from "solver/solver.hpp" namespace "ML::Solver": + + cdef void sgdFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float *labels, @@ -47,7 +51,8 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": int n_iter_no_change) - cdef void sgdFit(double *input, + cdef void sgdFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, @@ -67,7 +72,8 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": double tol, int n_iter_no_change) - cdef void sgdPredict(const float *input, + cdef void sgdPredict(cumlHandle& handle, + const float *input, int n_rows, int n_cols, const float *coef, @@ -75,7 +81,8 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": float *preds, int loss) - cdef void sgdPredict(const double *input, + cdef void sgdPredict(cumlHandle& handle, + const double *input, int n_rows, int n_cols, const double *coef, @@ -83,7 +90,8 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": double *preds, int loss) - cdef void 
sgdPredictBinaryClass(const float *input, + cdef void sgdPredictBinaryClass(cumlHandle& handle, + const float *input, int n_rows, int n_cols, const float *coef, @@ -91,7 +99,8 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": float *preds, int loss) - cdef void sgdPredictBinaryClass(const double *input, + cdef void sgdPredictBinaryClass(cumlHandle& handle, + const double *input, int n_rows, int n_cols, const double *coef, @@ -99,7 +108,7 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": double *preds, int loss) -class SGD: +class SGD(cuml.Base): """ Stochastic Gradient Descent is a very common machine learning algorithm where one optimizes some cost function via gradient steps. This makes SGD very attractive for large problems @@ -187,7 +196,8 @@ class SGD: """ def __init__(self, loss='squared_loss', penalty='none', alpha=0.0001, l1_ratio=0.15, - fit_intercept=True, epochs=1000, tol=1e-3, shuffle=True, learning_rate='constant', eta0=0.0, power_t=0.5, batch_size=32, n_iter_no_change=5): + fit_intercept=True, epochs=1000, tol=1e-3, shuffle=True, learning_rate='constant', eta0=0.0, + power_t=0.5, batch_size=32, n_iter_no_change=5, handle=None): if loss in ['hinge', 'log', 'squared_loss']: self.loss = self._get_loss_int(loss) @@ -201,6 +211,7 @@ class SGD: msg = "penalty {!r} is not supported" raise TypeError(msg.format(penalty)) + super(SGD, self).__init__(handle=handle, verbose=False) self.alpha = alpha self.l1_ratio = l1_ratio self.fit_intercept = fit_intercept @@ -324,52 +335,57 @@ class SGD: cdef float c_intercept1 cdef double c_intercept2 + cdef cumlHandle* handle_ = self.handle.getHandle() if self.gdf_datatype.type == np.float32: - sgdFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept1, - self.fit_intercept, - self.batch_size, - self.epochs, - self.lr_type, - self.eta0, - self.power_t, - self.loss, - self.penalty, - self.alpha, - self.l1_ratio, - self.shuffle, - self.tol, + sgdFit(handle_[0], + X_ptr, + 
self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept1, + self.fit_intercept, + self.batch_size, + self.epochs, + self.lr_type, + self.eta0, + self.power_t, + self.loss, + self.penalty, + self.alpha, + self.l1_ratio, + self.shuffle, + self.tol, self.n_iter_no_change) self.intercept_ = c_intercept1 else: - sgdFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept2, - self.fit_intercept, - self.batch_size, - self.epochs, - self.lr_type, - self.eta0, - self.power_t, - self.loss, - self.penalty, - self.alpha, - self.l1_ratio, - self.shuffle, - self.tol, + sgdFit(handle_[0], + X_ptr, + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept2, + self.fit_intercept, + self.batch_size, + self.epochs, + self.lr_type, + self.eta0, + self.power_t, + self.loss, + self.penalty, + self.alpha, + self.l1_ratio, + self.shuffle, + self.tol, self.n_iter_no_change) self.intercept_ = c_intercept2 + self.handle.sync() + return self def predict(self, X): @@ -411,22 +427,28 @@ class SGD: preds = cudf.Series(np.zeros(n_rows, dtype=pred_datatype)) cdef uintptr_t preds_ptr = self._get_column_ptr(preds) + cdef cumlHandle* handle_ = self.handle.getHandle() + if pred_datatype.type == np.float32: - sgdPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + sgdPredict(handle_[0], + X_ptr, + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) else: - sgdPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + sgdPredict(handle_[0], + X_ptr, + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) + + self.handle.sync() del(X_m) @@ -470,9 +492,11 @@ class SGD: cdef uintptr_t coef_ptr = self._get_column_ptr(self.coef_) preds = cudf.Series(np.zeros(n_rows, dtype=pred_datatype)) cdef uintptr_t preds_ptr = self._get_column_ptr(preds) - + cdef cumlHandle* handle_ = self.handle.getHandle() + if pred_datatype.type == np.float32: - 
sgdPredictBinaryClass(X_ptr, + sgdPredictBinaryClass(handle_[0], + X_ptr, n_rows, n_cols, coef_ptr, @@ -480,7 +504,8 @@ class SGD: preds_ptr, self.loss) else: - sgdPredictBinaryClass(X_ptr, + sgdPredictBinaryClass(handle_[0], + X_ptr, n_rows, n_cols, coef_ptr, @@ -488,6 +513,8 @@ class SGD: preds_ptr, self.loss) + self.handle.sync() + del(X_m) return preds From 68aa6bf7d3b3ea418fc81065b862bb189e2f16f6 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 8 May 2019 12:55:22 -0400 Subject: [PATCH 027/156] All components of Dbscan are now officially integrated into prims. Still need to test & document prims. API still needs some work. --- cuML/src/dbscan/labelling/algo2.h | 130 ++--------------------------- cuML/src/dbscan/labelling/pack.h | 15 +--- cuML/src/dbscan/labelling/runner.h | 16 +--- cuML/src/dbscan/runner.h | 59 +++++++------ ml-prims/src/array/array.h | 98 ++++++++++++++++++++++ ml-prims/src/sparse/csr.h | 102 +++++++++++----------- 6 files changed, 200 insertions(+), 220 deletions(-) create mode 100644 ml-prims/src/array/array.h diff --git a/cuML/src/dbscan/labelling/algo2.h b/cuML/src/dbscan/labelling/algo2.h index 54c3f73714..a35c3046ac 100644 --- a/cuML/src/dbscan/labelling/algo2.h +++ b/cuML/src/dbscan/labelling/algo2.h @@ -28,6 +28,8 @@ #include #include +#include "sparse/csr.h" + namespace Dbscan { namespace Label { @@ -45,132 +47,18 @@ namespace Algo2 { using namespace thrust; using namespace MLCommon; -template -__global__ void label_device(Pack data, int startVertexId, int batchSize) { - int tid = threadIdx.x + blockIdx.x*TPB_X; - if(tidcj) { - ci = cj; - ci_mod = true; - } - } - if(ci_mod) { - atomicMin(data.db_cluster + startVertexId + tid, ci); - data.xa[startVertexId + tid] = true; - data.m[0] = true; - } - } - } -} - -template -__global__ void init_label(Pack data, int startVertexId, int batchSize, Type MAX_LABEL) { - /** F1 and F2 in the paper correspond to fa and xa */ - /** Cd in paper corresponds to db_cluster */ - int tid = 
threadIdx.x + blockIdx.x*TPB_X; - if(tid -__global__ void init_all(Pack data, Type MAX_LABEL) { - int tid = threadIdx.x + blockIdx.x*TPB_X; - if(tid -__global__ void map_label(Pack data, Type MAX_LABEL) { - int tid = threadIdx.x + blockIdx.x*TPB_X; - if(tid -void label(const ML::cumlHandle_impl& handle, Pack data, int startVertexId, int batchSize, cudaStream_t stream) { - size_t N = data.N; - bool host_m; - MLCommon::host_buffer host_fa(handle.getHostAllocator(), stream, N); - MLCommon::host_buffer host_xa(handle.getHostAllocator(), stream, N); - - dim3 blocks(ceildiv(batchSize, TPB_X)); - dim3 threads(TPB_X); - Type MAX_LABEL = std::numeric_limits::max(); - - init_label<<>>(data, startVertexId, batchSize, MAX_LABEL); - do { - CUDA_CHECK( cudaMemsetAsync(data.m, false, sizeof(bool), stream) ); - label_device<<>>(data, startVertexId, batchSize); - //** swapping F1 and F2 - MLCommon::updateHost(host_fa.data(), data.fa, N, stream); - MLCommon::updateHost(host_xa.data(), data.xa, N, stream); - MLCommon::updateDevice(data.fa, host_xa.data(), N, stream); - MLCommon::updateDevice(data.xa, host_fa.data(), N, stream); - //** Updating m * - MLCommon::updateHost(&host_m, data.m, 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - } while(host_m); -} +void launcher(const ML::cumlHandle_impl& handle, Pack data, Type N, + int startVertexId, int batchSize, cudaStream_t stream) { -template -void launcher(const ML::cumlHandle_impl& handle, Pack data, Type N, int startVertexId, int batchSize, cudaStream_t stream) { - //data.resetArray(stream); - dim3 blocks(ceildiv(data.N, TPB_X)); - dim3 threads(TPB_X); - Type MAX_LABEL = std::numeric_limits::max(); - if(startVertexId == 0) - init_all<<>>(data, MAX_LABEL); - label(handle, data, startVertexId, batchSize, stream); -} + bool *core_pts = data.core_pts; -template -void relabel(const ML::cumlHandle_impl& handle, Pack data, cudaStream_t stream) { - dim3 blocks(ceildiv(data.N, TPB_X)); - dim3 threads(TPB_X); - Type MAX_LABEL = 
std::numeric_limits::max(); - size_t N = data.N; - MLCommon::host_buffer host_db_cluster(handle.getHostAllocator(), stream, N); - MLCommon::host_buffer host_map_id(handle.getHostAllocator(), stream, N); - memset(host_map_id.data(), 0, N*sizeof(Type)); - MLCommon::updateHost(host_db_cluster.data(), data.db_cluster, N, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - sort(host, host_db_cluster.data(), host_db_cluster.data() + N); - Type *uid = unique(host, host_db_cluster.data(), host_db_cluster.data() + N, equal_to()); - Type num_clusters = uid - host_db_cluster.data(); - for(int i=0; i<<>>(data, MAX_LABEL); + MLCommon::Sparse::weak_cc_batched( + data.db_cluster, data.ex_scan, data.adj_graph, N, + startVertexId, batchSize, [core_pts](int tid) {return core_pts[tid];}, + data.state, stream); } } // End Algo2 diff --git a/cuML/src/dbscan/labelling/pack.h b/cuML/src/dbscan/labelling/pack.h index eb3afeafea..40d2821932 100644 --- a/cuML/src/dbscan/labelling/pack.h +++ b/cuML/src/dbscan/labelling/pack.h @@ -15,7 +15,7 @@ */ #pragma once - +#include "sparse/csr.h" #include "utils.h" namespace Dbscan { @@ -45,21 +45,12 @@ struct Pack { bool *visited; /** array to store the final cluster */ Type *db_cluster; - /** array to store visited points for GPU */ - bool *xa; - /** array to store border points for GPU */ - bool *fa; - /** bool variable for algo 2 */ - bool *m; - /** array to store map index after sorting */ - Type *map_id; + + MLCommon::Sparse::WeakCCState *state; void resetArray(cudaStream_t stream) { CUDA_CHECK(cudaMemsetAsync(visited, false, sizeof(bool)*N, stream)); CUDA_CHECK(cudaMemsetAsync(db_cluster, 0, sizeof(Type)*N, stream)); - CUDA_CHECK(cudaMemsetAsync(xa, false, sizeof(bool)*N, stream)); - CUDA_CHECK(cudaMemsetAsync(fa, false, sizeof(bool)*N, stream)); - CUDA_CHECK(cudaMemsetAsync(map_id, 0, sizeof(Type)*N, stream)); } }; diff --git a/cuML/src/dbscan/labelling/runner.h b/cuML/src/dbscan/labelling/runner.h index 8d067038f8..016919b723 100644 --- 
a/cuML/src/dbscan/labelling/runner.h +++ b/cuML/src/dbscan/labelling/runner.h @@ -24,6 +24,7 @@ #include "pack.h" #include "algo2.h" #include +#include "sparse/csr.h" namespace Dbscan { namespace Label { @@ -32,10 +33,10 @@ namespace Label { template void run(const ML::cumlHandle_impl& handle, bool* adj, int* vd, Type* adj_graph, Type* ex_scan, Type N, Type minpts, bool* core_pts, bool* visited, Type *db_cluster, - bool *xa, bool *fa, bool *m, Type *map_id, + MLCommon::Sparse::WeakCCState *state, int algo, int startVertexId, int batchSize, cudaStream_t stream) { Pack data = {vd, adj, adj_graph, ex_scan, core_pts, N, minpts, - visited, db_cluster, xa, fa, m, map_id}; + visited, db_cluster, state}; switch(algo) { case 0: Naive::launcher(handle, data, startVertexId, batchSize, stream); @@ -45,22 +46,13 @@ void run(const ML::cumlHandle_impl& handle, bool* adj, int* vd, Type* adj_graph, Algo1::launcher(handle, data, startVertexId, batchSize, stream); break; case 2: - Algo2::launcher(handle, data, N, startVertexId, batchSize, stream); + Algo2::launcher(handle, data, N, startVertexId, batchSize, state, stream); break; default: ASSERT(false, "Incorrect algo passed! 
'%d'", algo); } } -template -void final_relabel(const ML::cumlHandle_impl& handle, bool* adj, int* vd, Type* adj_graph, Type* ex_scan, Type N, - Type minpts, bool* core_pts, bool* visited, Type *db_cluster, - bool *xa, bool *fa, bool *m, Type *map_id, cudaStream_t stream) { - Pack data = {vd, adj, adj_graph, ex_scan, core_pts, N, minpts, - visited, db_cluster, xa, fa, m, map_id}; - Algo2::relabel(handle, data, stream); -} - } // namespace Label } // namespace Dbscan diff --git a/cuML/src/dbscan/runner.h b/cuML/src/dbscan/runner.h index b39482f2fa..22ce3f3dc2 100644 --- a/cuML/src/dbscan/runner.h +++ b/cuML/src/dbscan/runner.h @@ -23,14 +23,30 @@ #include #include +#include "array/array.h" +#include "sparse/csr.h" + namespace Dbscan { using namespace MLCommon; +static const int TPB = 256; + + template -__global__ void relabelForSkl(Type* labels, Type N) { +__global__ void relabelForSkl(Type* labels, Type N, Type MAX_LABEL) { int tid = threadIdx.x + blockDim.x * blockIdx.x; - if(tid < N) --labels[tid]; + if(labels[tid] == MAX_LABEL) labels[tid] = -1; + else if(tid < N) --labels[tid]; +} + +template +void final_relabel(Type *db_cluster, Type N, cudaStream_t stream) { + + Type MAX_LABEL = std::numeric_limits::max(); + + MLCommon::Array::map_to_monotonic(db_cluster, db_cluster, N, stream, + [MAX_LABEL] __device__ (int val) {return val == MAX_LABEL;}); } template @@ -50,25 +66,21 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Type N, Type D, Type_f int algoVd, int algoAdj, int algoCcl, void* workspace, int nBatches, cudaStream_t stream) { const size_t align = 256; - int batchSize = ceildiv(N, nBatches); + Type batchSize = ceildiv(N, nBatches); size_t adjSize = alignTo(sizeof(bool) * N * batchSize, align); size_t corePtsSize = alignTo(sizeof(bool) * batchSize, align); - size_t visitedSize = alignTo(sizeof(bool) * batchSize, align); size_t xaSize = alignTo(sizeof(bool) * N, align); size_t mSize = alignTo(sizeof(bool), align); size_t vdSize = 
alignTo(sizeof(Type) * (batchSize + 1), align); size_t exScanSize = alignTo(sizeof(Type) * batchSize, align); - size_t mapIdSize = alignTo(sizeof(Type) * N, align); if(workspace == NULL) { auto size = adjSize + corePtsSize - + visitedSize + 2 * xaSize + mSize + vdSize - + exScanSize - + mapIdSize; + + exScanSize; return size; } // partition the temporary workspace needed for different stages of dbscan @@ -77,18 +89,18 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Type N, Type D, Type_f char* temp = (char*)workspace; bool* adj = (bool*)temp; temp += adjSize; bool* core_pts = (bool*)temp; temp += corePtsSize; - bool* visited = (bool*)temp; temp += visitedSize; bool* xa = (bool*)temp; temp += xaSize; bool* fa = (bool*)temp; temp += xaSize; bool* m = (bool*)temp; temp += mSize; int* vd = (int*)temp; temp += vdSize; Type* ex_scan = (Type*)temp; temp += exScanSize; - Type* map_id = (Type*)temp; temp += mapIdSize; // Running VertexDeg + MLCommon::Sparse::WeakCCState state(xa, fa, m); + for (int i = 0; i < nBatches; i++) { MLCommon::device_buffer adj_graph(handle.getDeviceAllocator(), stream); - int startVertexId = i * batchSize; + Type startVertexId = i * batchSize; int nPoints = min(N-startVertexId, batchSize); if(nPoints <= 0) @@ -111,22 +123,21 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Type N, Type D, Type_f std::cout << MLCommon::arr2Str(adj, batchSize*N, "adj", stream) << std::endl; std::cout << MLCommon::arr2Str(adj_graph.data(), adjlen, "adj_graph", stream) << std::endl; - // Running Labelling - Label::run(handle, adj, vd, adj_graph.data(), ex_scan, N, minPts, core_pts, visited, - labels, xa, fa, m, map_id, algoCcl, startVertexId, - nPoints, stream); - } - if (algoCcl == 2) { - Type *adj_graph = NULL; - Label::final_relabel(handle, adj, vd, adj_graph, ex_scan, N, minPts, core_pts, - visited, labels, xa, fa, m, map_id, stream); + MLCommon::Sparse::weak_cc_batched( + labels, ex_scan, adj_graph.data(), vd, N, + startVertexId, batchSize, 
[core_pts] __device__ (Type tid) { + return core_pts[tid]; + },&state, stream); } + if (algoCcl == 2) + final_relabel(labels, N, stream); + + Type MAX_LABEL = std::numeric_limits::max(); - static const int TPB = 256; - int nblks = ceildiv(N, TPB); - relabelForSkl<<>>(labels, N); + int nblks = ceildiv(N, TPB); + relabelForSkl<<>>(labels, N, MAX_LABEL); - CUDA_CHECK(cudaPeekAtLastError()); + CUDA_CHECK(cudaPeekAtLastError()); return (size_t) 0; diff --git a/ml-prims/src/array/array.h b/ml-prims/src/array/array.h new file mode 100644 index 0000000000..b39823411d --- /dev/null +++ b/ml-prims/src/array/array.h @@ -0,0 +1,98 @@ +/* + * array.h + * + * Created on: May 8, 2019 + * Author: cjnolet + */ + +#pragma once + +#include +#include + +#include "cuda_utils.h" + + +namespace MLCommon { +namespace Array { + +template +__global__ void map_label_kernel(Type *map_ids, Type *in, Type *out, + Type N, Lambda filter_op) { + int tid = threadIdx.x + blockIdx.x*TPB_X; + if(tid < N) { + + if(!filter_op(in[tid])) { + for(int i=0; i < N; i++) { + if(in[tid] == map_ids[i]) { + out[tid] = i + 1; + break; + } + } + } + } +} + + +/** + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. 
+ * @param N number of elements in the input array + * @param stream cuda stream to use + * @param filter_op an optional function for specifying which values + * should have monotonically increasing labels applied to them. + */ +template +void map_to_monotonic(Type *out, Type *in, Type N, cudaStream_t stream, + Lambda filter_op) { + + static const int TPB_X = 256; + + dim3 blocks(ceildiv(N, TPB_X)); + dim3 threads(TPB_X); + + Type *map_ids; + allocate(map_ids, N, stream); + + Type *host_in = (Type*)malloc(N*sizeof(Type)); + Type *host_map_ids = (Type*)malloc(N*sizeof(Type)); + + memset(host_map_ids, 0, N*sizeof(Type)); + + MLCommon::updateHost(host_in, in, N, stream); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + + thrust::sort(host, host_in, host_in + N); + + Type *uid = thrust::unique(host, host_in, host_in + N, equal_to()); + Type num_clusters = uid - host_in; + for(int i=0; i<<>>(map_ids, in, out, N, filter_op); +} + +template +void map_to_monotonic(Type *out, Type *in, Type N, cudaStream_t stream) { + + map_to_monotonic(out, in, N, stream, + [] __device__ (int val) {return false;}); +} + + + + +}; +}; diff --git a/ml-prims/src/sparse/csr.h b/ml-prims/src/sparse/csr.h index 384433a467..816681e6a2 100644 --- a/ml-prims/src/sparse/csr.h +++ b/ml-prims/src/sparse/csr.h @@ -616,25 +616,41 @@ void csr_adj_graph(T *row_ind, T n_rows, } template -class WeaklyCCState { - protected: +class WeakCCState { + public: + bool *xa; bool *fa; bool *m; - T *map_id; + bool owner; - public: - WeaklyCCState(T n) { - // allocate + WeakCCState(T n): owner(true) { + MLCommon::allocate(xa, n, true); + MLCommon::allocate(fa, n, true); + MLCommon::allocate(m, 1, true); } - ~WeaklyCCState() { - // free + WeakCCState(bool *xa, bool *fa, bool *m): + owner(false), xa(xa), fa(fa), m(m) { + } + + ~WeakCCState() { + if(owner) { + try { + CUDA_CHECK(cudaFree(xa)); + CUDA_CHECK(cudaFree(fa)); + CUDA_CHECK(cudaFree(m)); + } catch(Exception &e) { + std::cout << "Exception freeing memory for 
WeakCCState: " << + e.what() << std::endl; + } + } } }; template -__global__ void weak_cc_label_device(Type *labels, +__global__ void weak_cc_label_device( + Type *labels, Type *row_ind, Type *row_ind_ptr, Type *vd, bool *fa, bool *xa, bool *m, int startVertexId, int batchSize) { @@ -669,7 +685,8 @@ __global__ void weak_cc_label_device(Type *labels, template -__global__ void weak_cc_init_label_kernel(Type *labels, int startVertexId, int batchSize, Type MAX_LABEL, Lambda filter_op) { +__global__ void weak_cc_init_label_kernel(Type *labels, int startVertexId, int batchSize, + Type MAX_LABEL, Lambda filter_op) { /** F1 and F2 in the paper correspond to fa and xa */ /** Cd in paper corresponds to db_cluster */ int tid = threadIdx.x + blockIdx.x*TPB_X; @@ -680,7 +697,8 @@ __global__ void weak_cc_init_label_kernel(Type *labels, int startVertexId, int b } template -__global__ void weak_cc_init_all_kernel(Type *labels, bool *fa, bool *xa, Type MAX_LABEL) { +__global__ void weak_cc_init_all_kernel(Type *labels, bool *fa, bool *xa, + Type N, Type MAX_LABEL) { int tid = threadIdx.x + blockIdx.x*TPB_X; if(tid -void weak_cc_label_batched(const ML::cumlHandle_impl& handle, Type *labels, - Type *row_ind, Type *row_ind_ptr, Type *vd, - WeaklyCCState *state, - int startVertexId, int batchSize, cudaStream_t stream, Lambda filter_op) { - size_t N = data.N; +void weak_cc_label_batched(Type *labels, + Type *row_ind, Type *row_ind_ptr, Type *vd, Type N, + WeakCCState *state, + Type startVertexId, Type batchSize, + cudaStream_t stream, Lambda filter_op) { bool host_m; - MLCommon::host_buffer host_fa(handle.getHostAllocator(), stream, N); - MLCommon::host_buffer host_xa(handle.getHostAllocator(), stream, N); + bool *host_fa = (bool*)malloc(sizeof(bool)*N); + bool *host_xa = (bool*)malloc(sizeof(bool)*N); dim3 blocks(ceildiv(batchSize, TPB_X)); dim3 threads(TPB_X); @@ -707,15 +725,18 @@ void weak_cc_label_batched(const ML::cumlHandle_impl& handle, Type *labels, startVertexId, batchSize, 
MAX_LABEL, filter_op); do { CUDA_CHECK( cudaMemsetAsync(state->m, false, sizeof(bool), stream) ); - weak_cc_label_device<<>>(labels, - row_ind, row_ind_ptr, vs, + weak_cc_label_device<<>>( + labels, + row_ind, row_ind_ptr, vd, state->fa, state->xa, state->m, startVertexId, batchSize); + //** swapping F1 and F2 - MLCommon::updateHost(host_fa.data(), state->fa, N, stream); - MLCommon::updateHost(host_xa.data(), state->xa, N, stream); - MLCommon::updateDevice(data.fa, host_xa.data(), N, stream); - MLCommon::updateDevice(data.xa, host_fa.data(), N, stream); + MLCommon::updateHost(host_fa, state->fa, N, stream); + MLCommon::updateHost(host_xa, state->xa, N, stream); + MLCommon::updateDevice(state->fa, host_xa, N, stream); + MLCommon::updateDevice(state->xa, host_fa, N, stream); + //** Updating m * MLCommon::updateHost(&host_m, state->m, 1, stream); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -724,9 +745,10 @@ void weak_cc_label_batched(const ML::cumlHandle_impl& handle, Type *labels, template void weak_cc_batched( - Type *labels, Type *row_ind, Type *row_ind_ptr, Type N, + Type *labels, Type *row_ind, Type *row_ind_ptr, Type *vd, + Type N, Type startVertexId, Type batchSize, Lambda filter_op, - WeaklyCCState *state, cudaStream_t stream) { + WeakCCState *state, cudaStream_t stream) { dim3 blocks(ceildiv(N, TPB_X)); dim3 threads(TPB_X); @@ -734,31 +756,9 @@ void weak_cc_batched( Type MAX_LABEL = std::numeric_limits::max(); if(startVertexId == 0) weak_cc_init_all_kernel<<>> - (labels, state->fa, state->xa, MAX_LABEL); - weak_cc_label_batched(handle, data, startVertexId, batchSize, filter_op, stream); -} - -template -void weak_cc_finalize_labels(const ML::cumlHandle_impl& handle, Type *labels, - WeaklyCCState *state, size_t N, cudaStream_t stream) { - dim3 blocks(ceildiv(data.N, TPB_X)); - dim3 threads(TPB_X); - Type MAX_LABEL = std::numeric_limits::max(); - MLCommon::host_buffer host_db_cluster(handle.getHostAllocator(), stream, N); - MLCommon::host_buffer 
host_map_id(handle.getHostAllocator(), stream, N); - - memset(host_map_id.data(), 0, N*sizeof(Type)); - MLCommon::updateHost(host_db_cluster.data(), labels, N, stream); - - CUDA_CHECK(cudaStreamSynchronize(stream)); - sort(host, host_db_cluster.data(), host_db_cluster.data() + N); - Type *uid = unique(host, host_db_cluster.data(), host_db_cluster.data() + N, equal_to()); - Type num_clusters = uid - host_db_cluster.data(); - for(int i=0; imap_id, host_map_id.data(), N, stream); - map_label<<>>(data, MAX_LABEL); + (labels, state->fa, state->xa, N, MAX_LABEL); + weak_cc_label_batched(labels, row_ind, row_ind_ptr, vd, N, state, + startVertexId, batchSize, stream, filter_op); } }; From 4949609a47a698431b9ba6096929b4527efbc2ec Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 8 May 2019 13:02:33 -0400 Subject: [PATCH 028/156] Building non-batched weak cc from batched version. --- cuML/src/dbscan/runner.h | 5 +++-- ml-prims/src/array/array.h | 10 +--------- ml-prims/src/sparse/csr.h | 19 +++++++++++++++++-- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/cuML/src/dbscan/runner.h b/cuML/src/dbscan/runner.h index 22ce3f3dc2..fdfe2d0e15 100644 --- a/cuML/src/dbscan/runner.h +++ b/cuML/src/dbscan/runner.h @@ -125,9 +125,10 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Type N, Type D, Type_f MLCommon::Sparse::weak_cc_batched( labels, ex_scan, adj_graph.data(), vd, N, - startVertexId, batchSize, [core_pts] __device__ (Type tid) { + startVertexId, batchSize, &state, stream, + [core_pts] __device__ (Type tid) { return core_pts[tid]; - },&state, stream); + }); } if (algoCcl == 2) final_relabel(labels, N, stream); diff --git a/ml-prims/src/array/array.h b/ml-prims/src/array/array.h index b39823411d..b06d2bb31e 100644 --- a/ml-prims/src/array/array.h +++ b/ml-prims/src/array/array.h @@ -53,7 +53,7 @@ __global__ void map_label_kernel(Type *map_ids, Type *in, Type *out, */ template void map_to_monotonic(Type *out, Type *in, Type N, cudaStream_t 
stream, - Lambda filter_op) { + Lambda filter_op = [] __device__ (int val) {return false;}) { static const int TPB_X = 256; @@ -84,14 +84,6 @@ void map_to_monotonic(Type *out, Type *in, Type N, cudaStream_t stream, map_label_kernel<<>>(map_ids, in, out, N, filter_op); } -template -void map_to_monotonic(Type *out, Type *in, Type N, cudaStream_t stream) { - - map_to_monotonic(out, in, N, stream, - [] __device__ (int val) {return false;}); -} - - }; diff --git a/ml-prims/src/sparse/csr.h b/ml-prims/src/sparse/csr.h index 816681e6a2..e78223607c 100644 --- a/ml-prims/src/sparse/csr.h +++ b/ml-prims/src/sparse/csr.h @@ -747,8 +747,9 @@ template void weak_cc_batched( Type *labels, Type *row_ind, Type *row_ind_ptr, Type *vd, Type N, - Type startVertexId, Type batchSize, Lambda filter_op, - WeakCCState *state, cudaStream_t stream) { + Type startVertexId, Type batchSize, + WeakCCState *state, cudaStream_t stream, + Lambda filter_op = [] __device__ (int tid) {return true;}) { dim3 blocks(ceildiv(N, TPB_X)); dim3 threads(TPB_X); @@ -761,5 +762,19 @@ void weak_cc_batched( startVertexId, batchSize, stream, filter_op); } +template +void weak_cc(Type *labels, Type *row_ind, Type *row_ind_ptr, + Type *vd, Type N, + cudaStream_t stream, + Lambda filter_op = [] __device__ (int tid) {return true;}) { + + WeakCCState state; + weak_cc_batched( + labels, row_ind, row_ind_ptr, + vd, N, 0, N, stream, + filter_op); +} + + }; }; From cfca839e76d55c6e62b77ea358a2c507d1fd668c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 8 May 2019 14:09:58 -0400 Subject: [PATCH 029/156] Updating changelog --- CHANGELOG.md | 2 ++ cuML/src/dbscan/runner.h | 4 ++++ ml-prims/src/array/array.h | 7 +++++-- ml-prims/src/sparse/csr.h | 37 +++++++++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25b95b0f95..15c1ad3c8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -122,6 +122,8 @@ - PR #315: Documentation updating and enhancements - PR #330: Added ignored argument to pca.fit_transform to map to sklearn's implemenation - PR #342: Change default ABI to ON +- PR #572: Pulling DBSCAN components into reusable primitives + ## Bug Fixes diff --git a/cuML/src/dbscan/runner.h b/cuML/src/dbscan/runner.h index fdfe2d0e15..7bdbf07e1d 100644 --- a/cuML/src/dbscan/runner.h +++ b/cuML/src/dbscan/runner.h @@ -43,6 +43,8 @@ __global__ void relabelForSkl(Type* labels, Type N, Type MAX_LABEL) { template void final_relabel(Type *db_cluster, Type N, cudaStream_t stream) { + std::cout << "DBSCAN PERFORMING FINAL RELABEL!" 
<< std::endl; + Type MAX_LABEL = std::numeric_limits::max(); MLCommon::Array::map_to_monotonic(db_cluster, db_cluster, N, stream, @@ -50,6 +52,8 @@ void final_relabel(Type *db_cluster, Type N, cudaStream_t stream) { } template + + /* @param N number of points * @param D dimensionality of the points * @param eps epsilon neighborhood criterion diff --git a/ml-prims/src/array/array.h b/ml-prims/src/array/array.h index b06d2bb31e..80d0bb535a 100644 --- a/ml-prims/src/array/array.h +++ b/ml-prims/src/array/array.h @@ -7,6 +7,9 @@ #pragma once +#include +#include +#include #include #include @@ -72,9 +75,9 @@ void map_to_monotonic(Type *out, Type *in, Type N, cudaStream_t stream, CUDA_CHECK(cudaStreamSynchronize(stream)); - thrust::sort(host, host_in, host_in + N); + thrust::sort(thrust::host, host_in, host_in + N); - Type *uid = thrust::unique(host, host_in, host_in + N, equal_to()); + Type *uid = thrust::unique(thrust::host, host_in, host_in + N, thrust::equal_to()); Type num_clusters = uid - host_in; for(int i=0; i #include @@ -743,6 +745,20 @@ void weak_cc_label_batched(Type *labels, } while(host_m); } +/** + * @brief Compute weakly connected components + * @tparam Type the numeric type of non-floating point elements + * @tparam TPB_X the threads to use per block when configuring the kernel + * @tparam Lambda the type of an optional filter function (int)->bool + * @param labels an array for the output labels + * @param row_ind the compressed row index of the CSR array + * @param row_ind_ptr the row index pointer of the CSR array + * @param vd the vertex degree array (todo: modify this algorithm to only use row_ind) + * @param N number of vertices + * @param stream the cuda stream to use + * @param filter_op an optional filtering function to determine which points + * should get considered for labeling. 
+ */ template void weak_cc_batched( Type *labels, Type *row_ind, Type *row_ind_ptr, Type *vd, @@ -762,6 +778,24 @@ void weak_cc_batched( startVertexId, batchSize, stream, filter_op); } +/** + * @brief Compute weakly connected components. Note that the resulting labels + * may not be taken from a monotonically increasing set (eg. numbers may be + * skipped). The MLCommon::Array package contains a primitive `make_monotonic`, + * which will make a monotonically increasing set of labels. + * + * @tparam Type the numeric type of non-floating point elements + * @tparam TPB_X the threads to use per block when configuring the kernel + * @tparam Lambda the type of an optional filter function (int)->bool + * @param labels an array for the output labels + * @param row_ind the compressed row index of the CSR array + * @param row_ind_ptr the row index pointer of the CSR array + * @param vd the vertex degree array (todo: modify this algorithm to only use row_ind) + * @param N number of vertices + * @param stream the cuda stream to use + * @param filter_op an optional filtering function to determine which points + * should get considered for labeling. + */ template void weak_cc(Type *labels, Type *row_ind, Type *row_ind_ptr, Type *vd, Type N, @@ -773,6 +807,9 @@ void weak_cc(Type *labels, Type *row_ind, Type *row_ind_ptr, labels, row_ind, row_ind_ptr, vd, N, 0, N, stream, filter_op); + + // Map the labels to a monotonic set + Array::map_to_monotonic(labels, labels, stream); } From 89589f9e92c4bce1d3f747fb5b288299b42f13a5 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 8 May 2019 14:29:28 -0400 Subject: [PATCH 030/156] Adding comments to all the new sparse CSR prims. 
--- cuML/src/dbscan/labelling/algo2.h | 5 --- cuML/src/dbscan/runner.h | 2 +- ml-prims/src/array/array.h | 2 +- ml-prims/src/sparse/csr.h | 61 +++++++++++++++++++++++++++---- 4 files changed, 56 insertions(+), 14 deletions(-) diff --git a/cuML/src/dbscan/labelling/algo2.h b/cuML/src/dbscan/labelling/algo2.h index a35c3046ac..05dc870c8a 100644 --- a/cuML/src/dbscan/labelling/algo2.h +++ b/cuML/src/dbscan/labelling/algo2.h @@ -53,12 +53,7 @@ template void launcher(const ML::cumlHandle_impl& handle, Pack data, Type N, int startVertexId, int batchSize, cudaStream_t stream) { - bool *core_pts = data.core_pts; - MLCommon::Sparse::weak_cc_batched( - data.db_cluster, data.ex_scan, data.adj_graph, N, - startVertexId, batchSize, [core_pts](int tid) {return core_pts[tid];}, - data.state, stream); } } // End Algo2 diff --git a/cuML/src/dbscan/runner.h b/cuML/src/dbscan/runner.h index 7bdbf07e1d..e03822eb97 100644 --- a/cuML/src/dbscan/runner.h +++ b/cuML/src/dbscan/runner.h @@ -47,7 +47,7 @@ void final_relabel(Type *db_cluster, Type N, cudaStream_t stream) { Type MAX_LABEL = std::numeric_limits::max(); - MLCommon::Array::map_to_monotonic(db_cluster, db_cluster, N, stream, + MLCommon::Array::make_monotonic(db_cluster, db_cluster, N, stream, [MAX_LABEL] __device__ (int val) {return val == MAX_LABEL;}); } diff --git a/ml-prims/src/array/array.h b/ml-prims/src/array/array.h index 80d0bb535a..d05a9d77cb 100644 --- a/ml-prims/src/array/array.h +++ b/ml-prims/src/array/array.h @@ -55,7 +55,7 @@ __global__ void map_label_kernel(Type *map_ids, Type *in, Type *out, * should have monotonically increasing labels applied to them. 
*/ template -void map_to_monotonic(Type *out, Type *in, Type N, cudaStream_t stream, +void make_monotonic(Type *out, Type *in, Type N, cudaStream_t stream, Lambda filter_op = [] __device__ (int val) {return false;}) { static const int TPB_X = 256; diff --git a/ml-prims/src/sparse/csr.h b/ml-prims/src/sparse/csr.h index a47b85d309..73e3320937 100644 --- a/ml-prims/src/sparse/csr.h +++ b/ml-prims/src/sparse/csr.h @@ -572,7 +572,15 @@ __global__ void csr_row_op_batched_kernel(T *row_ind, T total_rows, } /** - * Performs a batched row operation on the rows of a CSR matrix. + * @brief Perform a custom row operation on a CSR matrix in batches. + * @tparam T numerical type of row_ind array + * @tparam TPB_X number of threads per block to use for underlying kernel + * @tparam Lambda type of custom operation function + * @param row_ind the CSR row_ind array to perform parallel operations over + * @param total_rows total number vertices in graph + * @param batchSize size of row_ind + * @param op custom row operation functor + * @param stream cuda stream to use */ template void csr_row_op_batched(T *row_ind, T total_rows, T batchSize, @@ -585,11 +593,35 @@ void csr_row_op_batched(T *row_ind, T total_rows, T batchSize, (row_ind, total_rows, batchSize, op); } +/** + * @brief Perform a custom row operation on a CSR matrix. + * @tparam T numerical type of row_ind array + * @tparam TPB_X number of threads per block to use for underlying kernel + * @tparam Lambda type of custom operation function + * @param row_ind the CSR row_ind array to perform parallel operations over + * @param n_rows total number vertices in graph (size of row_ind) + * @param op custom row operation functor + * @param stream cuda stream to use + */ template void csr_row_op(T *row_ind, T n_rows, Lambda op, cudaStream_t stream) { csr_row_op_batched(row_ind, n_rows, n_rows, op, stream); } +/** + * @brief Constructs an adjacency graph CSR row_ind_ptr array from + * a row_ind array and adjacency array. 
+ * @tparam T the numeric type of the index arrays + * @tparam TPB_X the number of threads to use per block for kernels + * @tparam Lambda function for fused operation in the adj_graph construction + * @param row_ind the input CSR row_ind array + * @param total_rows number of vertices in graph + * @param batchSize number of vertices in current batch + * @param adj an adjacency array + * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph + * @param stream cuda stream to use + */ + template void csr_adj_graph_batched(T *row_ind, T total_rows, T batchSize, bool *adj, T *row_ind_ptr, Lambda fused_op, cudaStream_t stream) { @@ -605,16 +637,27 @@ void csr_adj_graph_batched(T *row_ind, T total_rows, T batchSize, if(adj[batchSize * i + row]) { row_ind_ptr[start_idx + k] = i; k += 1; - printf("row=%d, adj=%d, total_rows=%d, k=%d\n", row, adj[total_rows * i + row], total_rows, k); } } }, stream); } +/** + * @brief Constructs an adjacency graph CSR row_ind_ptr array from a + * a row_ind array and adjacency array. + * @tparam T the numeric type of the index arrays + * @tparam TPB_X the number of threads to use per block for kernels + * @param row_ind the input CSR row_ind array + * @param n_rows number of total vertices in graph + * @param adj an adjacency array + * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph + * @param stream cuda stream to use + */ template void csr_adj_graph(T *row_ind, T n_rows, bool *adj, T *row_ind_ptr, cudaStream_t stream) { - csr_adj_graph_batched(row_ind, n_rows, n_rows, adj, row_ind_ptr, stream); + csr_adj_graph_batched(row_ind, n_rows, n_rows, adj, + row_ind_ptr, stream); } template @@ -746,7 +789,11 @@ void weak_cc_label_batched(Type *labels, } /** - * @brief Compute weakly connected components + * @brief Compute weakly connected components. Note that the resulting labels + * may not be taken from a monotonically increasing set (eg. numbers may be + * skipped). 
The MLCommon::Array package contains a primitive `make_monotonic`, + * which will make a monotonically increasing set of labels. + * * @tparam Type the numeric type of non-floating point elements * @tparam TPB_X the threads to use per block when configuring the kernel * @tparam Lambda the type of an optional filter function (int)->bool @@ -755,6 +802,9 @@ void weak_cc_label_batched(Type *labels, * @param row_ind_ptr the row index pointer of the CSR array * @param vd the vertex degree array (todo: modify this algorithm to only use row_ind) * @param N number of vertices + * @param startVertexId the starting vertex index for the current batch + * @param batchSize number of vertices for current batch + * @param state instance of inter-batch state management * @param stream the cuda stream to use * @param filter_op an optional filtering function to determine which points * should get considered for labeling. @@ -807,9 +857,6 @@ void weak_cc(Type *labels, Type *row_ind, Type *row_ind_ptr, labels, row_ind, row_ind_ptr, vd, N, 0, N, stream, filter_op); - - // Map the labels to a monotonic set - Array::map_to_monotonic(labels, labels, stream); } From b82e045127850b32b951690570e12e45bfd09981 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 8 May 2019 15:18:24 -0400 Subject: [PATCH 031/156] Removing unnecessary vertex degree from weak cc --- cuML/src/dbscan/runner.h | 7 +------ ml-prims/src/sparse/csr.h | 30 +++++++++++++++--------------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/cuML/src/dbscan/runner.h b/cuML/src/dbscan/runner.h index e03822eb97..078cce9457 100644 --- a/cuML/src/dbscan/runner.h +++ b/cuML/src/dbscan/runner.h @@ -43,8 +43,6 @@ __global__ void relabelForSkl(Type* labels, Type N, Type MAX_LABEL) { template void final_relabel(Type *db_cluster, Type N, cudaStream_t stream) { - std::cout << "DBSCAN PERFORMING FINAL RELABEL!" 
<< std::endl; - Type MAX_LABEL = std::numeric_limits::max(); MLCommon::Array::make_monotonic(db_cluster, db_cluster, N, stream, @@ -124,11 +122,8 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Type N, Type D, Type_f AdjGraph::run(handle, adj, vd, adj_graph.data(), ex_scan, N, minPts, core_pts, algoAdj, nPoints, stream); - std::cout << MLCommon::arr2Str(adj, batchSize*N, "adj", stream) << std::endl; - std::cout << MLCommon::arr2Str(adj_graph.data(), adjlen, "adj_graph", stream) << std::endl; - MLCommon::Sparse::weak_cc_batched( - labels, ex_scan, adj_graph.data(), vd, N, + labels, ex_scan, adj_graph.data(), adjlen, N, startVertexId, batchSize, &state, stream, [core_pts] __device__ (Type tid) { return core_pts[tid]; diff --git a/ml-prims/src/sparse/csr.h b/ml-prims/src/sparse/csr.h index 73e3320937..7dc00cf9c9 100644 --- a/ml-prims/src/sparse/csr.h +++ b/ml-prims/src/sparse/csr.h @@ -696,7 +696,7 @@ class WeakCCState { template __global__ void weak_cc_label_device( Type *labels, - Type *row_ind, Type *row_ind_ptr, Type *vd, + Type *row_ind, Type *row_ind_ptr, Type nnz, bool *fa, bool *xa, bool *m, int startVertexId, int batchSize) { int tid = threadIdx.x + blockIdx.x*TPB_X; @@ -707,7 +707,10 @@ __global__ void weak_cc_label_device( Type ci, cj; bool ci_mod = false; ci = labels[tid + startVertexId]; - for(int j=0; j< int(vd[tid]); j++) { // TODO: Can't this be calculated from the ex_scan? + + Type degree = get_stop_idx(tid, batchSize,nnz, row_ind) - row_ind[tid]; + + for(int j=0; j< int(degree); j++) { // TODO: Can't this be calculated from the ex_scan? 
cj = labels[row_ind_ptr[start + j]]; if(ci void weak_cc_label_batched(Type *labels, - Type *row_ind, Type *row_ind_ptr, Type *vd, Type N, + Type *row_ind, Type *row_ind_ptr, Type nnz, Type N, WeakCCState *state, Type startVertexId, Type batchSize, cudaStream_t stream, Lambda filter_op) { @@ -772,7 +775,7 @@ void weak_cc_label_batched(Type *labels, CUDA_CHECK( cudaMemsetAsync(state->m, false, sizeof(bool), stream) ); weak_cc_label_device<<>>( labels, - row_ind, row_ind_ptr, vd, + row_ind, row_ind_ptr, nnz, state->fa, state->xa, state->m, startVertexId, batchSize); @@ -800,7 +803,7 @@ void weak_cc_label_batched(Type *labels, * @param labels an array for the output labels * @param row_ind the compressed row index of the CSR array * @param row_ind_ptr the row index pointer of the CSR array - * @param vd the vertex degree array (todo: modify this algorithm to only use row_ind) + * @param nnz the size of row_ind_ptr array * @param N number of vertices * @param startVertexId the starting vertex index for the current batch * @param batchSize number of vertices for current batch @@ -810,10 +813,8 @@ void weak_cc_label_batched(Type *labels, * should get considered for labeling. 
*/ template -void weak_cc_batched( - Type *labels, Type *row_ind, Type *row_ind_ptr, Type *vd, - Type N, - Type startVertexId, Type batchSize, +void weak_cc_batched(Type *labels, Type *row_ind, Type *row_ind_ptr, + Type nnz, Type N, Type startVertexId, Type batchSize, WeakCCState *state, cudaStream_t stream, Lambda filter_op = [] __device__ (int tid) {return true;}) { @@ -824,7 +825,7 @@ void weak_cc_batched( if(startVertexId == 0) weak_cc_init_all_kernel<<>> (labels, state->fa, state->xa, N, MAX_LABEL); - weak_cc_label_batched(labels, row_ind, row_ind_ptr, vd, N, state, + weak_cc_label_batched(labels, row_ind, row_ind_ptr, nnz, N, state, startVertexId, batchSize, stream, filter_op); } @@ -840,22 +841,21 @@ void weak_cc_batched( * @param labels an array for the output labels * @param row_ind the compressed row index of the CSR array * @param row_ind_ptr the row index pointer of the CSR array - * @param vd the vertex degree array (todo: modify this algorithm to only use row_ind) + * @param nnz the size of row_ind_ptr array * @param N number of vertices * @param stream the cuda stream to use * @param filter_op an optional filtering function to determine which points * should get considered for labeling. */ template -void weak_cc(Type *labels, Type *row_ind, Type *row_ind_ptr, - Type *vd, Type N, - cudaStream_t stream, +void weak_cc(Type *labels, const Type *row_ind, const Type *row_ind_ptr, + Type nnz, Type N, cudaStream_t stream, Lambda filter_op = [] __device__ (int tid) {return true;}) { WeakCCState state; weak_cc_batched( labels, row_ind, row_ind_ptr, - vd, N, 0, N, stream, + nnz, N, 0, N, stream, filter_op); } From e6a8142010ce1d96815257eb13c3c1062d3b5fcc Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 8 May 2019 15:25:03 -0400 Subject: [PATCH 032/156] Adding const pointer qualifier for input values --- ml-prims/src/sparse/csr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ml-prims/src/sparse/csr.h b/ml-prims/src/sparse/csr.h index 7dc00cf9c9..b3d14d2e43 100644 --- a/ml-prims/src/sparse/csr.h +++ b/ml-prims/src/sparse/csr.h @@ -813,7 +813,7 @@ void weak_cc_label_batched(Type *labels, * should get considered for labeling. */ template -void weak_cc_batched(Type *labels, Type *row_ind, Type *row_ind_ptr, +void weak_cc_batched(Type *labels, Type* const row_ind, Type* const row_ind_ptr, Type nnz, Type N, Type startVertexId, Type batchSize, WeakCCState *state, cudaStream_t stream, Lambda filter_op = [] __device__ (int tid) {return true;}) { @@ -848,7 +848,7 @@ void weak_cc_batched(Type *labels, Type *row_ind, Type *row_ind_ptr, * should get considered for labeling. */ template -void weak_cc(Type *labels, const Type *row_ind, const Type *row_ind_ptr, +void weak_cc(Type *labels, Type* const row_ind, Type* const row_ind_ptr, Type nnz, Type N, cudaStream_t stream, Lambda filter_op = [] __device__ (int tid) {return true;}) { From eeebdf0bb9cfd028dbfb0d44fb2e3a6f418d047d Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Wed, 8 May 2019 15:23:16 -0500 Subject: [PATCH 033/156] FEA Initial move of prims content into cuML folder and rename to cpp --- {cuML => cpp}/.gitignore | 0 {cuML => cpp}/CMakeLists.txt | 166 +++++---- {cuML => cpp}/DEVELOPER_GUIDE.md | 0 {ml-prims => cpp}/Doxyfile.in | 0 {cuML => cpp}/README.md | 0 {ml-prims => cpp}/cmake/FindClangFormat.cmake | 0 {ml-prims => cpp}/cmake/FindClangTidy.cmake | 0 {ml-prims => cpp}/cmake/doxygen.cmake | 0 {cuML => cpp}/examples/CMakeLists.txt | 0 {cuML => cpp}/examples/dbscan/CMakeLists.txt | 0 .../examples/dbscan/CMakeLists_standalone.txt | 0 {cuML => cpp}/examples/dbscan/README.md | 0 .../examples/dbscan/dbscan_example.cpp | 0 {cuML => cpp}/examples/dbscan/gen_dataset.py | 0 
{cuML => cpp}/examples/kmeans/CMakeLists.txt | 0 .../examples/kmeans/CMakeLists_standalone.txt | 0 {cuML => cpp}/examples/kmeans/README.md | 0 .../examples/kmeans/kmeans_example.cpp | 0 .../examples/kmeans/prepare_input.py | 0 {cuML => cpp}/external | 0 {ml-prims => cpp}/scripts/plotter_kf.py | 0 {ml-prims => cpp}/scripts/run-clang-format.py | 0 {cuML => cpp}/src/.gitkeep | 0 {cuML => cpp}/src/common/allocatorAdapter.hpp | 0 {cuML => cpp}/src/common/cumlHandle.cpp | 0 {cuML => cpp}/src/common/cumlHandle.hpp | 0 {cuML => cpp}/src/common/cuml_api.cpp | 0 cpp/src/common/rmmAllocatorAdapter.hpp | 103 ++++++ {cuML => cpp}/src/common/tensor.hpp | 0 {cuML => cpp}/src/cuML.hpp | 0 {cuML => cpp}/src/cuML_api.h | 0 {cuML => cpp}/src/dbscan/adjgraph/algo.h | 0 {cuML => cpp}/src/dbscan/adjgraph/naive.h | 0 {cuML => cpp}/src/dbscan/adjgraph/pack.h | 0 {cuML => cpp}/src/dbscan/adjgraph/runner.h | 0 {cuML => cpp}/src/dbscan/common.h | 0 {cuML => cpp}/src/dbscan/dbscan.cu | 0 {cuML => cpp}/src/dbscan/dbscan.h | 0 {cuML => cpp}/src/dbscan/dbscan.hpp | 0 {cuML => cpp}/src/dbscan/dbscan_api.h | 0 {cuML => cpp}/src/dbscan/labelling/algo1.h | 0 {cuML => cpp}/src/dbscan/labelling/algo2.h | 0 {cuML => cpp}/src/dbscan/labelling/naive.h | 0 {cuML => cpp}/src/dbscan/labelling/pack.h | 0 {cuML => cpp}/src/dbscan/labelling/runner.h | 0 {cuML => cpp}/src/dbscan/runner.h | 0 {cuML => cpp}/src/dbscan/vertexdeg/algo.h | 0 {cuML => cpp}/src/dbscan/vertexdeg/naive.h | 0 {cuML => cpp}/src/dbscan/vertexdeg/pack.h | 0 {cuML => cpp}/src/dbscan/vertexdeg/runner.h | 0 {cuML => cpp}/src/decisiontree/.gitkeep | 0 {cuML => cpp}/src/decisiontree/algo_helper.h | 0 .../src/decisiontree/decisiontree.cu | 0 {cuML => cpp}/src/decisiontree/decisiontree.h | 0 .../decisiontree/kernels/col_condenser.cuh | 0 .../src/decisiontree/kernels/evaluate.cuh | 0 .../src/decisiontree/kernels/gini.cuh | 0 .../src/decisiontree/kernels/gini_def.h | 0 .../src/decisiontree/kernels/quantile.cuh | 0 
.../src/decisiontree/kernels/split_labels.cuh | 0 {cuML => cpp}/src/decisiontree/memory.cuh | 0 {cuML => cpp}/src/glm/glm.cu | 0 {cuML => cpp}/src/glm/glm.hpp | 0 {cuML => cpp}/src/glm/glm_api.h | 0 {cuML => cpp}/src/glm/glm_spmg.h | 0 {cuML => cpp}/src/glm/ols.h | 0 {cuML => cpp}/src/glm/preprocess.h | 0 {cuML => cpp}/src/glm/qn/glm_base.h | 0 {cuML => cpp}/src/glm/qn/glm_linear.h | 0 {cuML => cpp}/src/glm/qn/glm_logistic.h | 0 {cuML => cpp}/src/glm/qn/glm_regularizer.h | 0 {cuML => cpp}/src/glm/qn/glm_softmax.h | 0 {cuML => cpp}/src/glm/qn/qn.h | 0 {cuML => cpp}/src/glm/qn/qn_linesearch.h | 0 {cuML => cpp}/src/glm/qn/qn_solvers.h | 0 {cuML => cpp}/src/glm/qn/qn_util.h | 0 {cuML => cpp}/src/glm/qn/simple_mat.h | 0 {cuML => cpp}/src/glm/ridge.h | 0 .../src/kalman_filter/KalmanFilter.cuh | 0 .../src/kalman_filter/kf_variables.h | 0 {cuML => cpp}/src/kalman_filter/lkf.h | 0 {cuML => cpp}/src/kalman_filter/lkf_py.cu | 0 {cuML => cpp}/src/kalman_filter/lkf_py.h | 0 {cuML => cpp}/src/kalman_filter/utils.h | 0 {cuML => cpp}/src/kmeans/kmeans-inl.cuh | 0 {cuML => cpp}/src/kmeans/kmeans.cu | 0 {cuML => cpp}/src/kmeans/kmeans.cuh | 0 {cuML => cpp}/src/kmeans/kmeans.hpp | 0 cpp/src/knn/knn.cu | 347 ++++++++++++++++++ {cuML => cpp}/src/knn/knn.h | 0 {cuML => cpp}/src/metrics/metrics.cu | 0 {cuML => cpp}/src/metrics/metrics.hpp | 0 {cuML => cpp}/src/ml_cuda_utils.h | 0 {cuML => cpp}/src/ml_utils.h | 0 {cuML => cpp}/src/pca/pca.cu | 0 {cuML => cpp}/src/pca/pca.h | 0 {cuML => cpp}/src/pca/pca.hpp | 0 {cuML => cpp}/src/randomforest/.gitkeep | 0 .../src/randomforest/randomforest.cu | 0 {cuML => cpp}/src/randomforest/randomforest.h | 0 {cuML => cpp}/src/solver/cd.h | 0 {cuML => cpp}/src/solver/learning_rate.h | 0 {cuML => cpp}/src/solver/sgd.h | 0 {cuML => cpp}/src/solver/shuffle.h | 0 {cuML => cpp}/src/solver/solver.cu | 0 {cuML => cpp}/src/solver/solver_c.h | 0 {cuML => cpp}/src/spectral/spectral.h | 0 {cuML => cpp}/src/tsvd/tsvd.cu | 0 {cuML => cpp}/src/tsvd/tsvd.h | 0 {cuML => 
cpp}/src/tsvd/tsvd.hpp | 0 {cuML => cpp}/src/tsvd/tsvd_spmg.h | 0 .../src/umap/fuzzy_simpl_set/naive.h | 0 .../src/umap/fuzzy_simpl_set/runner.h | 0 .../src/umap/init_embed/random_algo.h | 0 {cuML => cpp}/src/umap/init_embed/runner.h | 0 .../src/umap/init_embed/spectral_algo.h | 0 {cuML => cpp}/src/umap/knn_graph/algo.h | 0 {cuML => cpp}/src/umap/knn_graph/runner.h | 0 {cuML => cpp}/src/umap/optimize.h | 0 {cuML => cpp}/src/umap/runner.h | 0 {cuML => cpp}/src/umap/simpl_set_embed/algo.h | 0 .../src/umap/simpl_set_embed/runner.h | 0 {cuML => cpp}/src/umap/supervised.h | 0 {cuML => cpp}/src/umap/umap.cu | 0 {cuML => cpp}/src/umap/umap.h | 0 {cuML => cpp}/src/umap/umapparams.h | 0 .../src => cpp/src_prims}/common/Timer.h | 0 .../src_prims}/common/buffer_base.hpp | 0 .../src_prims}/common/cuml_allocator.hpp | 0 .../src_prims}/common/device_buffer.hpp | 0 .../src => cpp/src_prims}/common/grid_sync.h | 0 .../src_prims}/common/host_buffer.hpp | 0 {ml-prims/src => cpp/src_prims}/cuda_utils.h | 0 .../src_prims}/decoupled_lookback.h | 0 .../src => cpp/src_prims}/distance/algo1.h | 0 .../src => cpp/src_prims}/distance/cosine.h | 0 .../src => cpp/src_prims}/distance/distance.h | 0 .../src_prims}/distance/distance_epilogue.h | 0 .../distance/distance_epilogue_functor.h | 0 .../distance/distance_epilogue_traits.h | 0 .../distance/distance_fragment_multiply_add.h | 0 .../distance/distance_tile_traits.h | 0 .../src_prims}/distance/euclidean.h | 0 .../src_prims}/distance/fragment_sqrt.h | 0 {ml-prims/src => cpp/src_prims}/distance/l1.h | 0 .../src_prims}/distance/linear_scaling_sqrt.h | 0 .../src => cpp/src_prims}/functions/hinge.h | 0 .../src_prims}/functions/linearReg.h | 0 .../src => cpp/src_prims}/functions/log.h | 0 .../src_prims}/functions/logisticReg.h | 0 .../src => cpp/src_prims}/functions/penalty.h | 0 .../src => cpp/src_prims}/functions/sigmoid.h | 0 .../src => cpp/src_prims}/functions/sign.h | 0 .../src_prims}/functions/softThres.h | 0 {ml-prims/src => 
cpp/src_prims}/linalg/add.h | 0 .../src => cpp/src_prims}/linalg/binary_op.h | 0 .../src_prims}/linalg/coalesced_reduction.h | 0 .../src_prims}/linalg/cublas_wrappers.h | 0 .../src_prims}/linalg/cusolver_wrappers.h | 0 .../src_prims}/linalg/custom_accum.h | 0 .../src_prims}/linalg/cutlass_wrappers.h | 0 .../src => cpp/src_prims}/linalg/divide.h | 0 {ml-prims/src => cpp/src_prims}/linalg/eig.h | 0 .../src => cpp/src_prims}/linalg/eltwise.h | 0 .../src => cpp/src_prims}/linalg/eltwise2d.h | 0 {ml-prims/src => cpp/src_prims}/linalg/gemm.h | 0 {ml-prims/src => cpp/src_prims}/linalg/gemv.h | 0 .../src => cpp/src_prims}/linalg/lstsq.h | 0 .../src_prims}/linalg/map_then_reduce.h | 0 .../src_prims}/linalg/matrix_vector_op.h | 0 .../src_prims}/linalg/mean_squared_error.h | 0 .../src => cpp/src_prims}/linalg/multiply.h | 0 {ml-prims/src => cpp/src_prims}/linalg/norm.h | 0 .../src => cpp/src_prims}/linalg/power.h | 0 {ml-prims/src => cpp/src_prims}/linalg/qr.h | 0 .../src => cpp/src_prims}/linalg/reduce.h | 0 .../src_prims}/linalg/reduce_rows_by_key.h | 0 .../src => cpp/src_prims}/linalg/row_gemm.h | 0 {ml-prims/src => cpp/src_prims}/linalg/rsvd.h | 0 {ml-prims/src => cpp/src_prims}/linalg/sqrt.h | 0 .../src_prims}/linalg/strided_reduction.h | 0 .../src => cpp/src_prims}/linalg/subtract.h | 0 {ml-prims/src => cpp/src_prims}/linalg/svd.h | 0 .../src => cpp/src_prims}/linalg/ternary_op.h | 0 .../src => cpp/src_prims}/linalg/transpose.h | 0 .../src => cpp/src_prims}/linalg/unary_op.h | 0 .../src_prims}/linalg/vector_broadcast.h | 0 .../src => cpp/src_prims}/matrix/gather.h | 0 {ml-prims/src => cpp/src_prims}/matrix/math.h | 0 .../src => cpp/src_prims}/matrix/matrix.h | 0 .../src_prims}/random/curand_wrappers.h | 0 {ml-prims/src => cpp/src_prims}/random/mvg.h | 0 .../src => cpp/src_prims}/random/permute.h | 0 {ml-prims/src => cpp/src_prims}/random/rng.h | 0 .../src => cpp/src_prims}/random/rng_impl.h | 0 .../src => cpp/src_prims}/score/scores.h | 0 
.../src_prims}/selection/columnWiseSort.h | 0 .../src_prims}/selection/kselection.h | 0 {ml-prims/src => cpp/src_prims}/sparse/coo.h | 0 {ml-prims/src => cpp/src_prims}/sparse/csr.h | 0 .../src_prims}/sparse/cusparse_wrappers.h | 0 .../src_prims}/sparse/nvgraph_wrappers.h | 0 {ml-prims/src => cpp/src_prims}/stats/cov.h | 0 {ml-prims/src => cpp/src_prims}/stats/mean.h | 0 .../src => cpp/src_prims}/stats/mean_center.h | 0 .../src => cpp/src_prims}/stats/minmax.h | 0 .../src => cpp/src_prims}/stats/stddev.h | 0 {ml-prims/src => cpp/src_prims}/stats/sum.h | 0 .../src_prims}/stats/weighted_mean.h | 0 {ml-prims/src => cpp/src_prims}/utils.h | 0 {ml-prims/src => cpp/src_prims}/vectorized.h | 0 {cuML => cpp}/test/.gitkeep | 0 .../knn_test.cu => cpp/test/mg/knn_test_mg.cu | 0 .../test => cpp/test/prims}/CMakeLists.txt | 0 {ml-prims/test => cpp/test/prims}/add.cu | 0 {ml-prims/test => cpp/test/prims}/add.h | 0 .../test/prims}/add_and_sub_dev_scalar.cu | 0 .../test => cpp/test/prims}/binary_op.cu | 0 {ml-prims/test => cpp/test/prims}/binary_op.h | 0 .../test/prims}/coalesced_reduction.cu | 0 .../test => cpp/test/prims}/columnSort.cu | 0 {ml-prims/test => cpp/test/prims}/coo.cu | 0 {ml-prims/test => cpp/test/prims}/coo.h | 0 {ml-prims/test => cpp/test/prims}/cov.cu | 0 {ml-prims/test => cpp/test/prims}/csr.cu | 0 {ml-prims/test => cpp/test/prims}/csr.h | 0 .../test => cpp/test/prims}/cuda_utils.cu | 0 .../test/prims}/decoupled_lookback.cu | 0 {ml-prims/test => cpp/test/prims}/dist_adj.cu | 0 {ml-prims/test => cpp/test/prims}/dist_cos.cu | 0 .../test => cpp/test/prims}/dist_euc_exp.cu | 0 .../test => cpp/test/prims}/dist_euc_unexp.cu | 0 {ml-prims/test => cpp/test/prims}/dist_l1.cu | 0 .../test => cpp/test/prims}/distance_base.h | 0 {ml-prims/test => cpp/test/prims}/divide.cu | 0 {ml-prims/test => cpp/test/prims}/eig.cu | 0 {ml-prims/test => cpp/test/prims}/eltwise.cu | 0 .../test => cpp/test/prims}/eltwise2d.cu | 0 {ml-prims/test => cpp/test/prims}/gather.cu | 0 {ml-prims/test 
=> cpp/test/prims}/gemm.cu | 0 .../test => cpp/test/prims}/grid_sync.cu | 0 {ml-prims/test => cpp/test/prims}/hinge.cu | 0 .../test => cpp/test/prims}/kselection.cu | 0 .../test => cpp/test/prims}/linearReg.cu | 0 {ml-prims/test => cpp/test/prims}/log.cu | 0 .../test => cpp/test/prims}/logisticReg.cu | 0 .../test/prims}/map_then_reduce.cu | 0 {ml-prims/test => cpp/test/prims}/math.cu | 0 {ml-prims/test => cpp/test/prims}/matrix.cu | 0 .../test/prims}/matrix_vector_op.cu | 0 .../test/prims}/matrix_vector_op.h | 0 {ml-prims/test => cpp/test/prims}/mean.cu | 0 .../test => cpp/test/prims}/mean_center.cu | 0 {ml-prims/test => cpp/test/prims}/minmax.cu | 0 {ml-prims/test => cpp/test/prims}/multiply.cu | 0 {ml-prims/test => cpp/test/prims}/mvg.cu | 0 {ml-prims/test => cpp/test/prims}/norm.cu | 0 .../test => cpp/test/prims}/opg_distance.cu | 0 {ml-prims/test => cpp/test/prims}/penalty.cu | 0 {ml-prims/test => cpp/test/prims}/permute.cu | 0 {ml-prims/test => cpp/test/prims}/power.cu | 0 {ml-prims/test => cpp/test/prims}/reduce.cu | 0 {ml-prims/test => cpp/test/prims}/reduce.h | 0 .../test/prims}/reduce_rows_by_key.cu | 0 {ml-prims/test => cpp/test/prims}/rng.cu | 0 {ml-prims/test => cpp/test/prims}/rng_int.cu | 0 {ml-prims/test => cpp/test/prims}/rsvd.cu | 0 {ml-prims/test => cpp/test/prims}/score.cu | 0 {ml-prims/test => cpp/test/prims}/sigmoid.cu | 0 {ml-prims/test => cpp/test/prims}/sqrt.cu | 0 {ml-prims/test => cpp/test/prims}/stddev.cu | 0 .../test/prims}/strided_reduction.cu | 0 {ml-prims/test => cpp/test/prims}/subtract.cu | 0 {ml-prims/test => cpp/test/prims}/sum.cu | 0 {ml-prims/test => cpp/test/prims}/svd.cu | 0 .../test => cpp/test/prims}/ternary_op.cu | 0 .../test => cpp/test/prims}/test_utils.h | 0 .../test => cpp/test/prims}/transpose.cu | 0 {ml-prims/test => cpp/test/prims}/unary_op.cu | 0 {ml-prims/test => cpp/test/prims}/unary_op.h | 0 .../test/prims}/vector_broadcast.cu | 0 .../test => cpp/test/prims}/weighted_mean.cu | 0 {cuML/test => 
cpp/test/sg}/cd_test.cu | 0 {cuML/test => cpp/test/sg}/dbscan_test.cu | 0 {cuML/test => cpp/test/sg}/handle_test.cu | 0 {cuML/test => cpp/test/sg}/kmeans_test.cu | 0 {cuML/test => cpp/test/sg}/knn_test.cu | 0 {cuML/test => cpp/test/sg}/lkf_test.cu | 0 {cuML/test => cpp/test/sg}/ols.cu | 0 {cuML/test => cpp/test/sg}/pca_test.cu | 0 {cuML/test => cpp/test/sg}/quasi_newton.cu | 0 {cuML/test => cpp/test/sg}/rf_test.cu | 0 {cuML/test => cpp/test/sg}/ridge.cu | 0 {cuML/test => cpp/test/sg}/sgd.cu | 0 {cuML/test => cpp/test/sg}/spectral_test.cu | 0 {cuML/test => cpp/test/sg}/tsvd_test.cu | 0 {cuML/test => cpp/test/sg}/umap_test.cu | 0 297 files changed, 550 insertions(+), 66 deletions(-) rename {cuML => cpp}/.gitignore (100%) rename {cuML => cpp}/CMakeLists.txt (79%) rename {cuML => cpp}/DEVELOPER_GUIDE.md (100%) rename {ml-prims => cpp}/Doxyfile.in (100%) rename {cuML => cpp}/README.md (100%) rename {ml-prims => cpp}/cmake/FindClangFormat.cmake (100%) rename {ml-prims => cpp}/cmake/FindClangTidy.cmake (100%) rename {ml-prims => cpp}/cmake/doxygen.cmake (100%) rename {cuML => cpp}/examples/CMakeLists.txt (100%) rename {cuML => cpp}/examples/dbscan/CMakeLists.txt (100%) rename {cuML => cpp}/examples/dbscan/CMakeLists_standalone.txt (100%) rename {cuML => cpp}/examples/dbscan/README.md (100%) rename {cuML => cpp}/examples/dbscan/dbscan_example.cpp (100%) rename {cuML => cpp}/examples/dbscan/gen_dataset.py (100%) rename {cuML => cpp}/examples/kmeans/CMakeLists.txt (100%) rename {cuML => cpp}/examples/kmeans/CMakeLists_standalone.txt (100%) rename {cuML => cpp}/examples/kmeans/README.md (100%) rename {cuML => cpp}/examples/kmeans/kmeans_example.cpp (100%) rename {cuML => cpp}/examples/kmeans/prepare_input.py (100%) rename {cuML => cpp}/external (100%) rename {ml-prims => cpp}/scripts/plotter_kf.py (100%) rename {ml-prims => cpp}/scripts/run-clang-format.py (100%) rename {cuML => cpp}/src/.gitkeep (100%) rename {cuML => cpp}/src/common/allocatorAdapter.hpp (100%) rename {cuML 
=> cpp}/src/common/cumlHandle.cpp (100%) rename {cuML => cpp}/src/common/cumlHandle.hpp (100%) rename {cuML => cpp}/src/common/cuml_api.cpp (100%) create mode 100644 cpp/src/common/rmmAllocatorAdapter.hpp rename {cuML => cpp}/src/common/tensor.hpp (100%) rename {cuML => cpp}/src/cuML.hpp (100%) rename {cuML => cpp}/src/cuML_api.h (100%) rename {cuML => cpp}/src/dbscan/adjgraph/algo.h (100%) rename {cuML => cpp}/src/dbscan/adjgraph/naive.h (100%) rename {cuML => cpp}/src/dbscan/adjgraph/pack.h (100%) rename {cuML => cpp}/src/dbscan/adjgraph/runner.h (100%) rename {cuML => cpp}/src/dbscan/common.h (100%) rename {cuML => cpp}/src/dbscan/dbscan.cu (100%) rename {cuML => cpp}/src/dbscan/dbscan.h (100%) rename {cuML => cpp}/src/dbscan/dbscan.hpp (100%) rename {cuML => cpp}/src/dbscan/dbscan_api.h (100%) rename {cuML => cpp}/src/dbscan/labelling/algo1.h (100%) rename {cuML => cpp}/src/dbscan/labelling/algo2.h (100%) rename {cuML => cpp}/src/dbscan/labelling/naive.h (100%) rename {cuML => cpp}/src/dbscan/labelling/pack.h (100%) rename {cuML => cpp}/src/dbscan/labelling/runner.h (100%) rename {cuML => cpp}/src/dbscan/runner.h (100%) rename {cuML => cpp}/src/dbscan/vertexdeg/algo.h (100%) rename {cuML => cpp}/src/dbscan/vertexdeg/naive.h (100%) rename {cuML => cpp}/src/dbscan/vertexdeg/pack.h (100%) rename {cuML => cpp}/src/dbscan/vertexdeg/runner.h (100%) rename {cuML => cpp}/src/decisiontree/.gitkeep (100%) rename {cuML => cpp}/src/decisiontree/algo_helper.h (100%) rename {cuML => cpp}/src/decisiontree/decisiontree.cu (100%) rename {cuML => cpp}/src/decisiontree/decisiontree.h (100%) rename {cuML => cpp}/src/decisiontree/kernels/col_condenser.cuh (100%) rename {cuML => cpp}/src/decisiontree/kernels/evaluate.cuh (100%) rename {cuML => cpp}/src/decisiontree/kernels/gini.cuh (100%) rename {cuML => cpp}/src/decisiontree/kernels/gini_def.h (100%) rename {cuML => cpp}/src/decisiontree/kernels/quantile.cuh (100%) rename {cuML => cpp}/src/decisiontree/kernels/split_labels.cuh 
(100%) rename {cuML => cpp}/src/decisiontree/memory.cuh (100%) rename {cuML => cpp}/src/glm/glm.cu (100%) rename {cuML => cpp}/src/glm/glm.hpp (100%) rename {cuML => cpp}/src/glm/glm_api.h (100%) rename {cuML => cpp}/src/glm/glm_spmg.h (100%) rename {cuML => cpp}/src/glm/ols.h (100%) rename {cuML => cpp}/src/glm/preprocess.h (100%) rename {cuML => cpp}/src/glm/qn/glm_base.h (100%) rename {cuML => cpp}/src/glm/qn/glm_linear.h (100%) rename {cuML => cpp}/src/glm/qn/glm_logistic.h (100%) rename {cuML => cpp}/src/glm/qn/glm_regularizer.h (100%) rename {cuML => cpp}/src/glm/qn/glm_softmax.h (100%) rename {cuML => cpp}/src/glm/qn/qn.h (100%) rename {cuML => cpp}/src/glm/qn/qn_linesearch.h (100%) rename {cuML => cpp}/src/glm/qn/qn_solvers.h (100%) rename {cuML => cpp}/src/glm/qn/qn_util.h (100%) rename {cuML => cpp}/src/glm/qn/simple_mat.h (100%) rename {cuML => cpp}/src/glm/ridge.h (100%) rename {cuML => cpp}/src/kalman_filter/KalmanFilter.cuh (100%) rename {cuML => cpp}/src/kalman_filter/kf_variables.h (100%) rename {cuML => cpp}/src/kalman_filter/lkf.h (100%) rename {cuML => cpp}/src/kalman_filter/lkf_py.cu (100%) rename {cuML => cpp}/src/kalman_filter/lkf_py.h (100%) rename {cuML => cpp}/src/kalman_filter/utils.h (100%) rename {cuML => cpp}/src/kmeans/kmeans-inl.cuh (100%) rename {cuML => cpp}/src/kmeans/kmeans.cu (100%) rename {cuML => cpp}/src/kmeans/kmeans.cuh (100%) rename {cuML => cpp}/src/kmeans/kmeans.hpp (100%) create mode 100644 cpp/src/knn/knn.cu rename {cuML => cpp}/src/knn/knn.h (100%) rename {cuML => cpp}/src/metrics/metrics.cu (100%) rename {cuML => cpp}/src/metrics/metrics.hpp (100%) rename {cuML => cpp}/src/ml_cuda_utils.h (100%) rename {cuML => cpp}/src/ml_utils.h (100%) rename {cuML => cpp}/src/pca/pca.cu (100%) rename {cuML => cpp}/src/pca/pca.h (100%) rename {cuML => cpp}/src/pca/pca.hpp (100%) rename {cuML => cpp}/src/randomforest/.gitkeep (100%) rename {cuML => cpp}/src/randomforest/randomforest.cu (100%) rename {cuML => 
cpp}/src/randomforest/randomforest.h (100%) rename {cuML => cpp}/src/solver/cd.h (100%) rename {cuML => cpp}/src/solver/learning_rate.h (100%) rename {cuML => cpp}/src/solver/sgd.h (100%) rename {cuML => cpp}/src/solver/shuffle.h (100%) rename {cuML => cpp}/src/solver/solver.cu (100%) rename {cuML => cpp}/src/solver/solver_c.h (100%) rename {cuML => cpp}/src/spectral/spectral.h (100%) rename {cuML => cpp}/src/tsvd/tsvd.cu (100%) rename {cuML => cpp}/src/tsvd/tsvd.h (100%) rename {cuML => cpp}/src/tsvd/tsvd.hpp (100%) rename {cuML => cpp}/src/tsvd/tsvd_spmg.h (100%) rename {cuML => cpp}/src/umap/fuzzy_simpl_set/naive.h (100%) rename {cuML => cpp}/src/umap/fuzzy_simpl_set/runner.h (100%) rename {cuML => cpp}/src/umap/init_embed/random_algo.h (100%) rename {cuML => cpp}/src/umap/init_embed/runner.h (100%) rename {cuML => cpp}/src/umap/init_embed/spectral_algo.h (100%) rename {cuML => cpp}/src/umap/knn_graph/algo.h (100%) rename {cuML => cpp}/src/umap/knn_graph/runner.h (100%) rename {cuML => cpp}/src/umap/optimize.h (100%) rename {cuML => cpp}/src/umap/runner.h (100%) rename {cuML => cpp}/src/umap/simpl_set_embed/algo.h (100%) rename {cuML => cpp}/src/umap/simpl_set_embed/runner.h (100%) rename {cuML => cpp}/src/umap/supervised.h (100%) rename {cuML => cpp}/src/umap/umap.cu (100%) rename {cuML => cpp}/src/umap/umap.h (100%) rename {cuML => cpp}/src/umap/umapparams.h (100%) rename {ml-prims/src => cpp/src_prims}/common/Timer.h (100%) rename {ml-prims/src => cpp/src_prims}/common/buffer_base.hpp (100%) rename {ml-prims/src => cpp/src_prims}/common/cuml_allocator.hpp (100%) rename {ml-prims/src => cpp/src_prims}/common/device_buffer.hpp (100%) rename {ml-prims/src => cpp/src_prims}/common/grid_sync.h (100%) rename {ml-prims/src => cpp/src_prims}/common/host_buffer.hpp (100%) rename {ml-prims/src => cpp/src_prims}/cuda_utils.h (100%) rename {ml-prims/src => cpp/src_prims}/decoupled_lookback.h (100%) rename {ml-prims/src => cpp/src_prims}/distance/algo1.h (100%) rename 
{ml-prims/src => cpp/src_prims}/distance/cosine.h (100%) rename {ml-prims/src => cpp/src_prims}/distance/distance.h (100%) rename {ml-prims/src => cpp/src_prims}/distance/distance_epilogue.h (100%) rename {ml-prims/src => cpp/src_prims}/distance/distance_epilogue_functor.h (100%) rename {ml-prims/src => cpp/src_prims}/distance/distance_epilogue_traits.h (100%) rename {ml-prims/src => cpp/src_prims}/distance/distance_fragment_multiply_add.h (100%) rename {ml-prims/src => cpp/src_prims}/distance/distance_tile_traits.h (100%) rename {ml-prims/src => cpp/src_prims}/distance/euclidean.h (100%) rename {ml-prims/src => cpp/src_prims}/distance/fragment_sqrt.h (100%) rename {ml-prims/src => cpp/src_prims}/distance/l1.h (100%) rename {ml-prims/src => cpp/src_prims}/distance/linear_scaling_sqrt.h (100%) rename {ml-prims/src => cpp/src_prims}/functions/hinge.h (100%) rename {ml-prims/src => cpp/src_prims}/functions/linearReg.h (100%) rename {ml-prims/src => cpp/src_prims}/functions/log.h (100%) rename {ml-prims/src => cpp/src_prims}/functions/logisticReg.h (100%) rename {ml-prims/src => cpp/src_prims}/functions/penalty.h (100%) rename {ml-prims/src => cpp/src_prims}/functions/sigmoid.h (100%) rename {ml-prims/src => cpp/src_prims}/functions/sign.h (100%) rename {ml-prims/src => cpp/src_prims}/functions/softThres.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/add.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/binary_op.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/coalesced_reduction.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/cublas_wrappers.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/cusolver_wrappers.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/custom_accum.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/cutlass_wrappers.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/divide.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/eig.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/eltwise.h (100%) 
rename {ml-prims/src => cpp/src_prims}/linalg/eltwise2d.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/gemm.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/gemv.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/lstsq.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/map_then_reduce.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/matrix_vector_op.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/mean_squared_error.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/multiply.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/norm.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/power.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/qr.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/reduce.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/reduce_rows_by_key.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/row_gemm.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/rsvd.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/sqrt.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/strided_reduction.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/subtract.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/svd.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/ternary_op.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/transpose.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/unary_op.h (100%) rename {ml-prims/src => cpp/src_prims}/linalg/vector_broadcast.h (100%) rename {ml-prims/src => cpp/src_prims}/matrix/gather.h (100%) rename {ml-prims/src => cpp/src_prims}/matrix/math.h (100%) rename {ml-prims/src => cpp/src_prims}/matrix/matrix.h (100%) rename {ml-prims/src => cpp/src_prims}/random/curand_wrappers.h (100%) rename {ml-prims/src => cpp/src_prims}/random/mvg.h (100%) rename {ml-prims/src => cpp/src_prims}/random/permute.h (100%) rename {ml-prims/src => cpp/src_prims}/random/rng.h (100%) rename {ml-prims/src => cpp/src_prims}/random/rng_impl.h (100%) rename 
{ml-prims/src => cpp/src_prims}/score/scores.h (100%) rename {ml-prims/src => cpp/src_prims}/selection/columnWiseSort.h (100%) rename {ml-prims/src => cpp/src_prims}/selection/kselection.h (100%) rename {ml-prims/src => cpp/src_prims}/sparse/coo.h (100%) rename {ml-prims/src => cpp/src_prims}/sparse/csr.h (100%) rename {ml-prims/src => cpp/src_prims}/sparse/cusparse_wrappers.h (100%) rename {ml-prims/src => cpp/src_prims}/sparse/nvgraph_wrappers.h (100%) rename {ml-prims/src => cpp/src_prims}/stats/cov.h (100%) rename {ml-prims/src => cpp/src_prims}/stats/mean.h (100%) rename {ml-prims/src => cpp/src_prims}/stats/mean_center.h (100%) rename {ml-prims/src => cpp/src_prims}/stats/minmax.h (100%) rename {ml-prims/src => cpp/src_prims}/stats/stddev.h (100%) rename {ml-prims/src => cpp/src_prims}/stats/sum.h (100%) rename {ml-prims/src => cpp/src_prims}/stats/weighted_mean.h (100%) rename {ml-prims/src => cpp/src_prims}/utils.h (100%) rename {ml-prims/src => cpp/src_prims}/vectorized.h (100%) rename {cuML => cpp}/test/.gitkeep (100%) rename cuML/test_mg/knn_test.cu => cpp/test/mg/knn_test_mg.cu (100%) rename {ml-prims/test => cpp/test/prims}/CMakeLists.txt (100%) rename {ml-prims/test => cpp/test/prims}/add.cu (100%) rename {ml-prims/test => cpp/test/prims}/add.h (100%) rename {ml-prims/test => cpp/test/prims}/add_and_sub_dev_scalar.cu (100%) rename {ml-prims/test => cpp/test/prims}/binary_op.cu (100%) rename {ml-prims/test => cpp/test/prims}/binary_op.h (100%) rename {ml-prims/test => cpp/test/prims}/coalesced_reduction.cu (100%) rename {ml-prims/test => cpp/test/prims}/columnSort.cu (100%) rename {ml-prims/test => cpp/test/prims}/coo.cu (100%) rename {ml-prims/test => cpp/test/prims}/coo.h (100%) rename {ml-prims/test => cpp/test/prims}/cov.cu (100%) rename {ml-prims/test => cpp/test/prims}/csr.cu (100%) rename {ml-prims/test => cpp/test/prims}/csr.h (100%) rename {ml-prims/test => cpp/test/prims}/cuda_utils.cu (100%) rename {ml-prims/test => 
cpp/test/prims}/decoupled_lookback.cu (100%) rename {ml-prims/test => cpp/test/prims}/dist_adj.cu (100%) rename {ml-prims/test => cpp/test/prims}/dist_cos.cu (100%) rename {ml-prims/test => cpp/test/prims}/dist_euc_exp.cu (100%) rename {ml-prims/test => cpp/test/prims}/dist_euc_unexp.cu (100%) rename {ml-prims/test => cpp/test/prims}/dist_l1.cu (100%) rename {ml-prims/test => cpp/test/prims}/distance_base.h (100%) rename {ml-prims/test => cpp/test/prims}/divide.cu (100%) rename {ml-prims/test => cpp/test/prims}/eig.cu (100%) rename {ml-prims/test => cpp/test/prims}/eltwise.cu (100%) rename {ml-prims/test => cpp/test/prims}/eltwise2d.cu (100%) rename {ml-prims/test => cpp/test/prims}/gather.cu (100%) rename {ml-prims/test => cpp/test/prims}/gemm.cu (100%) rename {ml-prims/test => cpp/test/prims}/grid_sync.cu (100%) rename {ml-prims/test => cpp/test/prims}/hinge.cu (100%) rename {ml-prims/test => cpp/test/prims}/kselection.cu (100%) rename {ml-prims/test => cpp/test/prims}/linearReg.cu (100%) rename {ml-prims/test => cpp/test/prims}/log.cu (100%) rename {ml-prims/test => cpp/test/prims}/logisticReg.cu (100%) rename {ml-prims/test => cpp/test/prims}/map_then_reduce.cu (100%) rename {ml-prims/test => cpp/test/prims}/math.cu (100%) rename {ml-prims/test => cpp/test/prims}/matrix.cu (100%) rename {ml-prims/test => cpp/test/prims}/matrix_vector_op.cu (100%) rename {ml-prims/test => cpp/test/prims}/matrix_vector_op.h (100%) rename {ml-prims/test => cpp/test/prims}/mean.cu (100%) rename {ml-prims/test => cpp/test/prims}/mean_center.cu (100%) rename {ml-prims/test => cpp/test/prims}/minmax.cu (100%) rename {ml-prims/test => cpp/test/prims}/multiply.cu (100%) rename {ml-prims/test => cpp/test/prims}/mvg.cu (100%) rename {ml-prims/test => cpp/test/prims}/norm.cu (100%) rename {ml-prims/test => cpp/test/prims}/opg_distance.cu (100%) rename {ml-prims/test => cpp/test/prims}/penalty.cu (100%) rename {ml-prims/test => cpp/test/prims}/permute.cu (100%) rename {ml-prims/test => 
cpp/test/prims}/power.cu (100%) rename {ml-prims/test => cpp/test/prims}/reduce.cu (100%) rename {ml-prims/test => cpp/test/prims}/reduce.h (100%) rename {ml-prims/test => cpp/test/prims}/reduce_rows_by_key.cu (100%) rename {ml-prims/test => cpp/test/prims}/rng.cu (100%) rename {ml-prims/test => cpp/test/prims}/rng_int.cu (100%) rename {ml-prims/test => cpp/test/prims}/rsvd.cu (100%) rename {ml-prims/test => cpp/test/prims}/score.cu (100%) rename {ml-prims/test => cpp/test/prims}/sigmoid.cu (100%) rename {ml-prims/test => cpp/test/prims}/sqrt.cu (100%) rename {ml-prims/test => cpp/test/prims}/stddev.cu (100%) rename {ml-prims/test => cpp/test/prims}/strided_reduction.cu (100%) rename {ml-prims/test => cpp/test/prims}/subtract.cu (100%) rename {ml-prims/test => cpp/test/prims}/sum.cu (100%) rename {ml-prims/test => cpp/test/prims}/svd.cu (100%) rename {ml-prims/test => cpp/test/prims}/ternary_op.cu (100%) rename {ml-prims/test => cpp/test/prims}/test_utils.h (100%) rename {ml-prims/test => cpp/test/prims}/transpose.cu (100%) rename {ml-prims/test => cpp/test/prims}/unary_op.cu (100%) rename {ml-prims/test => cpp/test/prims}/unary_op.h (100%) rename {ml-prims/test => cpp/test/prims}/vector_broadcast.cu (100%) rename {ml-prims/test => cpp/test/prims}/weighted_mean.cu (100%) rename {cuML/test => cpp/test/sg}/cd_test.cu (100%) rename {cuML/test => cpp/test/sg}/dbscan_test.cu (100%) rename {cuML/test => cpp/test/sg}/handle_test.cu (100%) rename {cuML/test => cpp/test/sg}/kmeans_test.cu (100%) rename {cuML/test => cpp/test/sg}/knn_test.cu (100%) rename {cuML/test => cpp/test/sg}/lkf_test.cu (100%) rename {cuML/test => cpp/test/sg}/ols.cu (100%) rename {cuML/test => cpp/test/sg}/pca_test.cu (100%) rename {cuML/test => cpp/test/sg}/quasi_newton.cu (100%) rename {cuML/test => cpp/test/sg}/rf_test.cu (100%) rename {cuML/test => cpp/test/sg}/ridge.cu (100%) rename {cuML/test => cpp/test/sg}/sgd.cu (100%) rename {cuML/test => cpp/test/sg}/spectral_test.cu (100%) rename 
{cuML/test => cpp/test/sg}/tsvd_test.cu (100%) rename {cuML/test => cpp/test/sg}/umap_test.cu (100%) diff --git a/cuML/.gitignore b/cpp/.gitignore similarity index 100% rename from cuML/.gitignore rename to cpp/.gitignore diff --git a/cuML/CMakeLists.txt b/cpp/CMakeLists.txt similarity index 79% rename from cuML/CMakeLists.txt rename to cpp/CMakeLists.txt index a75bf92068..62c3456932 100644 --- a/cuML/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,4 +1,4 @@ -# +#============================================================================= # Copyright (c) 2018-2019, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,32 +12,65 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# +#============================================================================= set (CMAKE_FIND_NO_INSTALL_PREFIX TRUE FORCE) -cmake_minimum_required(VERSION 3.12 FATAL_ERROR) -project(cuML VERSION 0.7.0 LANGUAGES CXX CUDA) +cmake_minimum_required(VERSION 3.13 FATAL_ERROR) +project(CUML VERSION 0.8.0 LANGUAGES CXX CUDA) -set(CMAKE_CXX_STANDARD 11) -set(CMAKE_CXX_STANDARD_REQUIRED ON) +################################################################################################### +# - build type ------------------------------------------------------------------------------------ + +# Set a default build type if none was specified +set(DEFAULT_BUILD_TYPE "Release") + +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' since none specified.") + set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE + STRING "Choose the type of build." 
FORCE) + # Set the possible values of build type for cmake-gui + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS + "Debug" "Release") +endif() ################################################################################################### -# - Requirements ---------------------------------------------------------------------------------- +# - User Options --------------------------------------------------------------------------------- -if(NOT DEFINED BLAS_LIBRARIES) - find_package( BLAS REQUIRED ) -else() - message(STATUS "Manually setting BLAS to ${BLAS_LIBRARIES}") -endif() +set(CMAKE_IGNORE_PATH "${CMAKE_INSTALL_DIR}/lib" CACHE STRING + "Ignore any libs added implicitly from the CMAKE_INSTALL_DIR") -if(NOT DEFINED LAPACK_LIBRARIES) - find_package( LAPACK REQUIRED ) -else() - message(STATUS "Manually setting LAPACK to ${LAPACK_LIBRARIES}") +option(LINEINFO "Enable lineinfo in nvcc" OFF) + +option(KERNELINFO "Enable kernel resource usage info" OFF) + +option(DEBUG "Get a debug build" OFF) + +option(BUILD_CUML_TESTS "Build cuML algorithm tests" ON) + +option(BUILD_PRIM_TESTS "Build ml-prim tests" OFF) + +set(BLAS_LIBRARIES "" CACHE STRING + "Location of BLAS library") + +set(GPU_ARCHS "" CACHE STRING + "List of GPU architectures (semicolon-separated) to be compiled for") + +if(NOT "${GPU_ARCHS}") + set(GPU_ARCHS "60") + if((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9)) + set(GPU_ARCHS "${GPU_ARCHS};70") + endif() + if((CUDA_VERSION_MAJOR EQUAL 10) OR (CUDA_VERSION_MAJOR GREATER 10)) + set(GPU_ARCHS "${GPU_ARCHS};75") + endif() endif() +################################################################################################### +# - Requirements ---------------------------------------------------------------------------------- + find_package(CUDA 9.0 REQUIRED) + if (NOT DISABLE_OPENMP OR NOT ${DISABLE_OPENMP}) find_package(OpenMP) endif(NOT DISABLE_OPENMP OR NOT ${DISABLE_OPENMP}) @@ -49,16 +82,33 @@ else() message(FATAL_ERROR "ZLib 
not found, please check your settings.") endif(ZLIB_FOUND) +# set(CMAKE_THREAD_PREFER_PTHREAD TRUE) +# find_package (Threads REQUIRED) +# if(NOT CMAKE_USE_PTHREADS_INIT) +# message(FATAL_ERROR "pthreads not found, please check your settings") + +if(NOT DEFINED BLAS_LIBRARIES) + find_package( BLAS REQUIRED ) +else() + message(STATUS "Manually setting BLAS to ${BLAS_LIBRARIES}") +endif() + +# if(NOT DEFINED LAPACK_LIBRARIES) +# find_package( LAPACK REQUIRED ) +# else() +# message(STATUS "Manually setting LAPACK to ${LAPACK_LIBRARIES}") +# endif() ################################################################################################### # - Submodules ------------------------------------------------------------------------------------ + set(GTEST_DIR ${PROJECT_SOURCE_DIR}/external/ml-prims/external/googletest/googletest CACHE STRING "Path to the googletest repo") set(GTEST_LIBNAME "gtest_main" CACHE STRING "Name of the googletest library") set(FAISS_DIR ${PROJECT_SOURCE_DIR}/external/faiss CACHE STRING "Path to FAISS source directory") -set(MLPRIMS_DIR ${PROJECT_SOURCE_DIR}/external/ml-prims CACHE STRING +set(MLPRIMS_DIR ${PROJECT_SOURCE_DIR}/src_prims/ CACHE STRING "Path to the ml-prims repo") set(CUB_DIR ${PROJECT_SOURCE_DIR}/external/ml-prims/external/cub CACHE STRING "Path to cub repo") @@ -67,61 +117,62 @@ set(CUTLASS_DIR ${PROJECT_SOURCE_DIR}/external/ml-prims/external/cutlass CACHE S set(CUDA_nvgraph_LIBRARY ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvgraph.so CACHE STRING "Path to nvGraph lib") -################################################################################################### -# - User Options --------------------------------------------------------------------------------- - -set(CMAKE_IGNORE_PATH "${CMAKE_INSTALL_DIR}/lib" CACHE STRING - "Ignore any libs added implicitly from the CMAKE_INSTALL_DIR") -set(GPU_ARCHS "" CACHE STRING - "List of GPU architectures (semicolon-separated) to be compiled for") -option(LINEINFO "Enable lineinfo in 
nvcc" OFF) -option(KERNELINFO "Enable kernel resource usage info" OFF) -option(DEBUG "Get a debug build" OFF) ################################################################################################### # - Compiler Options ----------------------------------------------------------------------------- -## nvcc options +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) if(OPENMP_FOUND) + message(STATUS "Building with OpenMP support") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler ${OpenMP_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") endif(OPENMP_FOUND) -if(CMAKE_CXX_STANDARD STREQUAL "11") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++11") -endif() + +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++11") + if(LINEINFO) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") endif() + if(KERNELINFO) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas=-v") endif() + if(DEBUG) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G -g") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") endif() -# Generate optimized binary for every known arch -if(NOT "${GPU_ARCHS}") - set(GPU_ARCHS "60;61") - # NOTE: NOTE: Add more 'if's for every new arch release! - if((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9)) - set(GPU_ARCHS "${GPU_ARCHS};70") - endif() - if((CUDA_VERSION_MAJOR EQUAL 10) OR (CUDA_VERSION_MAJOR GREATER 10)) - set(GPU_ARCHS "${GPU_ARCHS};75") - endif() -endif() + foreach(arch ${GPU_ARCHS}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${arch},code=sm_${arch}") endforeach() -# Generate PTX (to be JIT'd at runtime) for the latest architecture -# It is assumed that the last arch in the 'archs' is the latest! 
+ list(GET GPU_ARCHS -1 ptx) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${ptx},code=compute_${ptx}") +if(CMAKE_COMPILER_IS_GNUCXX) + if(NOT CMAKE_CXX11_ABI) + message(STATUS "Disabling the GLIBCXX11 ABI") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -D_GLIBCXX_USE_CXX11_ABI=0") + elseif(CMAKE_CXX11_ABI) + message(STATUS "Enabling the GLIBCXX11 ABI") + endif(NOT CMAKE_CXX11_ABI) +endif(CMAKE_COMPILER_IS_GNUCXX) + + +## end of other compiler options + +################################################################################################### +# - FAISS Build ---------------------------------------------------------------------------------- + # Configuration of faiss for the correct architectures +# TODO: Update faiss submodule and use new flags file(READ ${FAISS_DIR}/makefile.inc.in CONFIG_FILE) string(REPLACE "-Xcudafe --diag_suppress=unrecognized_attribute" "--disable-warnings" @@ -147,27 +198,6 @@ endif() file(WRITE ${FAISS_DIR}/makefile.inc.in "${CONFIG_FILE}") -## end nvcc options - -## other compiler options - -option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON) - -if(NOT CMAKE_CXX11_ABI) - message(STATUS "Disabling the GLIBCXX11 ABI") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -D_GLIBCXX_USE_CXX11_ABI=0") -elseif(CMAKE_CXX11_ABI) - message(STATUS "Enabling the GLIBCXX11 ABI") -endif(NOT CMAKE_CXX11_ABI) - - -## end of other compiler options - -################################################################################################### -# - include paths --------------------------------------------------------------------------------- - include (ExternalProject) ExternalProject_Add(faiss @@ -182,6 +212,9 @@ 
ExternalProject_Add(faiss ExternalProject_Get_Property(faiss install_dir) +################################################################################################### +# - include paths --------------------------------------------------------------------------------- + add_library(faisslib STATIC IMPORTED) add_library(gpufaisslib STATIC IMPORTED) @@ -210,6 +243,7 @@ file(GLOB_RECURSE cuml_mg_test_cuda_sources "test_mg/*.cu") ################################################################################################### # - build libcuml++ shared library ------------------------------------------------------------------ + set(CUML_CPP_TARGET "cuml++") add_library(${CUML_CPP_TARGET} SHARED src/pca/pca.cu diff --git a/cuML/DEVELOPER_GUIDE.md b/cpp/DEVELOPER_GUIDE.md similarity index 100% rename from cuML/DEVELOPER_GUIDE.md rename to cpp/DEVELOPER_GUIDE.md diff --git a/ml-prims/Doxyfile.in b/cpp/Doxyfile.in similarity index 100% rename from ml-prims/Doxyfile.in rename to cpp/Doxyfile.in diff --git a/cuML/README.md b/cpp/README.md similarity index 100% rename from cuML/README.md rename to cpp/README.md diff --git a/ml-prims/cmake/FindClangFormat.cmake b/cpp/cmake/FindClangFormat.cmake similarity index 100% rename from ml-prims/cmake/FindClangFormat.cmake rename to cpp/cmake/FindClangFormat.cmake diff --git a/ml-prims/cmake/FindClangTidy.cmake b/cpp/cmake/FindClangTidy.cmake similarity index 100% rename from ml-prims/cmake/FindClangTidy.cmake rename to cpp/cmake/FindClangTidy.cmake diff --git a/ml-prims/cmake/doxygen.cmake b/cpp/cmake/doxygen.cmake similarity index 100% rename from ml-prims/cmake/doxygen.cmake rename to cpp/cmake/doxygen.cmake diff --git a/cuML/examples/CMakeLists.txt b/cpp/examples/CMakeLists.txt similarity index 100% rename from cuML/examples/CMakeLists.txt rename to cpp/examples/CMakeLists.txt diff --git a/cuML/examples/dbscan/CMakeLists.txt b/cpp/examples/dbscan/CMakeLists.txt similarity index 100% rename from 
cuML/examples/dbscan/CMakeLists.txt rename to cpp/examples/dbscan/CMakeLists.txt diff --git a/cuML/examples/dbscan/CMakeLists_standalone.txt b/cpp/examples/dbscan/CMakeLists_standalone.txt similarity index 100% rename from cuML/examples/dbscan/CMakeLists_standalone.txt rename to cpp/examples/dbscan/CMakeLists_standalone.txt diff --git a/cuML/examples/dbscan/README.md b/cpp/examples/dbscan/README.md similarity index 100% rename from cuML/examples/dbscan/README.md rename to cpp/examples/dbscan/README.md diff --git a/cuML/examples/dbscan/dbscan_example.cpp b/cpp/examples/dbscan/dbscan_example.cpp similarity index 100% rename from cuML/examples/dbscan/dbscan_example.cpp rename to cpp/examples/dbscan/dbscan_example.cpp diff --git a/cuML/examples/dbscan/gen_dataset.py b/cpp/examples/dbscan/gen_dataset.py similarity index 100% rename from cuML/examples/dbscan/gen_dataset.py rename to cpp/examples/dbscan/gen_dataset.py diff --git a/cuML/examples/kmeans/CMakeLists.txt b/cpp/examples/kmeans/CMakeLists.txt similarity index 100% rename from cuML/examples/kmeans/CMakeLists.txt rename to cpp/examples/kmeans/CMakeLists.txt diff --git a/cuML/examples/kmeans/CMakeLists_standalone.txt b/cpp/examples/kmeans/CMakeLists_standalone.txt similarity index 100% rename from cuML/examples/kmeans/CMakeLists_standalone.txt rename to cpp/examples/kmeans/CMakeLists_standalone.txt diff --git a/cuML/examples/kmeans/README.md b/cpp/examples/kmeans/README.md similarity index 100% rename from cuML/examples/kmeans/README.md rename to cpp/examples/kmeans/README.md diff --git a/cuML/examples/kmeans/kmeans_example.cpp b/cpp/examples/kmeans/kmeans_example.cpp similarity index 100% rename from cuML/examples/kmeans/kmeans_example.cpp rename to cpp/examples/kmeans/kmeans_example.cpp diff --git a/cuML/examples/kmeans/prepare_input.py b/cpp/examples/kmeans/prepare_input.py similarity index 100% rename from cuML/examples/kmeans/prepare_input.py rename to cpp/examples/kmeans/prepare_input.py diff --git 
a/cuML/external b/cpp/external similarity index 100% rename from cuML/external rename to cpp/external diff --git a/ml-prims/scripts/plotter_kf.py b/cpp/scripts/plotter_kf.py similarity index 100% rename from ml-prims/scripts/plotter_kf.py rename to cpp/scripts/plotter_kf.py diff --git a/ml-prims/scripts/run-clang-format.py b/cpp/scripts/run-clang-format.py similarity index 100% rename from ml-prims/scripts/run-clang-format.py rename to cpp/scripts/run-clang-format.py diff --git a/cuML/src/.gitkeep b/cpp/src/.gitkeep similarity index 100% rename from cuML/src/.gitkeep rename to cpp/src/.gitkeep diff --git a/cuML/src/common/allocatorAdapter.hpp b/cpp/src/common/allocatorAdapter.hpp similarity index 100% rename from cuML/src/common/allocatorAdapter.hpp rename to cpp/src/common/allocatorAdapter.hpp diff --git a/cuML/src/common/cumlHandle.cpp b/cpp/src/common/cumlHandle.cpp similarity index 100% rename from cuML/src/common/cumlHandle.cpp rename to cpp/src/common/cumlHandle.cpp diff --git a/cuML/src/common/cumlHandle.hpp b/cpp/src/common/cumlHandle.hpp similarity index 100% rename from cuML/src/common/cumlHandle.hpp rename to cpp/src/common/cumlHandle.hpp diff --git a/cuML/src/common/cuml_api.cpp b/cpp/src/common/cuml_api.cpp similarity index 100% rename from cuML/src/common/cuml_api.cpp rename to cpp/src/common/cuml_api.cpp diff --git a/cpp/src/common/rmmAllocatorAdapter.hpp b/cpp/src/common/rmmAllocatorAdapter.hpp new file mode 100644 index 0000000000..54c67b4394 --- /dev/null +++ b/cpp/src/common/rmmAllocatorAdapter.hpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "../../../ml-prims/src/utils.h" + +#include "../cuML.hpp" + +namespace ML { + +/** + * @brief Implemententation of ML::deviceAllocator using the RAPIDS Memory Manager (RMM) for allocations. + * + * rmmAllocatorAdapter does not initialize RMM. If RMM is not initialized on construction of rmmAllocatorAdapter + * allocations fall back to cudaMalloc. + */ +class rmmAllocatorAdapter : public ML::deviceAllocator { +public: + rmmAllocatorAdapter() + : _rmmInitialized( rmmIsInitialized( NULL ) ) + { + //@todo: Log warning if RMM is not initialized. Blocked by https://github.com/rapidsai/cuml/issues/229 + } + + /** + * @brief asynchronosly allocate n bytes that can be used after all work in stream sheduled prior to this call + * has completetd. + * + * @param[in] n size of the allocation in bytes + * @param[in] stream the stream to use for the asynchronous allocations + * @returns a pointer to n byte of device memory + */ + virtual void* allocate( std::size_t n, cudaStream_t stream ) + { + void* ptr = 0; + if (!_rmmInitialized) + { + CUDA_CHECK( cudaMalloc( &ptr, n ) ); + } + else + { + rmmError_t rmmStatus = RMM_ALLOC(&ptr, n, stream); + if ( RMM_SUCCESS != rmmStatus || 0 == ptr ) + { + std::ostringstream msg; + msg<<"RMM allocation of "< +#include +#include +#include +#include +#include + +#include +#include +#include + + +namespace ML { + + + /** + * Build a kNN object for training and querying a k-nearest neighbors model. 
+ * @param D number of features in each vector + */ + kNN::kNN(int D, bool verbose): D(D), total_n(0), indices(0), verbose(verbose), owner(false){} + kNN::~kNN() { + + try { + if(this->owner) { + if(this->verbose) + std::cout << "Freeing kNN memory" << std::endl; + for(kNNParams p : knn_params) { CUDA_CHECK(cudaFree(p.ptr)); } + } + + } catch(const std::exception &e) { + std::cout << "An exception occurred releasing kNN memory: " << e.what() << std::endl; + } + } + + void kNN::reset() { + if(knn_params.size() > 0) { + knn_params.clear(); + this->id_ranges.clear(); + this->indices = 0; + this->total_n = 0; + } + } + + bool kNN::verify_size(size_t size, int device) { + size_t free, total; + cudaMemGetInfo(&free, &total); + + if(size > free) { + std::cout << "Not enough free memory on device " + << device + << " to run kneighbors. " + << "needed=" + << size + << ", free=" << free << std::endl; + return false; + } + + return true; + } + + /** + * Fit a kNN model by creating separate indices for multiple given + * instances of kNNParams. + * @param input an array of pointers to data on (possibly different) devices + * @param N number of items in input array. 
+ */ + void kNN::fit(kNNParams *input, int N) { + + + if(this->owner) { + for(kNNParams p : knn_params) { CUDA_CHECK(cudaFree(p.ptr)); } + } + + if(this->verbose) + std::cout << "N=" << N << std::endl; + + reset(); + + for(int i = 0; i < N; i++) { + + kNNParams params = input[i]; + this->indices++; + this->knn_params.emplace_back(params); + if(i < params.N) { + id_ranges.push_back(total_n); + } + + this->total_n += params.N; + } + } + + /** + * Search the kNN for the k-nearest neighbors of a set of query vectors + * @param search_items set of vectors to query for neighbors + * @param n number of items in search_items + * @param res_I pointer to device memory for returning k nearest indices + * @param res_D pointer to device memory for returning k nearest distances + * @param k number of neighbors to query + */ + void kNN::search(const float *search_items, int n, + long *res_I, float *res_D, int k) { + + float *result_D = new float[k*size_t(n)]; + long*result_I = new long[k*size_t(n)]; + + float *all_D = new float[indices*k*size_t(n)]; + long *all_I = new long[indices*k*size_t(n)]; + + cudaPointerAttributes s_att; + cudaError_t s_err = cudaPointerGetAttributes(&s_att, search_items); + + if(s_err != 0 || s_att.device == -1) + std::cout << "Invalid device pointer encountered in knn search: " << search_items << std::endl; + + s_err = cudaPointerGetAttributes(&s_att, res_I); + + if(s_err != 0 || s_att.device == -1) + std::cout << "Invalid index results pointer encountered in knn search: " << search_items << std::endl; + + s_err = cudaPointerGetAttributes(&s_att, res_D); + + if(s_err != 0 || s_att.device == -1) + std::cout << "Invalid distance results pointer encountered in knn search: " << search_items << std::endl; + + + /** + * Initial verification of memory + */ + for(int i = 0; i < indices; i++) { + kNNParams params = knn_params[i]; + + cudaPointerAttributes att; + cudaError_t err = cudaPointerGetAttributes(&att, params.ptr); + + if(err == 0 && att.device > -1) { + 
CUDA_CHECK(cudaSetDevice(att.device)); + + if(!verify_size(size_t(params.N)*size_t(this->D)*4l, att.device)) + return; + } + } + + + #pragma omp parallel + { + #pragma omp for + for(int i = 0; i < indices; i++) { + + kNNParams params = knn_params[i]; + + cudaPointerAttributes att; + cudaError_t err = cudaPointerGetAttributes(&att, params.ptr); + + if(err == 0 && att.device > -1) { + CUDA_CHECK(cudaSetDevice(att.device)); + + try { + faiss::gpu::StandardGpuResources gpu_res; + gpu_res.noTempMemory(); + gpu_res.setCudaMallocWarning(false); + gpu_res.setDefaultNullStreamAllDevices(); + + bruteForceKnn(&gpu_res, + faiss::METRIC_L2, + params.ptr, + params.N, + search_items, + n, + this->D, + k, + all_D+(long(i)*k*long(n)), + all_I+(long(i)*k*long(n))); + + CUDA_CHECK(cudaPeekAtLastError()); + + } catch(const std::exception &e) { + std::cout << "Exception occurred: " << e.what() << std::endl; + } + + + } else { + std::stringstream ss; + ss << "Input memory for " << ¶ms << " failed. isDevice?=" << att.devicePointer << ", N=" << params.N; + std::cout << "Exception: " << ss.str() << std::endl; + } + } + } + + merge_tables>(long(n), k, indices, + result_D, result_I, all_D, all_I, id_ranges.data()); + + MLCommon::updateDevice(res_D, result_D, k*size_t(n), 0); + MLCommon::updateDevice(res_I, result_I, k*size_t(n), 0); + + delete all_D; + delete all_I; + + delete result_D; + delete result_I; + } + + /** + * Chunk a host array up into one or many GPUs (determined by the provided + * list of gpu ids) and fit a knn model. 
+ * + * @param ptr an array in host memory to chunk over devices + * @param n number of elements in ptr + * @param devices array of device ids for chunking the ptr + * @param n_chunks number of elements in gpus + * @param out host pointer (size n) to store output + */ + void kNN::fit_from_host(float *ptr, int n, int* devices, int n_chunks) { + + if(this->owner) { + for(kNNParams p : knn_params) { CUDA_CHECK(cudaFree(p.ptr)); } + } + + reset(); + + size_t chunk_size = MLCommon::ceildiv((size_t)n, (size_t)n_chunks); + kNNParams params[n_chunks]; + + this->owner = true; + + /** + * Initial verification of memory + */ + for(int i = 0; i < n_chunks; i++) { + + int device = devices[i]; + size_t length = chunk_size; + if(length * i >= n) + length = (chunk_size*i)-size_t(n); + CUDA_CHECK(cudaSetDevice(device)); + if(!verify_size(size_t(length)*size_t(D), device)) + return; + } + + #pragma omp parallel for + for(int i = 0; i < n_chunks; i++) { + + int device = devices[i]; + CUDA_CHECK(cudaSetDevice(device)); + + size_t length = chunk_size; + if(length * i >= n) + length = (size_t(chunk_size)*i)-size_t(n); + + float *ptr_d; + MLCommon::allocate(ptr_d, size_t(length)*size_t(D)); + MLCommon::updateDevice(ptr_d, ptr+(size_t(chunk_size)*i), size_t(length)*size_t(D), 0); + + kNNParams p; + p.N = length; + p.ptr = ptr_d; + + params[i] = p; + } + + fit(params, n_chunks); + } + + /** Merge results from several shards into a single result set. 
+ * @param all_distances size nshard * n * k + * @param all_labels idem + * @param translartions label translations to apply, size nshard + */ + template + void kNN::merge_tables (long n, long k, long nshard, + float *distances, long *labels, + float *all_distances, + long *all_labels, + long *translations) { + if(k == 0) { + return; + } + + size_t stride = n * k; + #pragma omp parallel + { + std::vector buf (2 * nshard); + int * pointer = buf.data(); + int * shard_ids = pointer + nshard; + std::vector buf2 (nshard); + float * heap_vals = buf2.data(); + #pragma omp for + for (long i = 0; i < n; i++) { + // the heap maps values to the shard where they are + // produced. + const float *D_in = all_distances + i * k; + const long *I_in = all_labels + i * k; + int heap_size = 0; + + for (long s = 0; s < nshard; s++) { + pointer[s] = 0; + if (I_in[stride * s] >= 0) + heap_push (++heap_size, heap_vals, shard_ids, + D_in[stride * s], s); + } + + float *D = distances + i * k; + long *I = labels + i * k; + + for (int j = 0; j < k; j++) { + if (heap_size == 0) { + I[j] = -1; + D[j] = C::neutral(); + } else { + // pop best element + int s = shard_ids[0]; + int & p = pointer[s]; + D[j] = heap_vals[0]; + I[j] = I_in[stride * s + p] + translations[s]; + + heap_pop (heap_size--, heap_vals, shard_ids); + p++; + if (p < k && I_in[stride * s + p] >= 0) + heap_push (++heap_size, heap_vals, shard_ids, + D_in[stride * s + p], s); + } + } + } + } + }; + +}; + + +// end namespace ML diff --git a/cuML/src/knn/knn.h b/cpp/src/knn/knn.h similarity index 100% rename from cuML/src/knn/knn.h rename to cpp/src/knn/knn.h diff --git a/cuML/src/metrics/metrics.cu b/cpp/src/metrics/metrics.cu similarity index 100% rename from cuML/src/metrics/metrics.cu rename to cpp/src/metrics/metrics.cu diff --git a/cuML/src/metrics/metrics.hpp b/cpp/src/metrics/metrics.hpp similarity index 100% rename from cuML/src/metrics/metrics.hpp rename to cpp/src/metrics/metrics.hpp diff --git a/cuML/src/ml_cuda_utils.h 
b/cpp/src/ml_cuda_utils.h similarity index 100% rename from cuML/src/ml_cuda_utils.h rename to cpp/src/ml_cuda_utils.h diff --git a/cuML/src/ml_utils.h b/cpp/src/ml_utils.h similarity index 100% rename from cuML/src/ml_utils.h rename to cpp/src/ml_utils.h diff --git a/cuML/src/pca/pca.cu b/cpp/src/pca/pca.cu similarity index 100% rename from cuML/src/pca/pca.cu rename to cpp/src/pca/pca.cu diff --git a/cuML/src/pca/pca.h b/cpp/src/pca/pca.h similarity index 100% rename from cuML/src/pca/pca.h rename to cpp/src/pca/pca.h diff --git a/cuML/src/pca/pca.hpp b/cpp/src/pca/pca.hpp similarity index 100% rename from cuML/src/pca/pca.hpp rename to cpp/src/pca/pca.hpp diff --git a/cuML/src/randomforest/.gitkeep b/cpp/src/randomforest/.gitkeep similarity index 100% rename from cuML/src/randomforest/.gitkeep rename to cpp/src/randomforest/.gitkeep diff --git a/cuML/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu similarity index 100% rename from cuML/src/randomforest/randomforest.cu rename to cpp/src/randomforest/randomforest.cu diff --git a/cuML/src/randomforest/randomforest.h b/cpp/src/randomforest/randomforest.h similarity index 100% rename from cuML/src/randomforest/randomforest.h rename to cpp/src/randomforest/randomforest.h diff --git a/cuML/src/solver/cd.h b/cpp/src/solver/cd.h similarity index 100% rename from cuML/src/solver/cd.h rename to cpp/src/solver/cd.h diff --git a/cuML/src/solver/learning_rate.h b/cpp/src/solver/learning_rate.h similarity index 100% rename from cuML/src/solver/learning_rate.h rename to cpp/src/solver/learning_rate.h diff --git a/cuML/src/solver/sgd.h b/cpp/src/solver/sgd.h similarity index 100% rename from cuML/src/solver/sgd.h rename to cpp/src/solver/sgd.h diff --git a/cuML/src/solver/shuffle.h b/cpp/src/solver/shuffle.h similarity index 100% rename from cuML/src/solver/shuffle.h rename to cpp/src/solver/shuffle.h diff --git a/cuML/src/solver/solver.cu b/cpp/src/solver/solver.cu similarity index 100% rename from 
cuML/src/solver/solver.cu rename to cpp/src/solver/solver.cu diff --git a/cuML/src/solver/solver_c.h b/cpp/src/solver/solver_c.h similarity index 100% rename from cuML/src/solver/solver_c.h rename to cpp/src/solver/solver_c.h diff --git a/cuML/src/spectral/spectral.h b/cpp/src/spectral/spectral.h similarity index 100% rename from cuML/src/spectral/spectral.h rename to cpp/src/spectral/spectral.h diff --git a/cuML/src/tsvd/tsvd.cu b/cpp/src/tsvd/tsvd.cu similarity index 100% rename from cuML/src/tsvd/tsvd.cu rename to cpp/src/tsvd/tsvd.cu diff --git a/cuML/src/tsvd/tsvd.h b/cpp/src/tsvd/tsvd.h similarity index 100% rename from cuML/src/tsvd/tsvd.h rename to cpp/src/tsvd/tsvd.h diff --git a/cuML/src/tsvd/tsvd.hpp b/cpp/src/tsvd/tsvd.hpp similarity index 100% rename from cuML/src/tsvd/tsvd.hpp rename to cpp/src/tsvd/tsvd.hpp diff --git a/cuML/src/tsvd/tsvd_spmg.h b/cpp/src/tsvd/tsvd_spmg.h similarity index 100% rename from cuML/src/tsvd/tsvd_spmg.h rename to cpp/src/tsvd/tsvd_spmg.h diff --git a/cuML/src/umap/fuzzy_simpl_set/naive.h b/cpp/src/umap/fuzzy_simpl_set/naive.h similarity index 100% rename from cuML/src/umap/fuzzy_simpl_set/naive.h rename to cpp/src/umap/fuzzy_simpl_set/naive.h diff --git a/cuML/src/umap/fuzzy_simpl_set/runner.h b/cpp/src/umap/fuzzy_simpl_set/runner.h similarity index 100% rename from cuML/src/umap/fuzzy_simpl_set/runner.h rename to cpp/src/umap/fuzzy_simpl_set/runner.h diff --git a/cuML/src/umap/init_embed/random_algo.h b/cpp/src/umap/init_embed/random_algo.h similarity index 100% rename from cuML/src/umap/init_embed/random_algo.h rename to cpp/src/umap/init_embed/random_algo.h diff --git a/cuML/src/umap/init_embed/runner.h b/cpp/src/umap/init_embed/runner.h similarity index 100% rename from cuML/src/umap/init_embed/runner.h rename to cpp/src/umap/init_embed/runner.h diff --git a/cuML/src/umap/init_embed/spectral_algo.h b/cpp/src/umap/init_embed/spectral_algo.h similarity index 100% rename from cuML/src/umap/init_embed/spectral_algo.h 
rename to cpp/src/umap/init_embed/spectral_algo.h diff --git a/cuML/src/umap/knn_graph/algo.h b/cpp/src/umap/knn_graph/algo.h similarity index 100% rename from cuML/src/umap/knn_graph/algo.h rename to cpp/src/umap/knn_graph/algo.h diff --git a/cuML/src/umap/knn_graph/runner.h b/cpp/src/umap/knn_graph/runner.h similarity index 100% rename from cuML/src/umap/knn_graph/runner.h rename to cpp/src/umap/knn_graph/runner.h diff --git a/cuML/src/umap/optimize.h b/cpp/src/umap/optimize.h similarity index 100% rename from cuML/src/umap/optimize.h rename to cpp/src/umap/optimize.h diff --git a/cuML/src/umap/runner.h b/cpp/src/umap/runner.h similarity index 100% rename from cuML/src/umap/runner.h rename to cpp/src/umap/runner.h diff --git a/cuML/src/umap/simpl_set_embed/algo.h b/cpp/src/umap/simpl_set_embed/algo.h similarity index 100% rename from cuML/src/umap/simpl_set_embed/algo.h rename to cpp/src/umap/simpl_set_embed/algo.h diff --git a/cuML/src/umap/simpl_set_embed/runner.h b/cpp/src/umap/simpl_set_embed/runner.h similarity index 100% rename from cuML/src/umap/simpl_set_embed/runner.h rename to cpp/src/umap/simpl_set_embed/runner.h diff --git a/cuML/src/umap/supervised.h b/cpp/src/umap/supervised.h similarity index 100% rename from cuML/src/umap/supervised.h rename to cpp/src/umap/supervised.h diff --git a/cuML/src/umap/umap.cu b/cpp/src/umap/umap.cu similarity index 100% rename from cuML/src/umap/umap.cu rename to cpp/src/umap/umap.cu diff --git a/cuML/src/umap/umap.h b/cpp/src/umap/umap.h similarity index 100% rename from cuML/src/umap/umap.h rename to cpp/src/umap/umap.h diff --git a/cuML/src/umap/umapparams.h b/cpp/src/umap/umapparams.h similarity index 100% rename from cuML/src/umap/umapparams.h rename to cpp/src/umap/umapparams.h diff --git a/ml-prims/src/common/Timer.h b/cpp/src_prims/common/Timer.h similarity index 100% rename from ml-prims/src/common/Timer.h rename to cpp/src_prims/common/Timer.h diff --git a/ml-prims/src/common/buffer_base.hpp 
b/cpp/src_prims/common/buffer_base.hpp similarity index 100% rename from ml-prims/src/common/buffer_base.hpp rename to cpp/src_prims/common/buffer_base.hpp diff --git a/ml-prims/src/common/cuml_allocator.hpp b/cpp/src_prims/common/cuml_allocator.hpp similarity index 100% rename from ml-prims/src/common/cuml_allocator.hpp rename to cpp/src_prims/common/cuml_allocator.hpp diff --git a/ml-prims/src/common/device_buffer.hpp b/cpp/src_prims/common/device_buffer.hpp similarity index 100% rename from ml-prims/src/common/device_buffer.hpp rename to cpp/src_prims/common/device_buffer.hpp diff --git a/ml-prims/src/common/grid_sync.h b/cpp/src_prims/common/grid_sync.h similarity index 100% rename from ml-prims/src/common/grid_sync.h rename to cpp/src_prims/common/grid_sync.h diff --git a/ml-prims/src/common/host_buffer.hpp b/cpp/src_prims/common/host_buffer.hpp similarity index 100% rename from ml-prims/src/common/host_buffer.hpp rename to cpp/src_prims/common/host_buffer.hpp diff --git a/ml-prims/src/cuda_utils.h b/cpp/src_prims/cuda_utils.h similarity index 100% rename from ml-prims/src/cuda_utils.h rename to cpp/src_prims/cuda_utils.h diff --git a/ml-prims/src/decoupled_lookback.h b/cpp/src_prims/decoupled_lookback.h similarity index 100% rename from ml-prims/src/decoupled_lookback.h rename to cpp/src_prims/decoupled_lookback.h diff --git a/ml-prims/src/distance/algo1.h b/cpp/src_prims/distance/algo1.h similarity index 100% rename from ml-prims/src/distance/algo1.h rename to cpp/src_prims/distance/algo1.h diff --git a/ml-prims/src/distance/cosine.h b/cpp/src_prims/distance/cosine.h similarity index 100% rename from ml-prims/src/distance/cosine.h rename to cpp/src_prims/distance/cosine.h diff --git a/ml-prims/src/distance/distance.h b/cpp/src_prims/distance/distance.h similarity index 100% rename from ml-prims/src/distance/distance.h rename to cpp/src_prims/distance/distance.h diff --git a/ml-prims/src/distance/distance_epilogue.h 
b/cpp/src_prims/distance/distance_epilogue.h similarity index 100% rename from ml-prims/src/distance/distance_epilogue.h rename to cpp/src_prims/distance/distance_epilogue.h diff --git a/ml-prims/src/distance/distance_epilogue_functor.h b/cpp/src_prims/distance/distance_epilogue_functor.h similarity index 100% rename from ml-prims/src/distance/distance_epilogue_functor.h rename to cpp/src_prims/distance/distance_epilogue_functor.h diff --git a/ml-prims/src/distance/distance_epilogue_traits.h b/cpp/src_prims/distance/distance_epilogue_traits.h similarity index 100% rename from ml-prims/src/distance/distance_epilogue_traits.h rename to cpp/src_prims/distance/distance_epilogue_traits.h diff --git a/ml-prims/src/distance/distance_fragment_multiply_add.h b/cpp/src_prims/distance/distance_fragment_multiply_add.h similarity index 100% rename from ml-prims/src/distance/distance_fragment_multiply_add.h rename to cpp/src_prims/distance/distance_fragment_multiply_add.h diff --git a/ml-prims/src/distance/distance_tile_traits.h b/cpp/src_prims/distance/distance_tile_traits.h similarity index 100% rename from ml-prims/src/distance/distance_tile_traits.h rename to cpp/src_prims/distance/distance_tile_traits.h diff --git a/ml-prims/src/distance/euclidean.h b/cpp/src_prims/distance/euclidean.h similarity index 100% rename from ml-prims/src/distance/euclidean.h rename to cpp/src_prims/distance/euclidean.h diff --git a/ml-prims/src/distance/fragment_sqrt.h b/cpp/src_prims/distance/fragment_sqrt.h similarity index 100% rename from ml-prims/src/distance/fragment_sqrt.h rename to cpp/src_prims/distance/fragment_sqrt.h diff --git a/ml-prims/src/distance/l1.h b/cpp/src_prims/distance/l1.h similarity index 100% rename from ml-prims/src/distance/l1.h rename to cpp/src_prims/distance/l1.h diff --git a/ml-prims/src/distance/linear_scaling_sqrt.h b/cpp/src_prims/distance/linear_scaling_sqrt.h similarity index 100% rename from ml-prims/src/distance/linear_scaling_sqrt.h rename to 
cpp/src_prims/distance/linear_scaling_sqrt.h diff --git a/ml-prims/src/functions/hinge.h b/cpp/src_prims/functions/hinge.h similarity index 100% rename from ml-prims/src/functions/hinge.h rename to cpp/src_prims/functions/hinge.h diff --git a/ml-prims/src/functions/linearReg.h b/cpp/src_prims/functions/linearReg.h similarity index 100% rename from ml-prims/src/functions/linearReg.h rename to cpp/src_prims/functions/linearReg.h diff --git a/ml-prims/src/functions/log.h b/cpp/src_prims/functions/log.h similarity index 100% rename from ml-prims/src/functions/log.h rename to cpp/src_prims/functions/log.h diff --git a/ml-prims/src/functions/logisticReg.h b/cpp/src_prims/functions/logisticReg.h similarity index 100% rename from ml-prims/src/functions/logisticReg.h rename to cpp/src_prims/functions/logisticReg.h diff --git a/ml-prims/src/functions/penalty.h b/cpp/src_prims/functions/penalty.h similarity index 100% rename from ml-prims/src/functions/penalty.h rename to cpp/src_prims/functions/penalty.h diff --git a/ml-prims/src/functions/sigmoid.h b/cpp/src_prims/functions/sigmoid.h similarity index 100% rename from ml-prims/src/functions/sigmoid.h rename to cpp/src_prims/functions/sigmoid.h diff --git a/ml-prims/src/functions/sign.h b/cpp/src_prims/functions/sign.h similarity index 100% rename from ml-prims/src/functions/sign.h rename to cpp/src_prims/functions/sign.h diff --git a/ml-prims/src/functions/softThres.h b/cpp/src_prims/functions/softThres.h similarity index 100% rename from ml-prims/src/functions/softThres.h rename to cpp/src_prims/functions/softThres.h diff --git a/ml-prims/src/linalg/add.h b/cpp/src_prims/linalg/add.h similarity index 100% rename from ml-prims/src/linalg/add.h rename to cpp/src_prims/linalg/add.h diff --git a/ml-prims/src/linalg/binary_op.h b/cpp/src_prims/linalg/binary_op.h similarity index 100% rename from ml-prims/src/linalg/binary_op.h rename to cpp/src_prims/linalg/binary_op.h diff --git a/ml-prims/src/linalg/coalesced_reduction.h 
b/cpp/src_prims/linalg/coalesced_reduction.h similarity index 100% rename from ml-prims/src/linalg/coalesced_reduction.h rename to cpp/src_prims/linalg/coalesced_reduction.h diff --git a/ml-prims/src/linalg/cublas_wrappers.h b/cpp/src_prims/linalg/cublas_wrappers.h similarity index 100% rename from ml-prims/src/linalg/cublas_wrappers.h rename to cpp/src_prims/linalg/cublas_wrappers.h diff --git a/ml-prims/src/linalg/cusolver_wrappers.h b/cpp/src_prims/linalg/cusolver_wrappers.h similarity index 100% rename from ml-prims/src/linalg/cusolver_wrappers.h rename to cpp/src_prims/linalg/cusolver_wrappers.h diff --git a/ml-prims/src/linalg/custom_accum.h b/cpp/src_prims/linalg/custom_accum.h similarity index 100% rename from ml-prims/src/linalg/custom_accum.h rename to cpp/src_prims/linalg/custom_accum.h diff --git a/ml-prims/src/linalg/cutlass_wrappers.h b/cpp/src_prims/linalg/cutlass_wrappers.h similarity index 100% rename from ml-prims/src/linalg/cutlass_wrappers.h rename to cpp/src_prims/linalg/cutlass_wrappers.h diff --git a/ml-prims/src/linalg/divide.h b/cpp/src_prims/linalg/divide.h similarity index 100% rename from ml-prims/src/linalg/divide.h rename to cpp/src_prims/linalg/divide.h diff --git a/ml-prims/src/linalg/eig.h b/cpp/src_prims/linalg/eig.h similarity index 100% rename from ml-prims/src/linalg/eig.h rename to cpp/src_prims/linalg/eig.h diff --git a/ml-prims/src/linalg/eltwise.h b/cpp/src_prims/linalg/eltwise.h similarity index 100% rename from ml-prims/src/linalg/eltwise.h rename to cpp/src_prims/linalg/eltwise.h diff --git a/ml-prims/src/linalg/eltwise2d.h b/cpp/src_prims/linalg/eltwise2d.h similarity index 100% rename from ml-prims/src/linalg/eltwise2d.h rename to cpp/src_prims/linalg/eltwise2d.h diff --git a/ml-prims/src/linalg/gemm.h b/cpp/src_prims/linalg/gemm.h similarity index 100% rename from ml-prims/src/linalg/gemm.h rename to cpp/src_prims/linalg/gemm.h diff --git a/ml-prims/src/linalg/gemv.h b/cpp/src_prims/linalg/gemv.h similarity index 100% 
rename from ml-prims/src/linalg/gemv.h rename to cpp/src_prims/linalg/gemv.h diff --git a/ml-prims/src/linalg/lstsq.h b/cpp/src_prims/linalg/lstsq.h similarity index 100% rename from ml-prims/src/linalg/lstsq.h rename to cpp/src_prims/linalg/lstsq.h diff --git a/ml-prims/src/linalg/map_then_reduce.h b/cpp/src_prims/linalg/map_then_reduce.h similarity index 100% rename from ml-prims/src/linalg/map_then_reduce.h rename to cpp/src_prims/linalg/map_then_reduce.h diff --git a/ml-prims/src/linalg/matrix_vector_op.h b/cpp/src_prims/linalg/matrix_vector_op.h similarity index 100% rename from ml-prims/src/linalg/matrix_vector_op.h rename to cpp/src_prims/linalg/matrix_vector_op.h diff --git a/ml-prims/src/linalg/mean_squared_error.h b/cpp/src_prims/linalg/mean_squared_error.h similarity index 100% rename from ml-prims/src/linalg/mean_squared_error.h rename to cpp/src_prims/linalg/mean_squared_error.h diff --git a/ml-prims/src/linalg/multiply.h b/cpp/src_prims/linalg/multiply.h similarity index 100% rename from ml-prims/src/linalg/multiply.h rename to cpp/src_prims/linalg/multiply.h diff --git a/ml-prims/src/linalg/norm.h b/cpp/src_prims/linalg/norm.h similarity index 100% rename from ml-prims/src/linalg/norm.h rename to cpp/src_prims/linalg/norm.h diff --git a/ml-prims/src/linalg/power.h b/cpp/src_prims/linalg/power.h similarity index 100% rename from ml-prims/src/linalg/power.h rename to cpp/src_prims/linalg/power.h diff --git a/ml-prims/src/linalg/qr.h b/cpp/src_prims/linalg/qr.h similarity index 100% rename from ml-prims/src/linalg/qr.h rename to cpp/src_prims/linalg/qr.h diff --git a/ml-prims/src/linalg/reduce.h b/cpp/src_prims/linalg/reduce.h similarity index 100% rename from ml-prims/src/linalg/reduce.h rename to cpp/src_prims/linalg/reduce.h diff --git a/ml-prims/src/linalg/reduce_rows_by_key.h b/cpp/src_prims/linalg/reduce_rows_by_key.h similarity index 100% rename from ml-prims/src/linalg/reduce_rows_by_key.h rename to cpp/src_prims/linalg/reduce_rows_by_key.h diff 
--git a/ml-prims/src/linalg/row_gemm.h b/cpp/src_prims/linalg/row_gemm.h similarity index 100% rename from ml-prims/src/linalg/row_gemm.h rename to cpp/src_prims/linalg/row_gemm.h diff --git a/ml-prims/src/linalg/rsvd.h b/cpp/src_prims/linalg/rsvd.h similarity index 100% rename from ml-prims/src/linalg/rsvd.h rename to cpp/src_prims/linalg/rsvd.h diff --git a/ml-prims/src/linalg/sqrt.h b/cpp/src_prims/linalg/sqrt.h similarity index 100% rename from ml-prims/src/linalg/sqrt.h rename to cpp/src_prims/linalg/sqrt.h diff --git a/ml-prims/src/linalg/strided_reduction.h b/cpp/src_prims/linalg/strided_reduction.h similarity index 100% rename from ml-prims/src/linalg/strided_reduction.h rename to cpp/src_prims/linalg/strided_reduction.h diff --git a/ml-prims/src/linalg/subtract.h b/cpp/src_prims/linalg/subtract.h similarity index 100% rename from ml-prims/src/linalg/subtract.h rename to cpp/src_prims/linalg/subtract.h diff --git a/ml-prims/src/linalg/svd.h b/cpp/src_prims/linalg/svd.h similarity index 100% rename from ml-prims/src/linalg/svd.h rename to cpp/src_prims/linalg/svd.h diff --git a/ml-prims/src/linalg/ternary_op.h b/cpp/src_prims/linalg/ternary_op.h similarity index 100% rename from ml-prims/src/linalg/ternary_op.h rename to cpp/src_prims/linalg/ternary_op.h diff --git a/ml-prims/src/linalg/transpose.h b/cpp/src_prims/linalg/transpose.h similarity index 100% rename from ml-prims/src/linalg/transpose.h rename to cpp/src_prims/linalg/transpose.h diff --git a/ml-prims/src/linalg/unary_op.h b/cpp/src_prims/linalg/unary_op.h similarity index 100% rename from ml-prims/src/linalg/unary_op.h rename to cpp/src_prims/linalg/unary_op.h diff --git a/ml-prims/src/linalg/vector_broadcast.h b/cpp/src_prims/linalg/vector_broadcast.h similarity index 100% rename from ml-prims/src/linalg/vector_broadcast.h rename to cpp/src_prims/linalg/vector_broadcast.h diff --git a/ml-prims/src/matrix/gather.h b/cpp/src_prims/matrix/gather.h similarity index 100% rename from 
ml-prims/src/matrix/gather.h rename to cpp/src_prims/matrix/gather.h diff --git a/ml-prims/src/matrix/math.h b/cpp/src_prims/matrix/math.h similarity index 100% rename from ml-prims/src/matrix/math.h rename to cpp/src_prims/matrix/math.h diff --git a/ml-prims/src/matrix/matrix.h b/cpp/src_prims/matrix/matrix.h similarity index 100% rename from ml-prims/src/matrix/matrix.h rename to cpp/src_prims/matrix/matrix.h diff --git a/ml-prims/src/random/curand_wrappers.h b/cpp/src_prims/random/curand_wrappers.h similarity index 100% rename from ml-prims/src/random/curand_wrappers.h rename to cpp/src_prims/random/curand_wrappers.h diff --git a/ml-prims/src/random/mvg.h b/cpp/src_prims/random/mvg.h similarity index 100% rename from ml-prims/src/random/mvg.h rename to cpp/src_prims/random/mvg.h diff --git a/ml-prims/src/random/permute.h b/cpp/src_prims/random/permute.h similarity index 100% rename from ml-prims/src/random/permute.h rename to cpp/src_prims/random/permute.h diff --git a/ml-prims/src/random/rng.h b/cpp/src_prims/random/rng.h similarity index 100% rename from ml-prims/src/random/rng.h rename to cpp/src_prims/random/rng.h diff --git a/ml-prims/src/random/rng_impl.h b/cpp/src_prims/random/rng_impl.h similarity index 100% rename from ml-prims/src/random/rng_impl.h rename to cpp/src_prims/random/rng_impl.h diff --git a/ml-prims/src/score/scores.h b/cpp/src_prims/score/scores.h similarity index 100% rename from ml-prims/src/score/scores.h rename to cpp/src_prims/score/scores.h diff --git a/ml-prims/src/selection/columnWiseSort.h b/cpp/src_prims/selection/columnWiseSort.h similarity index 100% rename from ml-prims/src/selection/columnWiseSort.h rename to cpp/src_prims/selection/columnWiseSort.h diff --git a/ml-prims/src/selection/kselection.h b/cpp/src_prims/selection/kselection.h similarity index 100% rename from ml-prims/src/selection/kselection.h rename to cpp/src_prims/selection/kselection.h diff --git a/ml-prims/src/sparse/coo.h b/cpp/src_prims/sparse/coo.h 
similarity index 100% rename from ml-prims/src/sparse/coo.h rename to cpp/src_prims/sparse/coo.h diff --git a/ml-prims/src/sparse/csr.h b/cpp/src_prims/sparse/csr.h similarity index 100% rename from ml-prims/src/sparse/csr.h rename to cpp/src_prims/sparse/csr.h diff --git a/ml-prims/src/sparse/cusparse_wrappers.h b/cpp/src_prims/sparse/cusparse_wrappers.h similarity index 100% rename from ml-prims/src/sparse/cusparse_wrappers.h rename to cpp/src_prims/sparse/cusparse_wrappers.h diff --git a/ml-prims/src/sparse/nvgraph_wrappers.h b/cpp/src_prims/sparse/nvgraph_wrappers.h similarity index 100% rename from ml-prims/src/sparse/nvgraph_wrappers.h rename to cpp/src_prims/sparse/nvgraph_wrappers.h diff --git a/ml-prims/src/stats/cov.h b/cpp/src_prims/stats/cov.h similarity index 100% rename from ml-prims/src/stats/cov.h rename to cpp/src_prims/stats/cov.h diff --git a/ml-prims/src/stats/mean.h b/cpp/src_prims/stats/mean.h similarity index 100% rename from ml-prims/src/stats/mean.h rename to cpp/src_prims/stats/mean.h diff --git a/ml-prims/src/stats/mean_center.h b/cpp/src_prims/stats/mean_center.h similarity index 100% rename from ml-prims/src/stats/mean_center.h rename to cpp/src_prims/stats/mean_center.h diff --git a/ml-prims/src/stats/minmax.h b/cpp/src_prims/stats/minmax.h similarity index 100% rename from ml-prims/src/stats/minmax.h rename to cpp/src_prims/stats/minmax.h diff --git a/ml-prims/src/stats/stddev.h b/cpp/src_prims/stats/stddev.h similarity index 100% rename from ml-prims/src/stats/stddev.h rename to cpp/src_prims/stats/stddev.h diff --git a/ml-prims/src/stats/sum.h b/cpp/src_prims/stats/sum.h similarity index 100% rename from ml-prims/src/stats/sum.h rename to cpp/src_prims/stats/sum.h diff --git a/ml-prims/src/stats/weighted_mean.h b/cpp/src_prims/stats/weighted_mean.h similarity index 100% rename from ml-prims/src/stats/weighted_mean.h rename to cpp/src_prims/stats/weighted_mean.h diff --git a/ml-prims/src/utils.h b/cpp/src_prims/utils.h similarity 
index 100% rename from ml-prims/src/utils.h rename to cpp/src_prims/utils.h diff --git a/ml-prims/src/vectorized.h b/cpp/src_prims/vectorized.h similarity index 100% rename from ml-prims/src/vectorized.h rename to cpp/src_prims/vectorized.h diff --git a/cuML/test/.gitkeep b/cpp/test/.gitkeep similarity index 100% rename from cuML/test/.gitkeep rename to cpp/test/.gitkeep diff --git a/cuML/test_mg/knn_test.cu b/cpp/test/mg/knn_test_mg.cu similarity index 100% rename from cuML/test_mg/knn_test.cu rename to cpp/test/mg/knn_test_mg.cu diff --git a/ml-prims/test/CMakeLists.txt b/cpp/test/prims/CMakeLists.txt similarity index 100% rename from ml-prims/test/CMakeLists.txt rename to cpp/test/prims/CMakeLists.txt diff --git a/ml-prims/test/add.cu b/cpp/test/prims/add.cu similarity index 100% rename from ml-prims/test/add.cu rename to cpp/test/prims/add.cu diff --git a/ml-prims/test/add.h b/cpp/test/prims/add.h similarity index 100% rename from ml-prims/test/add.h rename to cpp/test/prims/add.h diff --git a/ml-prims/test/add_and_sub_dev_scalar.cu b/cpp/test/prims/add_and_sub_dev_scalar.cu similarity index 100% rename from ml-prims/test/add_and_sub_dev_scalar.cu rename to cpp/test/prims/add_and_sub_dev_scalar.cu diff --git a/ml-prims/test/binary_op.cu b/cpp/test/prims/binary_op.cu similarity index 100% rename from ml-prims/test/binary_op.cu rename to cpp/test/prims/binary_op.cu diff --git a/ml-prims/test/binary_op.h b/cpp/test/prims/binary_op.h similarity index 100% rename from ml-prims/test/binary_op.h rename to cpp/test/prims/binary_op.h diff --git a/ml-prims/test/coalesced_reduction.cu b/cpp/test/prims/coalesced_reduction.cu similarity index 100% rename from ml-prims/test/coalesced_reduction.cu rename to cpp/test/prims/coalesced_reduction.cu diff --git a/ml-prims/test/columnSort.cu b/cpp/test/prims/columnSort.cu similarity index 100% rename from ml-prims/test/columnSort.cu rename to cpp/test/prims/columnSort.cu diff --git a/ml-prims/test/coo.cu b/cpp/test/prims/coo.cu 
similarity index 100% rename from ml-prims/test/coo.cu rename to cpp/test/prims/coo.cu diff --git a/ml-prims/test/coo.h b/cpp/test/prims/coo.h similarity index 100% rename from ml-prims/test/coo.h rename to cpp/test/prims/coo.h diff --git a/ml-prims/test/cov.cu b/cpp/test/prims/cov.cu similarity index 100% rename from ml-prims/test/cov.cu rename to cpp/test/prims/cov.cu diff --git a/ml-prims/test/csr.cu b/cpp/test/prims/csr.cu similarity index 100% rename from ml-prims/test/csr.cu rename to cpp/test/prims/csr.cu diff --git a/ml-prims/test/csr.h b/cpp/test/prims/csr.h similarity index 100% rename from ml-prims/test/csr.h rename to cpp/test/prims/csr.h diff --git a/ml-prims/test/cuda_utils.cu b/cpp/test/prims/cuda_utils.cu similarity index 100% rename from ml-prims/test/cuda_utils.cu rename to cpp/test/prims/cuda_utils.cu diff --git a/ml-prims/test/decoupled_lookback.cu b/cpp/test/prims/decoupled_lookback.cu similarity index 100% rename from ml-prims/test/decoupled_lookback.cu rename to cpp/test/prims/decoupled_lookback.cu diff --git a/ml-prims/test/dist_adj.cu b/cpp/test/prims/dist_adj.cu similarity index 100% rename from ml-prims/test/dist_adj.cu rename to cpp/test/prims/dist_adj.cu diff --git a/ml-prims/test/dist_cos.cu b/cpp/test/prims/dist_cos.cu similarity index 100% rename from ml-prims/test/dist_cos.cu rename to cpp/test/prims/dist_cos.cu diff --git a/ml-prims/test/dist_euc_exp.cu b/cpp/test/prims/dist_euc_exp.cu similarity index 100% rename from ml-prims/test/dist_euc_exp.cu rename to cpp/test/prims/dist_euc_exp.cu diff --git a/ml-prims/test/dist_euc_unexp.cu b/cpp/test/prims/dist_euc_unexp.cu similarity index 100% rename from ml-prims/test/dist_euc_unexp.cu rename to cpp/test/prims/dist_euc_unexp.cu diff --git a/ml-prims/test/dist_l1.cu b/cpp/test/prims/dist_l1.cu similarity index 100% rename from ml-prims/test/dist_l1.cu rename to cpp/test/prims/dist_l1.cu diff --git a/ml-prims/test/distance_base.h b/cpp/test/prims/distance_base.h similarity index 100% 
rename from ml-prims/test/distance_base.h rename to cpp/test/prims/distance_base.h diff --git a/ml-prims/test/divide.cu b/cpp/test/prims/divide.cu similarity index 100% rename from ml-prims/test/divide.cu rename to cpp/test/prims/divide.cu diff --git a/ml-prims/test/eig.cu b/cpp/test/prims/eig.cu similarity index 100% rename from ml-prims/test/eig.cu rename to cpp/test/prims/eig.cu diff --git a/ml-prims/test/eltwise.cu b/cpp/test/prims/eltwise.cu similarity index 100% rename from ml-prims/test/eltwise.cu rename to cpp/test/prims/eltwise.cu diff --git a/ml-prims/test/eltwise2d.cu b/cpp/test/prims/eltwise2d.cu similarity index 100% rename from ml-prims/test/eltwise2d.cu rename to cpp/test/prims/eltwise2d.cu diff --git a/ml-prims/test/gather.cu b/cpp/test/prims/gather.cu similarity index 100% rename from ml-prims/test/gather.cu rename to cpp/test/prims/gather.cu diff --git a/ml-prims/test/gemm.cu b/cpp/test/prims/gemm.cu similarity index 100% rename from ml-prims/test/gemm.cu rename to cpp/test/prims/gemm.cu diff --git a/ml-prims/test/grid_sync.cu b/cpp/test/prims/grid_sync.cu similarity index 100% rename from ml-prims/test/grid_sync.cu rename to cpp/test/prims/grid_sync.cu diff --git a/ml-prims/test/hinge.cu b/cpp/test/prims/hinge.cu similarity index 100% rename from ml-prims/test/hinge.cu rename to cpp/test/prims/hinge.cu diff --git a/ml-prims/test/kselection.cu b/cpp/test/prims/kselection.cu similarity index 100% rename from ml-prims/test/kselection.cu rename to cpp/test/prims/kselection.cu diff --git a/ml-prims/test/linearReg.cu b/cpp/test/prims/linearReg.cu similarity index 100% rename from ml-prims/test/linearReg.cu rename to cpp/test/prims/linearReg.cu diff --git a/ml-prims/test/log.cu b/cpp/test/prims/log.cu similarity index 100% rename from ml-prims/test/log.cu rename to cpp/test/prims/log.cu diff --git a/ml-prims/test/logisticReg.cu b/cpp/test/prims/logisticReg.cu similarity index 100% rename from ml-prims/test/logisticReg.cu rename to 
cpp/test/prims/logisticReg.cu diff --git a/ml-prims/test/map_then_reduce.cu b/cpp/test/prims/map_then_reduce.cu similarity index 100% rename from ml-prims/test/map_then_reduce.cu rename to cpp/test/prims/map_then_reduce.cu diff --git a/ml-prims/test/math.cu b/cpp/test/prims/math.cu similarity index 100% rename from ml-prims/test/math.cu rename to cpp/test/prims/math.cu diff --git a/ml-prims/test/matrix.cu b/cpp/test/prims/matrix.cu similarity index 100% rename from ml-prims/test/matrix.cu rename to cpp/test/prims/matrix.cu diff --git a/ml-prims/test/matrix_vector_op.cu b/cpp/test/prims/matrix_vector_op.cu similarity index 100% rename from ml-prims/test/matrix_vector_op.cu rename to cpp/test/prims/matrix_vector_op.cu diff --git a/ml-prims/test/matrix_vector_op.h b/cpp/test/prims/matrix_vector_op.h similarity index 100% rename from ml-prims/test/matrix_vector_op.h rename to cpp/test/prims/matrix_vector_op.h diff --git a/ml-prims/test/mean.cu b/cpp/test/prims/mean.cu similarity index 100% rename from ml-prims/test/mean.cu rename to cpp/test/prims/mean.cu diff --git a/ml-prims/test/mean_center.cu b/cpp/test/prims/mean_center.cu similarity index 100% rename from ml-prims/test/mean_center.cu rename to cpp/test/prims/mean_center.cu diff --git a/ml-prims/test/minmax.cu b/cpp/test/prims/minmax.cu similarity index 100% rename from ml-prims/test/minmax.cu rename to cpp/test/prims/minmax.cu diff --git a/ml-prims/test/multiply.cu b/cpp/test/prims/multiply.cu similarity index 100% rename from ml-prims/test/multiply.cu rename to cpp/test/prims/multiply.cu diff --git a/ml-prims/test/mvg.cu b/cpp/test/prims/mvg.cu similarity index 100% rename from ml-prims/test/mvg.cu rename to cpp/test/prims/mvg.cu diff --git a/ml-prims/test/norm.cu b/cpp/test/prims/norm.cu similarity index 100% rename from ml-prims/test/norm.cu rename to cpp/test/prims/norm.cu diff --git a/ml-prims/test/opg_distance.cu b/cpp/test/prims/opg_distance.cu similarity index 100% rename from 
ml-prims/test/opg_distance.cu rename to cpp/test/prims/opg_distance.cu diff --git a/ml-prims/test/penalty.cu b/cpp/test/prims/penalty.cu similarity index 100% rename from ml-prims/test/penalty.cu rename to cpp/test/prims/penalty.cu diff --git a/ml-prims/test/permute.cu b/cpp/test/prims/permute.cu similarity index 100% rename from ml-prims/test/permute.cu rename to cpp/test/prims/permute.cu diff --git a/ml-prims/test/power.cu b/cpp/test/prims/power.cu similarity index 100% rename from ml-prims/test/power.cu rename to cpp/test/prims/power.cu diff --git a/ml-prims/test/reduce.cu b/cpp/test/prims/reduce.cu similarity index 100% rename from ml-prims/test/reduce.cu rename to cpp/test/prims/reduce.cu diff --git a/ml-prims/test/reduce.h b/cpp/test/prims/reduce.h similarity index 100% rename from ml-prims/test/reduce.h rename to cpp/test/prims/reduce.h diff --git a/ml-prims/test/reduce_rows_by_key.cu b/cpp/test/prims/reduce_rows_by_key.cu similarity index 100% rename from ml-prims/test/reduce_rows_by_key.cu rename to cpp/test/prims/reduce_rows_by_key.cu diff --git a/ml-prims/test/rng.cu b/cpp/test/prims/rng.cu similarity index 100% rename from ml-prims/test/rng.cu rename to cpp/test/prims/rng.cu diff --git a/ml-prims/test/rng_int.cu b/cpp/test/prims/rng_int.cu similarity index 100% rename from ml-prims/test/rng_int.cu rename to cpp/test/prims/rng_int.cu diff --git a/ml-prims/test/rsvd.cu b/cpp/test/prims/rsvd.cu similarity index 100% rename from ml-prims/test/rsvd.cu rename to cpp/test/prims/rsvd.cu diff --git a/ml-prims/test/score.cu b/cpp/test/prims/score.cu similarity index 100% rename from ml-prims/test/score.cu rename to cpp/test/prims/score.cu diff --git a/ml-prims/test/sigmoid.cu b/cpp/test/prims/sigmoid.cu similarity index 100% rename from ml-prims/test/sigmoid.cu rename to cpp/test/prims/sigmoid.cu diff --git a/ml-prims/test/sqrt.cu b/cpp/test/prims/sqrt.cu similarity index 100% rename from ml-prims/test/sqrt.cu rename to cpp/test/prims/sqrt.cu diff --git 
a/ml-prims/test/stddev.cu b/cpp/test/prims/stddev.cu similarity index 100% rename from ml-prims/test/stddev.cu rename to cpp/test/prims/stddev.cu diff --git a/ml-prims/test/strided_reduction.cu b/cpp/test/prims/strided_reduction.cu similarity index 100% rename from ml-prims/test/strided_reduction.cu rename to cpp/test/prims/strided_reduction.cu diff --git a/ml-prims/test/subtract.cu b/cpp/test/prims/subtract.cu similarity index 100% rename from ml-prims/test/subtract.cu rename to cpp/test/prims/subtract.cu diff --git a/ml-prims/test/sum.cu b/cpp/test/prims/sum.cu similarity index 100% rename from ml-prims/test/sum.cu rename to cpp/test/prims/sum.cu diff --git a/ml-prims/test/svd.cu b/cpp/test/prims/svd.cu similarity index 100% rename from ml-prims/test/svd.cu rename to cpp/test/prims/svd.cu diff --git a/ml-prims/test/ternary_op.cu b/cpp/test/prims/ternary_op.cu similarity index 100% rename from ml-prims/test/ternary_op.cu rename to cpp/test/prims/ternary_op.cu diff --git a/ml-prims/test/test_utils.h b/cpp/test/prims/test_utils.h similarity index 100% rename from ml-prims/test/test_utils.h rename to cpp/test/prims/test_utils.h diff --git a/ml-prims/test/transpose.cu b/cpp/test/prims/transpose.cu similarity index 100% rename from ml-prims/test/transpose.cu rename to cpp/test/prims/transpose.cu diff --git a/ml-prims/test/unary_op.cu b/cpp/test/prims/unary_op.cu similarity index 100% rename from ml-prims/test/unary_op.cu rename to cpp/test/prims/unary_op.cu diff --git a/ml-prims/test/unary_op.h b/cpp/test/prims/unary_op.h similarity index 100% rename from ml-prims/test/unary_op.h rename to cpp/test/prims/unary_op.h diff --git a/ml-prims/test/vector_broadcast.cu b/cpp/test/prims/vector_broadcast.cu similarity index 100% rename from ml-prims/test/vector_broadcast.cu rename to cpp/test/prims/vector_broadcast.cu diff --git a/ml-prims/test/weighted_mean.cu b/cpp/test/prims/weighted_mean.cu similarity index 100% rename from ml-prims/test/weighted_mean.cu rename to 
cpp/test/prims/weighted_mean.cu diff --git a/cuML/test/cd_test.cu b/cpp/test/sg/cd_test.cu similarity index 100% rename from cuML/test/cd_test.cu rename to cpp/test/sg/cd_test.cu diff --git a/cuML/test/dbscan_test.cu b/cpp/test/sg/dbscan_test.cu similarity index 100% rename from cuML/test/dbscan_test.cu rename to cpp/test/sg/dbscan_test.cu diff --git a/cuML/test/handle_test.cu b/cpp/test/sg/handle_test.cu similarity index 100% rename from cuML/test/handle_test.cu rename to cpp/test/sg/handle_test.cu diff --git a/cuML/test/kmeans_test.cu b/cpp/test/sg/kmeans_test.cu similarity index 100% rename from cuML/test/kmeans_test.cu rename to cpp/test/sg/kmeans_test.cu diff --git a/cuML/test/knn_test.cu b/cpp/test/sg/knn_test.cu similarity index 100% rename from cuML/test/knn_test.cu rename to cpp/test/sg/knn_test.cu diff --git a/cuML/test/lkf_test.cu b/cpp/test/sg/lkf_test.cu similarity index 100% rename from cuML/test/lkf_test.cu rename to cpp/test/sg/lkf_test.cu diff --git a/cuML/test/ols.cu b/cpp/test/sg/ols.cu similarity index 100% rename from cuML/test/ols.cu rename to cpp/test/sg/ols.cu diff --git a/cuML/test/pca_test.cu b/cpp/test/sg/pca_test.cu similarity index 100% rename from cuML/test/pca_test.cu rename to cpp/test/sg/pca_test.cu diff --git a/cuML/test/quasi_newton.cu b/cpp/test/sg/quasi_newton.cu similarity index 100% rename from cuML/test/quasi_newton.cu rename to cpp/test/sg/quasi_newton.cu diff --git a/cuML/test/rf_test.cu b/cpp/test/sg/rf_test.cu similarity index 100% rename from cuML/test/rf_test.cu rename to cpp/test/sg/rf_test.cu diff --git a/cuML/test/ridge.cu b/cpp/test/sg/ridge.cu similarity index 100% rename from cuML/test/ridge.cu rename to cpp/test/sg/ridge.cu diff --git a/cuML/test/sgd.cu b/cpp/test/sg/sgd.cu similarity index 100% rename from cuML/test/sgd.cu rename to cpp/test/sg/sgd.cu diff --git a/cuML/test/spectral_test.cu b/cpp/test/sg/spectral_test.cu similarity index 100% rename from cuML/test/spectral_test.cu rename to 
cpp/test/sg/spectral_test.cu diff --git a/cuML/test/tsvd_test.cu b/cpp/test/sg/tsvd_test.cu similarity index 100% rename from cuML/test/tsvd_test.cu rename to cpp/test/sg/tsvd_test.cu diff --git a/cuML/test/umap_test.cu b/cpp/test/sg/umap_test.cu similarity index 100% rename from cuML/test/umap_test.cu rename to cpp/test/sg/umap_test.cu From 8cb9810fc6247666f019018786702b6cb14cf924 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Wed, 8 May 2019 15:27:31 -0500 Subject: [PATCH 034/156] FIX Path of cuml test sources --- cpp/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 62c3456932..c58f04da41 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -238,8 +238,8 @@ add_subdirectory(${GTEST_DIR} ${PROJECT_BINARY_DIR}/googletest) # Append source file in recursive manner, append header files to target for work with them in IDE file(GLOB_RECURSE ml_prims_header "${MLPRIMS_DIR}/src/*.h" "${MLPRIMS_DIR}/src/*.hpp") -file(GLOB_RECURSE cuml_test_cuda_sources "test/*.cu") -file(GLOB_RECURSE cuml_mg_test_cuda_sources "test_mg/*.cu") +file(GLOB_RECURSE cuml_test_cuda_sources "test/sg/*.cu") +file(GLOB_RECURSE cuml_mg_test_cuda_sources "test/mg/*.cu") ################################################################################################### # - build libcuml++ shared library ------------------------------------------------------------------ From 481107a843018a41c38b99eaa4faf8ab19ea81ae Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Wed, 8 May 2019 15:51:53 -0500 Subject: [PATCH 035/156] FIX prim paths in include --- cpp/CMakeLists.txt | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c58f04da41..21539ec856 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -224,12 +224,15 @@ add_dependencies(gpufaisslib faiss) set_property(TARGET faisslib PROPERTY IMPORTED_LOCATION 
${FAISS_DIR}/libfaiss.a) set_property(TARGET gpufaisslib PROPERTY IMPORTED_LOCATION ${FAISS_DIR}/gpu/libgpufaiss.a) -include_directories(src +include_directories( + src + src_prims + test/prims ${CMAKE_CURRENT_BINARY_DIR}/faiss/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${GTEST_DIR}/include - ${MLPRIMS_DIR}/src - ${MLPRIMS_DIR}/test + # ${MLPRIMS_DIR}/src + # ${MLPRIMS_DIR}/test ${CUTLASS_DIR} ${CUB_DIR} ${ZLIB_INCLUDE_DIRS$}) @@ -237,7 +240,7 @@ include_directories(src add_subdirectory(${GTEST_DIR} ${PROJECT_BINARY_DIR}/googletest) # Append source file in recursive manner, append header files to target for work with them in IDE -file(GLOB_RECURSE ml_prims_header "${MLPRIMS_DIR}/src/*.h" "${MLPRIMS_DIR}/src/*.hpp") +file(GLOB_RECURSE ml_prims_header "src_prims/*.h" "src_prims/*.hpp") file(GLOB_RECURSE cuml_test_cuda_sources "test/sg/*.cu") file(GLOB_RECURSE cuml_mg_test_cuda_sources "test/mg/*.cu") From dbb32f22bab49b434fc9cddabdf574aebd88fd5c Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Wed, 8 May 2019 15:52:29 -0500 Subject: [PATCH 036/156] FIX Change path in 'common' file includes --- cpp/src/common/allocatorAdapter.hpp | 14 +++---- cpp/src/common/cumlHandle.cpp | 4 +- cpp/src/common/cuml_api.cpp | 2 +- cpp/src/common/rmmAllocatorAdapter.hpp | 2 +- cpp/src/common/tensor.hpp | 52 +++++++++++++------------- 5 files changed, 37 insertions(+), 37 deletions(-) diff --git a/cpp/src/common/allocatorAdapter.hpp b/cpp/src/common/allocatorAdapter.hpp index d02f87a7c0..2423251990 100644 --- a/cpp/src/common/allocatorAdapter.hpp +++ b/cpp/src/common/allocatorAdapter.hpp @@ -20,7 +20,7 @@ #include -#include "../../../ml-prims/src/utils.h" +#include "../../src_prims/utils.h" #include "../cuML.hpp" @@ -54,7 +54,7 @@ class stdAllocatorAdapter {} stdAllocatorAdapter& operator=(const stdAllocatorAdapter& other) = default; - + stdAllocatorAdapter(std::shared_ptr allocator, cudaStream_t stream) : _allocator(allocator), _stream(stream) {} @@ -74,7 +74,7 @@ class 
stdAllocatorAdapter { return static_cast(_allocator->allocate( size, _stream )); } - void deallocate(pointer ptr, size_type size) { + void deallocate(pointer ptr, size_type size) { _allocator->deallocate(ptr, size, _stream); } @@ -126,9 +126,9 @@ class thrustAllocatorAdapter thrustAllocatorAdapter(std::shared_ptr allocator, cudaStream_t stream) : _allocator(allocator), _stream(stream) {} - + ~thrustAllocatorAdapter() {} - + char* allocate(const size_t size) { return static_cast(_allocator->allocate( size, _stream )); @@ -144,7 +144,7 @@ class thrustAllocatorAdapter cudaStream_t _stream = 0; }; -namespace +namespace { thrustAllocatorAdapter _decltypeHelper{0,0}; } @@ -171,7 +171,7 @@ inline auto thrust_exec_policy(std::shared_ptr allocator, cudaS delete alloc; delete pointer; }; - + std::unique_ptr policy{new T(*alloc), deleter}; return policy; } diff --git a/cpp/src/common/cumlHandle.cpp b/cpp/src/common/cumlHandle.cpp index c81b11de07..105b35898e 100644 --- a/cpp/src/common/cumlHandle.cpp +++ b/cpp/src/common/cumlHandle.cpp @@ -16,7 +16,7 @@ #include "cumlHandle.hpp" -#include "../../../ml-prims/src/utils.h" +#include "../../src_prims/utils.h" //TODO: Delete CUBLAS_CHECK and CUSOLVER_CHECK once // https://github.com/rapidsai/cuml/issues/239 is addressed @@ -261,7 +261,7 @@ void cumlHandle_impl::waitOnUserStream() const void cumlHandle_impl::waitOnInternalStreams() const { - for (auto s : _streams) + for (auto s : _streams) { CUDA_CHECK( cudaEventRecord( _event, s ) ); CUDA_CHECK( cudaStreamWaitEvent( _userStream, _event, 0 ) ); diff --git a/cpp/src/common/cuml_api.cpp b/cpp/src/common/cuml_api.cpp index e4e198e9cb..a429cd9623 100644 --- a/cpp/src/common/cuml_api.cpp +++ b/cpp/src/common/cuml_api.cpp @@ -19,7 +19,7 @@ #include "cumlHandle.hpp" -#include "../../../ml-prims/src/utils.h" +#include "../../src_prims/utils.h" namespace ML { namespace detail { diff --git a/cpp/src/common/rmmAllocatorAdapter.hpp b/cpp/src/common/rmmAllocatorAdapter.hpp index 
54c67b4394..3b02f697dc 100644 --- a/cpp/src/common/rmmAllocatorAdapter.hpp +++ b/cpp/src/common/rmmAllocatorAdapter.hpp @@ -18,7 +18,7 @@ #include -#include "../../../ml-prims/src/utils.h" +#include "../../src_prims/utils.h" #include "../cuML.hpp" diff --git a/cpp/src/common/tensor.hpp b/cpp/src/common/tensor.hpp index cf4bd959c2..ba9f23723d 100644 --- a/cpp/src/common/tensor.hpp +++ b/cpp/src/common/tensor.hpp @@ -18,7 +18,7 @@ #include #include -namespace ML { +namespace ML { template deallocate(_data, this->getSizeInBytes(), _stream); - }else if(memory_type(_data) == cudaMemoryTypeHost){ + }else if(memory_type(_data) == cudaMemoryTypeHost){ _hAllocator->deallocate(_data, this->getSizeInBytes(), _stream); } } - } + } - __host__ + __host__ Tensor(DataPtrT data, const std::vector &sizes) : _data(data), _state(AllocState::NotOwner){ static_assert(Dim > 0, - "must have > 0 dimensions"); + "must have > 0 dimensions"); ASSERT(sizes.size() == Dim, "invalid argument: # of entries in the input argument 'sizes' must match the tensor dimension" ); - + for (int i = 0; i < Dim; ++i) { _size[i] = sizes[i]; } @@ -58,11 +58,11 @@ class Tensor { for (int j = Dim - 2; j >= 0; --j) { _stride[j] = _stride[j + 1] * _size[j + 1]; } - } - + } + // allocate the data using the allocator and release when the object goes out of scope // allocating tensor is the owner of the data - __host__ + __host__ Tensor(const std::vector &sizes, std::shared_ptr allocator, cudaStream_t stream): @@ -71,11 +71,11 @@ class Tensor { _state(AllocState::Owner){ static_assert(Dim > 0, - "must have > 0 dimensions"); + "must have > 0 dimensions"); ASSERT(sizes.size() == Dim, "dimension mismatch" ); - + for (int i = 0; i < Dim; ++i) { _size[i] = sizes[i]; } @@ -84,16 +84,16 @@ class Tensor { for (int j = Dim - 2; j >= 0; --j) { _stride[j] = _stride[j + 1] * _size[j + 1]; } - + _data = static_cast(_dAllocator->allocate(this->getSizeInBytes(), _stream)); CUDA_CHECK( cudaStreamSynchronize( _stream ) ); - + 
ASSERT(this->data() || (this->getSizeInBytes() == 0), "device allocation failed"); } - + /// returns the total number of elements contained within our data __host__ size_t numElements() const { @@ -110,7 +110,7 @@ class Tensor { __host__ inline IndexT getSize(int i) const { return _size[i]; } - + /// returns the stride array __host__ inline const IndexT* strides() const { return _stride; @@ -120,7 +120,7 @@ class Tensor { __host__ inline const IndexT getStride(int i) const { return _stride[i]; } - + /// returns the total size in bytes of our data __host__ size_t getSizeInBytes() const { return numElements() * sizeof(DataT); @@ -136,7 +136,7 @@ class Tensor { __host__ inline DataPtrT begin() { return _data; } - + /// returns a raw pointer to the end of our data __host__ inline DataPtrT end() { return data() + numElements(); @@ -173,8 +173,8 @@ class Tensor { offset += start_pos[dim] * getStride(dim); } DataPtrT newData = this->data() + offset; - - + + // The total size of the new view must be the <= total size of the old view size_t curSize = numElements(); size_t newSize = 1; @@ -188,26 +188,26 @@ class Tensor { return Tensor(newData, sizes); } - + private: enum AllocState { /// This tensor itself owns the memory, which must be freed via /// cudaFree Owner, - + /// This tensor itself is not an owner of the memory; there is /// nothing to free NotOwner }; - + protected: - + std::shared_ptr _dAllocator; std::shared_ptr _hAllocator; - /// Raw pointer to where the tensor data begins + /// Raw pointer to where the tensor data begins DataPtrT _data; /// Array of strides (in sizeof(T) terms) per each dimension @@ -221,5 +221,5 @@ class Tensor { cudaStream_t _stream; }; -}; // end namespace ML +}; // end namespace ML From 57dd4c63a98f07f8e96b0b402be6f271f8d26a12 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Wed, 8 May 2019 16:06:15 -0500 Subject: [PATCH 037/156] FEA add BUILD_EXAMPLES cmake option --- cpp/CMakeLists.txt | 10 ++++++---- 1 file changed, 6 
insertions(+), 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 21539ec856..50dbc0c96f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -50,6 +50,8 @@ option(BUILD_CUML_TESTS "Build cuML algorithm tests" ON) option(BUILD_PRIM_TESTS "Build ml-prim tests" OFF) +option(BUILD_EXAMPLES "Build C++ API usage examples" OFF) + set(BLAS_LIBRARIES "" CACHE STRING "Location of BLAS library") @@ -282,7 +284,7 @@ endif(OPENMP_FOUND) target_link_libraries(${CUML_CPP_TARGET} ${CUML_LINK_LIBRARIES}) ################################################################################################### -# - build test executable ------------------------------------------------------------------------- +# - build ml_test executable ---------------------------------------------------------------------- add_executable(ml_test ${cuml_test_cuda_sources} ${ml_prims_header}) @@ -302,7 +304,7 @@ target_link_libraries(ml_test ${ZLIB_LIBRARIES}) ################################################################################################### -# - build test executable ------------------------------------------------------------------------- +# - build ml_mg_test executable ------------------------------------------------------------------- add_executable(ml_mg_test ${cuml_mg_test_cuda_sources} ${ml_prims_header}) @@ -324,9 +326,9 @@ target_link_libraries(ml_mg_test ################################################################################################### # - build examples ------------------------------------------------------------------------- -if (NOT DISABLE_EXAMPLES OR NOT ${DISABLE_EXAMPLES}) +if (DISABLE_EXAMPLES OR ${BUILD_EXAMPLES}) add_subdirectory(examples) -endif(NOT DISABLE_EXAMPLES OR NOT ${DISABLE_EXAMPLES}) +endif(DISABLE_EXAMPLES OR ${BUILD_EXAMPLES}) ################################################################################################### # - install targets 
------------------------------------------------------------------------------- From 1c2a9021d8fde4843d8cee7f8f8c34281faca878 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Wed, 8 May 2019 16:06:29 -0500 Subject: [PATCH 038/156] FIX Move files result of a bad branch-0.8 merge --- cpp/src/common/rmmAllocatorAdapter.hpp | 4 +- cpp/src/knn/knn.cu | 10 +- cuML/src/common/rmmAllocatorAdapter.hpp | 105 ------- cuML/src/knn/knn.cu | 355 ------------------------ 4 files changed, 12 insertions(+), 462 deletions(-) delete mode 100644 cuML/src/common/rmmAllocatorAdapter.hpp delete mode 100644 cuML/src/knn/knn.cu diff --git a/cpp/src/common/rmmAllocatorAdapter.hpp b/cpp/src/common/rmmAllocatorAdapter.hpp index 3b02f697dc..3508b56f21 100644 --- a/cpp/src/common/rmmAllocatorAdapter.hpp +++ b/cpp/src/common/rmmAllocatorAdapter.hpp @@ -18,7 +18,7 @@ #include -#include "../../src_prims/utils.h" +#include "../../../ml-prims/src/utils.h" #include "../cuML.hpp" @@ -96,6 +96,8 @@ class rmmAllocatorAdapter : public ML::deviceAllocator { } } + virtual ~rmmAllocatorAdapter() {} + private: const bool _rmmInitialized; }; diff --git a/cpp/src/knn/knn.cu b/cpp/src/knn/knn.cu index 543cda7ca9..5f0a042927 100644 --- a/cpp/src/knn/knn.cu +++ b/cpp/src/knn/knn.cu @@ -174,9 +174,13 @@ namespace ML { try { faiss::gpu::StandardGpuResources gpu_res; + + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + gpu_res.noTempMemory(); gpu_res.setCudaMallocWarning(false); - gpu_res.setDefaultNullStreamAllDevices(); + gpu_res.setDefaultStream(att.device, stream); bruteForceKnn(&gpu_res, faiss::METRIC_L2, @@ -190,6 +194,10 @@ namespace ML { all_I+(long(i)*k*long(n))); CUDA_CHECK(cudaPeekAtLastError()); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + CUDA_CHECK(cudaStreamDestroy(stream)); + } catch(const std::exception &e) { std::cout << "Exception occurred: " << e.what() << std::endl; diff --git a/cuML/src/common/rmmAllocatorAdapter.hpp b/cuML/src/common/rmmAllocatorAdapter.hpp deleted 
file mode 100644 index 3508b56f21..0000000000 --- a/cuML/src/common/rmmAllocatorAdapter.hpp +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include "../../../ml-prims/src/utils.h" - -#include "../cuML.hpp" - -namespace ML { - -/** - * @brief Implemententation of ML::deviceAllocator using the RAPIDS Memory Manager (RMM) for allocations. - * - * rmmAllocatorAdapter does not initialize RMM. If RMM is not initialized on construction of rmmAllocatorAdapter - * allocations fall back to cudaMalloc. - */ -class rmmAllocatorAdapter : public ML::deviceAllocator { -public: - rmmAllocatorAdapter() - : _rmmInitialized( rmmIsInitialized( NULL ) ) - { - //@todo: Log warning if RMM is not initialized. Blocked by https://github.com/rapidsai/cuml/issues/229 - } - - /** - * @brief asynchronosly allocate n bytes that can be used after all work in stream sheduled prior to this call - * has completetd. 
- * - * @param[in] n size of the allocation in bytes - * @param[in] stream the stream to use for the asynchronous allocations - * @returns a pointer to n byte of device memory - */ - virtual void* allocate( std::size_t n, cudaStream_t stream ) - { - void* ptr = 0; - if (!_rmmInitialized) - { - CUDA_CHECK( cudaMalloc( &ptr, n ) ); - } - else - { - rmmError_t rmmStatus = RMM_ALLOC(&ptr, n, stream); - if ( RMM_SUCCESS != rmmStatus || 0 == ptr ) - { - std::ostringstream msg; - msg<<"RMM allocation of "< -#include -#include -#include -#include -#include - -#include -#include -#include - - -namespace ML { - - - /** - * Build a kNN object for training and querying a k-nearest neighbors model. - * @param D number of features in each vector - */ - kNN::kNN(int D, bool verbose): D(D), total_n(0), indices(0), verbose(verbose), owner(false){} - kNN::~kNN() { - - try { - if(this->owner) { - if(this->verbose) - std::cout << "Freeing kNN memory" << std::endl; - for(kNNParams p : knn_params) { CUDA_CHECK(cudaFree(p.ptr)); } - } - - } catch(const std::exception &e) { - std::cout << "An exception occurred releasing kNN memory: " << e.what() << std::endl; - } - } - - void kNN::reset() { - if(knn_params.size() > 0) { - knn_params.clear(); - this->id_ranges.clear(); - this->indices = 0; - this->total_n = 0; - } - } - - bool kNN::verify_size(size_t size, int device) { - size_t free, total; - cudaMemGetInfo(&free, &total); - - if(size > free) { - std::cout << "Not enough free memory on device " - << device - << " to run kneighbors. " - << "needed=" - << size - << ", free=" << free << std::endl; - return false; - } - - return true; - } - - /** - * Fit a kNN model by creating separate indices for multiple given - * instances of kNNParams. - * @param input an array of pointers to data on (possibly different) devices - * @param N number of items in input array. 
- */ - void kNN::fit(kNNParams *input, int N) { - - - if(this->owner) { - for(kNNParams p : knn_params) { CUDA_CHECK(cudaFree(p.ptr)); } - } - - if(this->verbose) - std::cout << "N=" << N << std::endl; - - reset(); - - for(int i = 0; i < N; i++) { - - kNNParams params = input[i]; - this->indices++; - this->knn_params.emplace_back(params); - if(i < params.N) { - id_ranges.push_back(total_n); - } - - this->total_n += params.N; - } - } - - /** - * Search the kNN for the k-nearest neighbors of a set of query vectors - * @param search_items set of vectors to query for neighbors - * @param n number of items in search_items - * @param res_I pointer to device memory for returning k nearest indices - * @param res_D pointer to device memory for returning k nearest distances - * @param k number of neighbors to query - */ - void kNN::search(const float *search_items, int n, - long *res_I, float *res_D, int k) { - - float *result_D = new float[k*size_t(n)]; - long*result_I = new long[k*size_t(n)]; - - float *all_D = new float[indices*k*size_t(n)]; - long *all_I = new long[indices*k*size_t(n)]; - - cudaPointerAttributes s_att; - cudaError_t s_err = cudaPointerGetAttributes(&s_att, search_items); - - if(s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in knn search: " << search_items << std::endl; - - s_err = cudaPointerGetAttributes(&s_att, res_I); - - if(s_err != 0 || s_att.device == -1) - std::cout << "Invalid index results pointer encountered in knn search: " << search_items << std::endl; - - s_err = cudaPointerGetAttributes(&s_att, res_D); - - if(s_err != 0 || s_att.device == -1) - std::cout << "Invalid distance results pointer encountered in knn search: " << search_items << std::endl; - - - /** - * Initial verification of memory - */ - for(int i = 0; i < indices; i++) { - kNNParams params = knn_params[i]; - - cudaPointerAttributes att; - cudaError_t err = cudaPointerGetAttributes(&att, params.ptr); - - if(err == 0 && att.device > -1) { - 
CUDA_CHECK(cudaSetDevice(att.device)); - - if(!verify_size(size_t(params.N)*size_t(this->D)*4l, att.device)) - return; - } - } - - - #pragma omp parallel - { - #pragma omp for - for(int i = 0; i < indices; i++) { - - kNNParams params = knn_params[i]; - - cudaPointerAttributes att; - cudaError_t err = cudaPointerGetAttributes(&att, params.ptr); - - if(err == 0 && att.device > -1) { - CUDA_CHECK(cudaSetDevice(att.device)); - - try { - faiss::gpu::StandardGpuResources gpu_res; - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - gpu_res.noTempMemory(); - gpu_res.setCudaMallocWarning(false); - gpu_res.setDefaultStream(att.device, stream); - - bruteForceKnn(&gpu_res, - faiss::METRIC_L2, - params.ptr, - params.N, - search_items, - n, - this->D, - k, - all_D+(long(i)*k*long(n)), - all_I+(long(i)*k*long(n))); - - CUDA_CHECK(cudaPeekAtLastError()); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - CUDA_CHECK(cudaStreamDestroy(stream)); - - - } catch(const std::exception &e) { - std::cout << "Exception occurred: " << e.what() << std::endl; - } - - - } else { - std::stringstream ss; - ss << "Input memory for " << &params << " failed. isDevice?=" << att.devicePointer << ", N=" << params.N; - std::cout << "Exception: " << ss.str() << std::endl; - } - } - } - - merge_tables>(long(n), k, indices, - result_D, result_I, all_D, all_I, id_ranges.data()); - - MLCommon::updateDevice(res_D, result_D, k*size_t(n), 0); - MLCommon::updateDevice(res_I, result_I, k*size_t(n), 0); - - delete all_D; - delete all_I; - - delete result_D; - delete result_I; - } - - /** - * Chunk a host array up into one or many GPUs (determined by the provided - * list of gpu ids) and fit a knn model.
- * - * @param ptr an array in host memory to chunk over devices - * @param n number of elements in ptr - * @param devices array of device ids for chunking the ptr - * @param n_chunks number of elements in gpus - * @param out host pointer (size n) to store output - */ - void kNN::fit_from_host(float *ptr, int n, int* devices, int n_chunks) { - - if(this->owner) { - for(kNNParams p : knn_params) { CUDA_CHECK(cudaFree(p.ptr)); } - } - - reset(); - - size_t chunk_size = MLCommon::ceildiv((size_t)n, (size_t)n_chunks); - kNNParams params[n_chunks]; - - this->owner = true; - - /** - * Initial verification of memory - */ - for(int i = 0; i < n_chunks; i++) { - - int device = devices[i]; - size_t length = chunk_size; - if(length * i >= n) - length = (chunk_size*i)-size_t(n); - CUDA_CHECK(cudaSetDevice(device)); - if(!verify_size(size_t(length)*size_t(D), device)) - return; - } - - #pragma omp parallel for - for(int i = 0; i < n_chunks; i++) { - - int device = devices[i]; - CUDA_CHECK(cudaSetDevice(device)); - - size_t length = chunk_size; - if(length * i >= n) - length = (size_t(chunk_size)*i)-size_t(n); - - float *ptr_d; - MLCommon::allocate(ptr_d, size_t(length)*size_t(D)); - MLCommon::updateDevice(ptr_d, ptr+(size_t(chunk_size)*i), size_t(length)*size_t(D), 0); - - kNNParams p; - p.N = length; - p.ptr = ptr_d; - - params[i] = p; - } - - fit(params, n_chunks); - } - - /** Merge results from several shards into a single result set. 
- * @param all_distances size nshard * n * k - * @param all_labels idem - * @param translartions label translations to apply, size nshard - */ - template - void kNN::merge_tables (long n, long k, long nshard, - float *distances, long *labels, - float *all_distances, - long *all_labels, - long *translations) { - if(k == 0) { - return; - } - - size_t stride = n * k; - #pragma omp parallel - { - std::vector buf (2 * nshard); - int * pointer = buf.data(); - int * shard_ids = pointer + nshard; - std::vector buf2 (nshard); - float * heap_vals = buf2.data(); - #pragma omp for - for (long i = 0; i < n; i++) { - // the heap maps values to the shard where they are - // produced. - const float *D_in = all_distances + i * k; - const long *I_in = all_labels + i * k; - int heap_size = 0; - - for (long s = 0; s < nshard; s++) { - pointer[s] = 0; - if (I_in[stride * s] >= 0) - heap_push (++heap_size, heap_vals, shard_ids, - D_in[stride * s], s); - } - - float *D = distances + i * k; - long *I = labels + i * k; - - for (int j = 0; j < k; j++) { - if (heap_size == 0) { - I[j] = -1; - D[j] = C::neutral(); - } else { - // pop best element - int s = shard_ids[0]; - int & p = pointer[s]; - D[j] = heap_vals[0]; - I[j] = I_in[stride * s + p] + translations[s]; - - heap_pop (heap_size--, heap_vals, shard_ids); - p++; - if (p < k && I_in[stride * s + p] >= 0) - heap_push (++heap_size, heap_vals, shard_ids, - D_in[stride * s + p], s); - } - } - } - } - }; - -}; - - -// end namespace ML From f3ca8ab0e06422fe7e940bab8cccd368c67f3a0a Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Wed, 8 May 2019 19:23:57 -0400 Subject: [PATCH 039/156] cumlhandle added to cd. Also "except +" added to cd and sgd cython. 
--- cuML/src/solver/cd.h | 78 ++++++++++++------------------------- python/cuml/solvers/cd.pyx | 8 ++-- python/cuml/solvers/sgd.pyx | 12 +++--- 3 files changed, 34 insertions(+), 64 deletions(-) diff --git a/cuML/src/solver/cd.h b/cuML/src/solver/cd.h index c9d50d5477..25aecd534c 100644 --- a/cuML/src/solver/cd.h +++ b/cuML/src/solver/cd.h @@ -41,6 +41,8 @@ using namespace MLCommon; /** * Fits a linear, lasso, and elastic-net regression model using Coordinate Descent solver + * @param cumlHandle_impl + * Reference of cumlHandle * @param input * pointer to an array in column-major format (size of n_rows, n_cols) * @param n_rows @@ -71,10 +73,6 @@ using namespace MLCommon; * tolerance to stop the solver * @param stream * cuda stream - * @param cublas_handle - * cublas handle - * @param cusolver_handle - * cusolver handle */ template void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, @@ -92,33 +90,25 @@ void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, cublasHandle_t cublas_handle = handle.getCublasHandle(); cusolverDnHandle_t cusolver_handle = handle.getcusolverDnHandle(); - math_t *mu_input = nullptr; - math_t *mu_labels = nullptr; - math_t *norm2_input = nullptr; - math_t *pred = nullptr; - math_t *residual = nullptr; - math_t *squared = nullptr; - math_t *loss_value = nullptr; - - //auto allocator = handle.getDeviceAllocator(); - //device_buffer components_all(allocator, stream, len); - - allocate(loss_value, 1); - allocate(pred, n_rows, true); - allocate(residual, n_rows, true); - allocate(squared, n_cols, true); + auto allocator = handle.getDeviceAllocator(); + device_buffer pred(allocator, stream, n_rows); + device_buffer residual(allocator, stream, n_rows); + device_buffer squared(allocator, stream, n_cols); + device_buffer mu_input(allocator, stream, 0); + device_buffer mu_labels(allocator, stream, 0); + device_buffer norm2_input(allocator, stream, 0); std::vector h_coef(n_cols, math_t(0)); if 
(fit_intercept) { - allocate(mu_input, n_cols); - allocate(mu_labels, 1); + mu_input.reserve(n_cols, stream); + mu_labels.reserve(1, stream); if (normalize) { - allocate(norm2_input, n_cols); + norm2_input.reserve(n_cols, stream); } GLM::preProcessData(handle, input, n_rows, n_cols, labels, - intercept, mu_input, mu_labels, norm2_input, fit_intercept, + intercept, mu_input.data(), mu_labels.data(), norm2_input.data(), fit_intercept, normalize, stream); } @@ -131,14 +121,14 @@ void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, if (normalize) { math_t scalar = math_t(1.0) + l2_alpha; - Matrix::setValue(squared, squared, scalar, n_cols, stream); + Matrix::setValue(squared.data(), squared.data(), scalar, n_cols, stream); } else { - LinAlg::colNorm(squared, input, n_cols, n_rows, LinAlg::L2Norm, false, + LinAlg::colNorm(squared.data(), input, n_cols, n_rows, LinAlg::L2Norm, false, stream); - LinAlg::addScalar(squared, squared, l2_alpha, n_cols, stream); + LinAlg::addScalar(squared.data(), squared.data(), l2_alpha, n_cols, stream); } - copy(residual, labels, n_rows, stream); + copy(residual.data(), labels, n_rows, stream); for (int i = 0; i < epochs; i++) { if (i > 0 && shuffle) { @@ -152,13 +142,13 @@ void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, for (int j = 0; j < n_cols; j++) { int ci = ri[j]; math_t *coef_loc = coef + ci; - math_t *squared_loc = squared + ci; + math_t *squared_loc = squared.data() + ci; math_t *input_col_loc = input + (ci * n_rows); - LinAlg::multiplyScalar(pred, input_col_loc, h_coef[ci], n_rows, + LinAlg::multiplyScalar(pred.data(), input_col_loc, h_coef[ci], n_rows, stream); - LinAlg::add(residual, residual, pred, n_rows, stream); - LinAlg::gemm(input_col_loc, n_rows, 1, residual, coef_loc, 1, 1, + LinAlg::add(residual.data(), residual.data(), pred.data(), n_rows, stream); + LinAlg::gemm(input_col_loc, n_rows, 1, residual.data(), coef_loc, 1, 1, CUBLAS_OP_T, CUBLAS_OP_N, 
cublas_handle, stream); if (l1_ratio > math_t(0.0)) @@ -177,9 +167,9 @@ void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, if (abs(h_coef[ci]) > coef_max) coef_max = abs(h_coef[ci]); - LinAlg::multiplyScalar(pred, input_col_loc, h_coef[ci], n_rows, + LinAlg::multiplyScalar(pred.data(), input_col_loc, h_coef[ci], n_rows, stream); - LinAlg::subtract(residual, residual, pred, n_rows, stream); + LinAlg::subtract(residual.data(), residual.data(), pred.data(), n_rows, stream); } bool flag_continue = true; @@ -198,33 +188,13 @@ void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, if (fit_intercept) { GLM::postProcessData(handle, input, n_rows, n_cols, labels, - coef, intercept, mu_input, mu_labels, norm2_input, + coef, intercept, mu_input.data(), mu_labels.data(), norm2_input.data(), fit_intercept, normalize, stream); - if (mu_input != nullptr) - CUDA_CHECK(cudaFree(mu_input)); - if (mu_labels != nullptr) - CUDA_CHECK(cudaFree(mu_labels)); - if (normalize) { - if (norm2_input != nullptr) - cudaFree(norm2_input); - } } else { *intercept = math_t(0); } - if (pred != nullptr) - CUDA_CHECK(cudaFree(pred)); - - if (residual != nullptr) - CUDA_CHECK(cudaFree(residual)); - - if (squared != nullptr) - CUDA_CHECK(cudaFree(squared)); - - if (loss_value != nullptr) - CUDA_CHECK(cudaFree(loss_value)); - } /** diff --git a/python/cuml/solvers/cd.pyx b/python/cuml/solvers/cd.pyx index 7c2e39498d..fd3d510884 100644 --- a/python/cuml/solvers/cd.pyx +++ b/python/cuml/solvers/cd.pyx @@ -43,7 +43,7 @@ cdef extern from "solver/solver.hpp" namespace "ML::Solver": float alpha, float l1_ratio, bool shuffle, - float tol) + float tol) except + cdef void cdFit(cumlHandle& handle, @@ -60,7 +60,7 @@ cdef extern from "solver/solver.hpp" namespace "ML::Solver": double alpha, double l1_ratio, bool shuffle, - double tol) + double tol) except + cdef void cdPredict(cumlHandle& handle, const float *input, @@ -69,7 +69,7 @@ cdef extern from 
"solver/solver.hpp" namespace "ML::Solver": const float *coef, float intercept, float *preds, - int loss) + int loss) except + cdef void cdPredict(cumlHandle& handle, const double *input, @@ -78,7 +78,7 @@ cdef extern from "solver/solver.hpp" namespace "ML::Solver": const double *coef, double intercept, double *preds, - int loss) + int loss) except + class CD(cuml.Base): """ diff --git a/python/cuml/solvers/sgd.pyx b/python/cuml/solvers/sgd.pyx index 7a486f3126..47eaaf2aba 100644 --- a/python/cuml/solvers/sgd.pyx +++ b/python/cuml/solvers/sgd.pyx @@ -48,7 +48,7 @@ cdef extern from "solver/solver.hpp" namespace "ML::Solver": float l1_ratio, bool shuffle, float tol, - int n_iter_no_change) + int n_iter_no_change) except + cdef void sgdFit(cumlHandle& handle, @@ -70,7 +70,7 @@ cdef extern from "solver/solver.hpp" namespace "ML::Solver": double l1_ratio, bool shuffle, double tol, - int n_iter_no_change) + int n_iter_no_change) except + cdef void sgdPredict(cumlHandle& handle, const float *input, @@ -79,7 +79,7 @@ cdef extern from "solver/solver.hpp" namespace "ML::Solver": const float *coef, float intercept, float *preds, - int loss) + int loss) except + cdef void sgdPredict(cumlHandle& handle, const double *input, @@ -88,7 +88,7 @@ cdef extern from "solver/solver.hpp" namespace "ML::Solver": const double *coef, double intercept, double *preds, - int loss) + int loss) except + cdef void sgdPredictBinaryClass(cumlHandle& handle, const float *input, @@ -97,7 +97,7 @@ cdef extern from "solver/solver.hpp" namespace "ML::Solver": const float *coef, float intercept, float *preds, - int loss) + int loss) except + cdef void sgdPredictBinaryClass(cumlHandle& handle, const double *input, @@ -106,7 +106,7 @@ cdef extern from "solver/solver.hpp" namespace "ML::Solver": const double *coef, double intercept, double *preds, - int loss) + int loss) except + class SGD(cuml.Base): """ From 90683f2f4b0276f87f49eb725cc91c054e30e67b Mon Sep 17 00:00:00 2001 From: Chirayu Date: Wed, 8 May 
2019 17:30:48 -0700 Subject: [PATCH 040/156] Fix build break on TOT --- ml-prims/test/contingencyMatrix.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ml-prims/test/contingencyMatrix.cu b/ml-prims/test/contingencyMatrix.cu index 41ffffc932..13fcfbf67b 100644 --- a/ml-prims/test/contingencyMatrix.cu +++ b/ml-prims/test/contingencyMatrix.cu @@ -77,9 +77,9 @@ protected: if (workspaceSz != 0) MLCommon::allocate(pWorkspace, workspaceSz); - MLCommon::updateDeviceAsync(dYHat, &y_hat[0], numElements, stream); - MLCommon::updateDeviceAsync(dY, &y[0], numElements, stream); - MLCommon::updateDeviceAsync(dGoldenOutput, hGoldenOutput, + MLCommon::updateDevice(dYHat, &y_hat[0], numElements, stream); + MLCommon::updateDevice(dY, &y[0], numElements, stream); + MLCommon::updateDevice(dGoldenOutput, hGoldenOutput, numUniqueClasses*numUniqueClasses, stream); if (params.calcCardinality) { From 8693ce4dfc306c964be216cd7479d5e8cebab48a Mon Sep 17 00:00:00 2001 From: Chirayu Date: Wed, 8 May 2019 17:32:13 -0700 Subject: [PATCH 041/156] Add changelog entry --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 00c40b3542..a94dbc9291 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # cuML 0.8.0 (Date TBD) ## New Features - +- PR #504: Contingency matrix ml-prim ## Improvements From 2ea240d4072873926a6289f2730e08afcea8d85c Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Wed, 8 May 2019 21:25:38 -0400 Subject: [PATCH 042/156] Adding const to all immutable pointer arguments --- ml-prims/src/sparse/coo.h | 36 ++++++++++++++++++------------------ ml-prims/src/sparse/csr.h | 38 +++++++++++++++++++------------------- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/ml-prims/src/sparse/coo.h b/ml-prims/src/sparse/coo.h index eed47016e4..ad4f7eecdc 100644 --- a/ml-prims/src/sparse/coo.h +++ b/ml-prims/src/sparse/coo.h @@ -381,7 +381,7 @@ void coo_sort(int m, int n, int nnz, * @param stream: the cuda stream to use */ template - void coo_sort(COO *in, cudaStream_t stream = 0) { + void coo_sort(COO* const in, cudaStream_t stream = 0) { coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows, in->cols, in->vals, stream); } @@ -447,7 +447,7 @@ __global__ void coo_remove_scalar_kernel( * @param results array to place results */ template -__global__ void coo_row_count_kernel(int *rows, int nnz, +__global__ void coo_row_count_kernel(int* const rows, int nnz, int *results) { int row = (blockIdx.x * TPB_X) + threadIdx.x; if(row < nnz) { @@ -464,7 +464,7 @@ __global__ void coo_row_count_kernel(int *rows, int nnz, * @param stream: cuda stream to use */ template -void coo_row_count(int *rows, int nnz, int *results, +void coo_row_count(int* const rows, int nnz, int *results, cudaStream_t stream) { dim3 grid_rc(MLCommon::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -482,7 +482,7 @@ void coo_row_count(int *rows, int nnz, int *results, * @param stream: cuda stream to use */ template -void coo_row_count(COO *in, int *results, cudaStream_t stream = 0) { +void coo_row_count(COO* const in, int *results, cudaStream_t stream = 0) { dim3 grid_rc(MLCommon::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -491,7 +491,7 @@ void coo_row_count(COO *in, int *results, cudaStream_t stream = 0) { } template -__global__ void coo_row_count_nz_kernel(int *rows, T *vals, int nnz, +__global__ void coo_row_count_nz_kernel(int* 
const rows, T* const vals, int nnz, int *results) { int row = (blockIdx.x * TPB_X) + threadIdx.x; if(row < nnz && vals[row] != 0.0) { @@ -500,7 +500,7 @@ __global__ void coo_row_count_nz_kernel(int *rows, T *vals, int nnz, } template -__global__ void coo_row_count_scalar_kernel(int *rows, T *vals, int nnz, +__global__ void coo_row_count_scalar_kernel(int* const rows, T* const vals, int nnz, T scalar, int *results) { int row = (blockIdx.x * TPB_X) + threadIdx.x; if(row < nnz && vals[row] != scalar) { @@ -518,7 +518,7 @@ __global__ void coo_row_count_scalar_kernel(int *rows, T *vals, int nnz, * @param stream: cuda stream to use */ template -void coo_row_count_scalar(COO *in, T scalar, int *results, +void coo_row_count_scalar(COO* const in, T scalar, int *results, cudaStream_t stream = 0) { dim3 grid_rc(MLCommon::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -540,7 +540,7 @@ void coo_row_count_scalar(COO *in, T scalar, int *results, * @param stream: cuda stream to use */ template -void coo_row_count_scalar(int *rows, T *vals, int nnz, T scalar, +void coo_row_count_scalar(int* const rows, T* const vals, int nnz, T scalar, int *results, cudaStream_t stream = 0) { dim3 grid_rc(MLCommon::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -561,7 +561,7 @@ void coo_row_count_scalar(int *rows, T *vals, int nnz, T scalar, * @param stream: cuda stream to use */ template -void coo_row_count_nz(int *rows, T *vals, int nnz, int *results, +void coo_row_count_nz(int* const rows, T* const vals, int nnz, int *results, cudaStream_t stream = 0) { dim3 grid_rc(MLCommon::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -579,7 +579,7 @@ void coo_row_count_nz(int *rows, T *vals, int nnz, int *results, * @param stream: cuda stream to use */ template -void coo_row_count_nz(COO *in, int *results, cudaStream_t stream = 0) { +void coo_row_count_nz(COO* const in, int *results, cudaStream_t stream = 0) { dim3 grid_rc(MLCommon::ceildiv(in->nnz, TPB_X), 1, 1); dim3 
blk_rc(TPB_X, 1, 1); @@ -656,7 +656,7 @@ void coo_remove_scalar( * @param stream: cuda stream to use */ template -void coo_remove_scalar(COO *in, +void coo_remove_scalar(COO* const in, COO *out, T scalar, cudaStream_t stream) { @@ -703,14 +703,14 @@ void coo_remove_scalar(COO *in, * @param stream: cuda stream to use */ template -void coo_remove_zeros(COO *in, +void coo_remove_zeros(COO* const in, COO *out, cudaStream_t stream) { coo_remove_scalar(in, out, T(0.0), stream); } template -__global__ void from_knn_graph_kernel(long *knn_indices, T *knn_dists, int m, int k, +__global__ void from_knn_graph_kernel(long* const knn_indices, T* const knn_dists, int m, int k, int *rows, int *cols, T *vals) { int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -737,7 +737,7 @@ __global__ void from_knn_graph_kernel(long *knn_indices, T *knn_dists, int m, in * @param vals: output COO val array */ template -void from_knn(long *knn_indices, T *knn_dists, int m, int k, +void from_knn(long* const knn_indices, T* const knn_dists, int m, int k, int *rows, int *cols, T *vals) { dim3 grid(ceildiv(m, 32), 1, 1); @@ -751,7 +751,7 @@ void from_knn(long *knn_indices, T *knn_dists, int m, int k, * into COO format. 
*/ template -void from_knn(long *knn_indices, T *knn_dists, int m, int k, +void from_knn(long* const knn_indices, T* const knn_dists, int m, int k, COO *out) { out->allocate(m*k, m, m); @@ -771,7 +771,7 @@ void from_knn(long *knn_indices, T *knn_dists, int m, int k, */ template void sorted_coo_to_csr( - T *rows, int nnz, T *row_ind, int m, + T* const rows, int nnz, T *row_ind, int m, cudaStream_t stream = 0) { T *row_counts; @@ -798,7 +798,7 @@ void sorted_coo_to_csr( * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(COO *coo, int *row_ind, cudaStream_t stream = 0) { +void sorted_coo_to_csr(COO* const coo, int *row_ind, cudaStream_t stream = 0) { sorted_coo_to_csr(coo->rows, coo->nnz, row_ind, coo->n_rows, stream); } @@ -875,7 +875,7 @@ __global__ void coo_symmetrize_kernel( * @param stream: cuda stream to use */ template -void coo_symmetrize(COO *in, +void coo_symmetrize(COO* const in, COO *out, Lambda reduction_op, // two-argument reducer cudaStream_t stream) { diff --git a/ml-prims/src/sparse/csr.h b/ml-prims/src/sparse/csr.h index b3d14d2e43..21914732ba 100644 --- a/ml-prims/src/sparse/csr.h +++ b/ml-prims/src/sparse/csr.h @@ -62,7 +62,7 @@ class CSR { * @param n_rows: number of rows in the dense matrix * @param n_cols: number of cols in the dense matrix */ - CSR(int *row_ind, int *row_ind_ptr, T *vals, int nnz, int n_rows = -1, int n_cols = -1) { + CSR(int* const row_ind, int* const row_ind_ptr, T* const vals, int nnz, int n_rows = -1, int n_cols = -1) { this->row_ind = row_ind; this->row_ind_ptr = row_ind_ptr; this->vals = vals; @@ -250,8 +250,8 @@ __global__ void csr_row_normalize_l1_kernel( */ template void csr_row_normalize_l1( - int *ia, // csr row ex_scan (sorted by row) - T *vals, int nnz, // array of values and number of non-zeros + int* const ia, // csr row ex_scan (sorted by row) + T* const vals, int nnz, // array of values and number of non-zeros int m, // num rows in csr T *result, cudaStream_t stream) { // output array @@ 
-314,8 +314,8 @@ __global__ void csr_row_normalize_max_kernel( template void csr_row_normalize_max( - int *ia, // csr row ind array (sorted by row) - T *vals, int nnz, // array of values and number of non-zeros + int* const ia, // csr row ind array (sorted by row) + T* const vals, int nnz, // array of values and number of non-zeros int m, // num total rows in csr T *result, cudaStream_t stream) { @@ -495,8 +495,8 @@ __global__ void csr_add_kernel( */ template size_t csr_add_calc_inds( - int *a_ind, int *a_indptr, T *a_val, int nnz1, - int *b_ind, int *b_indptr, T *b_val, int nnz2, + int* const a_ind, int* const a_indptr, T* const a_val, int nnz1, + int* const b_ind, int* const b_indptr, T* const b_val, int nnz2, int m, int *out_ind, cudaStream_t stream ) { @@ -538,16 +538,16 @@ size_t csr_add_calc_inds( * @param b_val: right hand data array * @param nnz2: size of right hand index_ptr and val arrays * @param m: size of output array (number of rows in final matrix) - * @param c_ind: output row_ind arra + * @param c_ind: output row_ind array * @param c_indptr: output ind_ptr array * @param c_val: output data array * @param stream: cuda stream to use */ template void csr_add_finalize( - int *a_ind, int *a_indptr, T *a_val, int nnz1, - int *b_ind, int *b_indptr, T *b_val, int nnz2, - int m, int *c_ind, int *c_indptr, T *c_val, + int* const a_ind, int* const a_indptr, T* const a_val, int nnz1, + int* const b_ind, int* const b_indptr, T* const b_val, int nnz2, + int m, int* const c_ind, int *c_indptr, T *c_val, cudaStream_t stream ) { dim3 grid(MLCommon::ceildiv(m, TPB_X), 1, 1); @@ -562,7 +562,7 @@ void csr_add_finalize( } template -__global__ void csr_row_op_batched_kernel(T *row_ind, T total_rows, +__global__ void csr_row_op_batched_kernel(T* const row_ind, T total_rows, T batchSize, Lambda op) { T row = blockIdx.x*TPB_X + threadIdx.x; if(row < batchSize) { @@ -583,7 +583,7 @@ __global__ void csr_row_op_batched_kernel(T *row_ind, T total_rows, * @param stream cuda 
stream to use */ template -void csr_row_op_batched(T *row_ind, T total_rows, T batchSize, +void csr_row_op_batched(T* const row_ind, T total_rows, T batchSize, Lambda op, cudaStream_t stream) { dim3 grid(MLCommon::ceildiv(batchSize, TPB_X), 1, 1); @@ -604,7 +604,7 @@ void csr_row_op_batched(T *row_ind, T total_rows, T batchSize, * @param stream cuda stream to use */ template -void csr_row_op(T *row_ind, T n_rows, Lambda op, cudaStream_t stream) { +void csr_row_op(T* const row_ind, T n_rows, Lambda op, cudaStream_t stream) { csr_row_op_batched(row_ind, n_rows, n_rows, op, stream); } @@ -623,8 +623,8 @@ void csr_row_op(T *row_ind, T n_rows, Lambda op, cudaStream_t stream) { */ template -void csr_adj_graph_batched(T *row_ind, T total_rows, T batchSize, - bool *adj, T *row_ind_ptr, Lambda fused_op, cudaStream_t stream) { +void csr_adj_graph_batched(T* const row_ind, T total_rows, T batchSize, + bool* const adj, T *row_ind_ptr, Lambda fused_op, cudaStream_t stream) { csr_row_op_batched(row_ind, total_rows, batchSize, [fused_op, adj, total_rows, row_ind_ptr, batchSize] __device__ @@ -654,8 +654,8 @@ void csr_adj_graph_batched(T *row_ind, T total_rows, T batchSize, * @param stream cuda stream to use */ template -void csr_adj_graph(T *row_ind, T n_rows, - bool *adj, T *row_ind_ptr, cudaStream_t stream) { +void csr_adj_graph(T* const row_ind, T n_rows, + bool* const adj, T *row_ind_ptr, cudaStream_t stream) { csr_adj_graph_batched(row_ind, n_rows, n_rows, adj, row_ind_ptr, stream); } @@ -757,7 +757,7 @@ __global__ void weak_cc_init_all_kernel(Type *labels, bool *fa, bool *xa, template void weak_cc_label_batched(Type *labels, - Type *row_ind, Type *row_ind_ptr, Type nnz, Type N, + Type* const row_ind, Type* const row_ind_ptr, Type nnz, Type N, WeakCCState *state, Type startVertexId, Type batchSize, cudaStream_t stream, Lambda filter_op) { From 3b1469c3a747a3001e4f3588740081a508509708 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Wed, 8 May 2019 22:00:21 -0400 Subject: 
[PATCH 043/156] cumlhandle added to sgd --- cuML/src/glm/preprocess.h | 58 +++--- cuML/src/solver/sgd.h | 260 +++++++++++++++------------ ml-prims/src/functions/hinge.h | 41 ++--- ml-prims/src/functions/linearReg.h | 42 ++--- ml-prims/src/functions/logisticReg.h | 41 ++--- 5 files changed, 223 insertions(+), 219 deletions(-) diff --git a/cuML/src/glm/preprocess.h b/cuML/src/glm/preprocess.h index c5cfb7997c..8a903825af 100644 --- a/cuML/src/glm/preprocess.h +++ b/cuML/src/glm/preprocess.h @@ -32,11 +32,13 @@ namespace GLM { using namespace MLCommon; template -void preProcessData(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, math_t *labels, - math_t *intercept, math_t *mu_input, math_t *mu_labels, math_t *norm2_input, - bool fit_intercept, bool normalize, cudaStream_t stream) { - auto cublas_handle = handle.getCublasHandle(); - auto cusolver_handle = handle.getcusolverDnHandle(); +void preProcessData(const cumlHandle_impl& handle, math_t *input, int n_rows, + int n_cols, math_t *labels, math_t *intercept, math_t *mu_input, + math_t *mu_labels, math_t *norm2_input, bool fit_intercept, + bool normalize, cudaStream_t stream) { + + auto cublas_handle = handle.getCublasHandle(); + auto cusolver_handle = handle.getcusolverDnHandle(); ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); @@ -45,50 +47,54 @@ void preProcessData(const cumlHandle_impl& handle, math_t *input, int n_rows, in if (fit_intercept) { Stats::mean(mu_input, input, n_cols, n_rows, false, false, stream); - Stats::meanCenter(input, input, mu_input, n_cols, n_rows, false, true, stream); + Stats::meanCenter(input, input, mu_input, n_cols, n_rows, false, true, + stream); Stats::mean(mu_labels, labels, 1, n_rows, false, false, stream); - Stats::meanCenter(labels, labels, mu_labels, 1, n_rows, false, true, stream); + Stats::meanCenter(labels, labels, mu_labels, 1, n_rows, false, true, + stream); if (normalize) { LinAlg::colNorm(norm2_input, input, n_cols, 
n_rows, LinAlg::L2Norm, false, - stream, - []__device__(math_t v){ return MLCommon::mySqrt(v); }); - Matrix::matrixVectorBinaryDivSkipZero(input, norm2_input, n_rows, n_cols, false, true, stream, true); + stream, + []__device__(math_t v) {return MLCommon::mySqrt(v);}); + Matrix::matrixVectorBinaryDivSkipZero(input, norm2_input, n_rows, + n_cols, false, true, stream, true); } } } template -void postProcessData(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, math_t *labels, math_t *coef, - math_t *intercept, math_t *mu_input, math_t *mu_labels, math_t *norm2_input, +void postProcessData(const cumlHandle_impl& handle, math_t *input, int n_rows, + int n_cols, math_t *labels, math_t *coef, math_t *intercept, + math_t *mu_input, math_t *mu_labels, math_t *norm2_input, bool fit_intercept, bool normalize, cudaStream_t stream) { - auto cublas_handle = handle.getCublasHandle(); - auto cusolver_handle = handle.getcusolverDnHandle(); + + auto cublas_handle = handle.getCublasHandle(); + auto cusolver_handle = handle.getcusolverDnHandle(); ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two"); - math_t *d_intercept; - allocate(d_intercept, 1); + auto allocator = handle.getDeviceAllocator(); + device_buffer d_intercept(allocator, stream, 1); if (normalize) { - Matrix::matrixVectorBinaryMult(input, norm2_input, n_rows, n_cols, false, true, stream); - Matrix::matrixVectorBinaryDivSkipZero(coef, norm2_input, 1, n_cols, - false, true, stream, true); + Matrix::matrixVectorBinaryMult(input, norm2_input, n_rows, n_cols, + false, true, stream); + Matrix::matrixVectorBinaryDivSkipZero(coef, norm2_input, 1, n_cols, + false, true, stream, true); } - LinAlg::gemm(mu_input, 1, n_cols, coef, d_intercept, 1, 1, - CUBLAS_OP_N, CUBLAS_OP_N, cublas_handle, stream); + LinAlg::gemm(mu_input, 1, n_cols, coef, d_intercept.data(), 1, 1, + CUBLAS_OP_N, CUBLAS_OP_N, 
cublas_handle, stream); - LinAlg::subtract(d_intercept, mu_labels, d_intercept, 1, stream); - updateHost(intercept, d_intercept, 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - if (d_intercept != NULL) - cudaFree(d_intercept); + LinAlg::subtract(d_intercept.data(), mu_labels, d_intercept.data(), 1, + stream); + updateHost(intercept, d_intercept.data(), 1, stream); Stats::meanAdd(input, input, mu_input, n_cols, n_rows, false, true, stream); Stats::meanAdd(labels, labels, mu_labels, 1, n_rows, false, true, stream); diff --git a/cuML/src/solver/sgd.h b/cuML/src/solver/sgd.h index 2d5fea507c..a87ece412b 100644 --- a/cuML/src/solver/sgd.h +++ b/cuML/src/solver/sgd.h @@ -42,59 +42,87 @@ namespace Solver { using namespace MLCommon; +/** + * Fits a linear, lasso, and elastic-net regression model using Coordinate Descent solver + * @param cumlHandle_impl + * Reference of cumlHandle + * @param input + * pointer to an array in column-major format (size of n_rows, n_cols) + * @param n_rows + * n_samples or rows in input + * @param n_cols + * n_features or columns in X + * @param labels + * pointer to an array for labels (size of n_rows) + * @param coef + * pointer to an array for coefficients (size of n_cols). This will be filled with coefficients + * once the function is executed. + * @param intercept + * pointer to a scalar for intercept. This will be filled + * once the function is executed + * @param fit_intercept + * boolean parameter to control if the intercept will be fitted or not + * @param batch_size + * number of rows in the minibatch + * @param epochs + * number of iterations that the solver will run + * @param lr_type + * type of the learning rate function (i.e. OPTIMAL, CONSTANT, INVSCALING, ADAPTIVE) + * @param eta0 + * learning rate for contant lr_type. It's used to calculate learning rate function for other types of lr_type + * @param power_t + * power value in the INVSCALING lr_type + * @param loss + * enum to use different loss functions. 
+ * @param penalty + * None, L1, L2, or Elastic-net penalty + * @param alpha + * alpha value in L1 + * @param l1_ratio + * ratio of alpha will be used for L1. (1 - l1_ratio) * alpha will be used for L2. + * @param shuffle + * boolean parameter to control whether coordinates will be picked randomly or not. + * @param tol + * tolerance to stop the solver + * @param n_iter_no_change + * solver stops if there is no update greater than tol after n_iter_no_change iterations + * @param stream + * cuda stream + */ template -void sgdFit(const cumlHandle_impl& handle, - math_t *input, - int n_rows, - int n_cols, - math_t *labels, - math_t *coef, - math_t *intercept, - bool fit_intercept, - int batch_size, - int epochs, - ML::lr_type lr_type, - math_t eta0, - math_t power_t, - ML::loss_funct loss, - Functions::penalty penalty, - math_t alpha, - math_t l1_ratio, - bool shuffle, - math_t tol, - int n_iter_no_change, - cudaStream_t stream) { +void sgdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, + int n_cols, math_t *labels, math_t *coef, math_t *intercept, + bool fit_intercept, int batch_size, int epochs, ML::lr_type lr_type, + math_t eta0, math_t power_t, ML::loss_funct loss, + Functions::penalty penalty, math_t alpha, math_t l1_ratio, bool shuffle, + math_t tol, int n_iter_no_change, cudaStream_t stream) { ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two"); - math_t *mu_input = NULL; - math_t *mu_labels = NULL; - math_t *norm2_input = NULL; - cublasHandle_t cublas_handle = handle.getCublasHandle(); + auto allocator = handle.getDeviceAllocator(); + device_buffer mu_input(allocator, stream, 0); + device_buffer mu_labels(allocator, stream, 0); + device_buffer norm2_input(allocator, stream, 0); + if (fit_intercept) { - allocate(mu_input, n_cols); - allocate(mu_labels, 1); + mu_input.reserve(n_cols, stream); + mu_labels.reserve(1, stream); - 
GLM::preProcessData(handle, input, n_rows, n_cols, labels, intercept, mu_input, - mu_labels, norm2_input, fit_intercept, false, stream); + GLM::preProcessData(handle, input, n_rows, n_cols, labels, intercept, + mu_input.data(), mu_labels.data(), norm2_input.data(), + fit_intercept, false, stream); } - math_t *grads = NULL; - math_t *input_batch = NULL; - math_t *labels_batch = NULL; - math_t *loss_value = NULL; - int *indices = NULL; - - allocate(grads, n_cols, true); - allocate(indices, batch_size); - allocate(input_batch, batch_size * n_cols); - allocate(labels_batch, batch_size); - allocate(loss_value, 1); + device_buffer grads(allocator, stream, n_cols); + device_buffer indices(allocator, stream, batch_size); + device_buffer input_batch(allocator, stream, batch_size * n_cols); + device_buffer labels_batch(allocator, stream, batch_size); + device_buffer loss_value(allocator, stream, 1); math_t prev_loss_value = math_t(0); math_t curr_loss_value = math_t(0); @@ -131,29 +159,36 @@ void sgdFit(const cumlHandle_impl& handle, if (cbs == 0) break; - updateDevice(indices, &rand_indices[j], cbs, stream); - Matrix::copyRows(input, n_rows, n_cols, input_batch, indices, cbs, stream); - Matrix::copyRows(labels, n_rows, 1, labels_batch, indices, cbs, stream); + updateDevice(indices.data(), &rand_indices[j], cbs, stream); + Matrix::copyRows(input, n_rows, n_cols, input_batch.data(), + indices.data(), cbs, stream); + Matrix::copyRows(labels, n_rows, 1, labels_batch.data(), + indices.data(), cbs, stream); if (loss == ML::loss_funct::SQRD_LOSS) { - Functions::linearRegLossGrads(input_batch, cbs, n_cols, labels_batch, - coef, grads, penalty, alpha, l1_ratio, cublas_handle, stream); + Functions::linearRegLossGrads(input_batch.data(), cbs, n_cols, + labels_batch.data(), coef, grads.data(), penalty, alpha, + l1_ratio, cublas_handle, allocator, stream); } else if (loss == ML::loss_funct::LOG) { - Functions::logisticRegLossGrads(input_batch, cbs, n_cols, labels_batch, - coef, grads, 
penalty, alpha, l1_ratio, cublas_handle, stream); + Functions::logisticRegLossGrads(input_batch.data(), cbs, n_cols, + labels_batch.data(), coef, grads.data(), penalty, alpha, + l1_ratio, cublas_handle, allocator, stream); } else if (loss == ML::loss_funct::HINGE) { - Functions::hingeLossGrads(input_batch, cbs, n_cols, labels_batch, - coef, grads, penalty, alpha, l1_ratio, cublas_handle, stream); + Functions::hingeLossGrads(input_batch.data(), cbs, n_cols, + labels_batch.data(), coef, grads.data(), penalty, alpha, + l1_ratio, cublas_handle, allocator, stream); } else { ASSERT(false, "sgd.h: Other loss functions have not been implemented yet!"); } if (lr_type != ML::lr_type::ADAPTIVE) - learning_rate = calLearningRate(lr_type, eta0, power_t, alpha, t); + learning_rate = calLearningRate(lr_type, eta0, power_t, alpha, + t); - LinAlg::scalarMultiply(grads, grads, learning_rate, n_cols, stream); - LinAlg::subtract(coef, coef, grads, n_cols, stream); + LinAlg::scalarMultiply(grads.data(), grads.data(), learning_rate, + n_cols, stream); + LinAlg::subtract(coef, coef, grads.data(), n_cols, stream); j = j + cbs; t = t + 1; @@ -161,59 +196,47 @@ void sgdFit(const cumlHandle_impl& handle, if (tol > math_t(0)) { if (loss == ML::loss_funct::SQRD_LOSS) { - Functions::linearRegLoss(input, n_rows, n_cols, labels, coef, loss_value, - penalty, alpha, l1_ratio, cublas_handle, stream); + Functions::linearRegLoss(input, n_rows, n_cols, labels, coef, + loss_value.data(), penalty, alpha, l1_ratio, + cublas_handle, allocator, stream); } else if (loss == ML::loss_funct::LOG) { - Functions::logisticRegLoss(input, n_rows, n_cols, labels, coef, loss_value, - penalty, alpha, l1_ratio, cublas_handle, stream); + Functions::logisticRegLoss(input, n_rows, n_cols, labels, coef, + loss_value.data(), penalty, alpha, l1_ratio, + cublas_handle, allocator, stream); } else if (loss == ML::loss_funct::HINGE) { - Functions::hingeLoss(input, n_rows, n_cols, labels, coef, loss_value, - penalty, alpha, 
l1_ratio, cublas_handle, stream); + Functions::hingeLoss(input, n_rows, n_cols, labels, coef, + loss_value.data(), penalty, alpha, l1_ratio, + cublas_handle, allocator, stream); } - updateHost(&curr_loss_value, loss_value, 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + updateHost(&curr_loss_value, loss_value.data(), 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); if (i > 0) { - if (curr_loss_value > (prev_loss_value - tol)) { - n_iter_no_change_curr = n_iter_no_change_curr + 1; - if (n_iter_no_change_curr > n_iter_no_change) { - if (lr_type == ML::lr_type::ADAPTIVE && learning_rate > math_t(1e-6)) { - learning_rate = learning_rate / math_t(5); - n_iter_no_change_curr = 0; - } else { - break; - } - } - } else { - n_iter_no_change_curr = 0; - } + if (curr_loss_value > (prev_loss_value - tol)) { + n_iter_no_change_curr = n_iter_no_change_curr + 1; + if (n_iter_no_change_curr > n_iter_no_change) { + if (lr_type == ML::lr_type::ADAPTIVE + && learning_rate > math_t(1e-6)) { + learning_rate = learning_rate / math_t(5); + n_iter_no_change_curr = 0; + } else { + break; + } + } + } else { + n_iter_no_change_curr = 0; + } } prev_loss_value = curr_loss_value; } } - if (grads != NULL) - CUDA_CHECK(cudaFree(grads)); - if (indices != NULL) - CUDA_CHECK(cudaFree(indices)); - if (input_batch != NULL) - CUDA_CHECK(cudaFree(input_batch)); - if (labels_batch != NULL) - CUDA_CHECK(cudaFree(labels_batch)); - if (loss_value != NULL) - CUDA_CHECK(cudaFree(loss_value)); - if (fit_intercept) { - GLM::postProcessData(handle, input, n_rows, n_cols, labels, coef, intercept, - mu_input, mu_labels, norm2_input, fit_intercept, false, - stream); - - if (mu_input != NULL) - CUDA_CHECK(cudaFree(mu_input)); - if (mu_labels != NULL) - CUDA_CHECK(cudaFree(mu_labels)); + GLM::postProcessData(handle, input, n_rows, n_cols, labels, coef, + intercept, mu_input.data(), mu_labels.data(), + norm2_input.data(), fit_intercept, false, stream); } else { *intercept = math_t(0); } @@ -222,8 
+245,8 @@ void sgdFit(const cumlHandle_impl& handle, template void sgdPredict(const cumlHandle_impl& handle, const math_t *input, int n_rows, - int n_cols, const math_t *coef, math_t intercept, math_t *preds, ML::loss_funct loss, - cudaStream_t stream) { + int n_cols, const math_t *coef, math_t intercept, math_t *preds, + ML::loss_funct loss, cudaStream_t stream) { ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); @@ -233,38 +256,43 @@ void sgdPredict(const cumlHandle_impl& handle, const math_t *input, int n_rows, cublasHandle_t cublas_handle = handle.getCublasHandle(); if (loss == ML::loss_funct::SQRD_LOSS) { - Functions::linearRegH(input, n_rows, n_cols, coef, preds, intercept, cublas_handle, stream); + Functions::linearRegH(input, n_rows, n_cols, coef, preds, intercept, + cublas_handle, stream); } else if (loss == ML::loss_funct::LOG) { - Functions::logisticRegH(input, n_rows, n_cols, coef, preds, intercept, cublas_handle, stream); + Functions::logisticRegH(input, n_rows, n_cols, coef, preds, intercept, + cublas_handle, stream); } else if (loss == ML::loss_funct::HINGE) { - Functions::hingeH(input, n_rows, n_cols, coef, preds, intercept, cublas_handle, stream); + Functions::hingeH(input, n_rows, n_cols, coef, preds, intercept, + cublas_handle, stream); } } template -void sgdPredictBinaryClass(const cumlHandle_impl& handle, const math_t *input, int n_rows, int n_cols, const math_t *coef, - math_t intercept, math_t *preds, ML::loss_funct loss, cudaStream_t stream) { +void sgdPredictBinaryClass(const cumlHandle_impl& handle, const math_t *input, + int n_rows, int n_cols, const math_t *coef, math_t intercept, + math_t *preds, ML::loss_funct loss, cudaStream_t stream) { - sgdPredict(handle, input, n_rows, n_cols, coef, intercept, preds, loss, stream); + sgdPredict(handle, input, n_rows, n_cols, coef, intercept, preds, loss, + stream); math_t scalar = math_t(1); if (loss == ML::loss_funct::SQRD_LOSS || loss == ML::loss_funct::LOG) { - 
LinAlg::unaryOp(preds, preds, n_rows, [scalar] __device__ (math_t in) { - if (in >= math_t(0.5)) - return math_t(1); - else - return math_t(0); - }, - stream); - } else if (loss == ML::loss_funct::HINGE) { - LinAlg::unaryOp(preds, preds, n_rows, [scalar] __device__ (math_t in) { - if (in >= math_t(0.0)) - return math_t(1); - else - return math_t(0); - }, - stream); - } + LinAlg::unaryOp(preds, preds, n_rows, [scalar] __device__ (math_t in) { + if (in >= math_t(0.5)) + return math_t(1); + else + return math_t(0); + }, + stream); + } else if (loss == ML::loss_funct::HINGE) { + LinAlg::unaryOp(preds, preds, n_rows, [scalar] __device__ (math_t in) { + if (in >= math_t(0.0)) + return math_t(1); + else + return math_t(0); + }, + stream); +} } diff --git a/ml-prims/src/functions/hinge.h b/ml-prims/src/functions/hinge.h index ad1ef29ea7..56caf99f5c 100644 --- a/ml-prims/src/functions/hinge.h +++ b/ml-prims/src/functions/hinge.h @@ -78,21 +78,20 @@ void hingeH(const math_t *input, idx_type n_rows, idx_type n_cols, template void hingeLossGrads(math_t *input, int n_rows, int n_cols, const math_t *labels, const math_t *coef, math_t *grads, penalty pen, - math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, cudaStream_t stream) { + math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, + std::shared_ptr allocator, cudaStream_t stream) { - math_t *labels_pred = NULL; - allocate(labels_pred, n_rows); - math_t *input_t = NULL; - allocate(input_t, n_rows * n_cols); + device_buffer labels_pred(allocator, stream, n_rows); + device_buffer input_t(allocator, stream, n_rows * n_cols); - LinAlg::gemm(input, n_rows, n_cols, coef, labels_pred, n_rows, 1, CUBLAS_OP_N, + LinAlg::gemm(input, n_rows, n_cols, coef, labels_pred.data(), n_rows, 1, CUBLAS_OP_N, CUBLAS_OP_N, cublas_handle, stream); - LinAlg::eltwiseMultiply(labels_pred, labels_pred, labels, n_rows, stream); + LinAlg::eltwiseMultiply(labels_pred.data(), labels_pred.data(), labels, n_rows, stream); - 
LinAlg::transpose(input, input_t, n_rows, n_cols, cublas_handle, stream); - hingeLossGradMult(input_t, labels, labels_pred, n_cols, n_rows, stream); - LinAlg::transpose(input_t, input, n_cols, n_rows, cublas_handle, stream); + LinAlg::transpose(input, input_t.data(), n_rows, n_cols, cublas_handle, stream); + hingeLossGradMult(input_t.data(), labels, labels_pred.data(), n_cols, n_rows, stream); + LinAlg::transpose(input_t.data(), input, n_cols, n_rows, cublas_handle, stream); Stats::mean(grads, input, n_cols, n_rows, false, false, stream); @@ -115,30 +114,25 @@ void hingeLossGrads(math_t *input, int n_rows, int n_cols, CUDA_CHECK(cudaFree(pen_grads)); } - if (labels_pred != NULL) - CUDA_CHECK(cudaFree(labels_pred)); - - if (input_t != NULL) - CUDA_CHECK(cudaFree(input_t)); } template void hingeLoss(math_t *input, int n_rows, int n_cols, const math_t *labels, const math_t *coef, math_t *loss, penalty pen, - math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, cudaStream_t stream) { + math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, + std::shared_ptr allocator, cudaStream_t stream) { - math_t *labels_pred = NULL; - allocate(labels_pred, n_rows); + device_buffer labels_pred(allocator, stream, n_rows); - LinAlg::gemm(input, n_rows, n_cols, coef, labels_pred, n_rows, 1, CUBLAS_OP_N, + LinAlg::gemm(input, n_rows, n_cols, coef, labels_pred.data(), n_rows, 1, CUBLAS_OP_N, CUBLAS_OP_N, cublas_handle, stream); - LinAlg::eltwiseMultiply(labels_pred, labels_pred, labels, n_rows, stream); + LinAlg::eltwiseMultiply(labels_pred.data(), labels_pred.data(), labels, n_rows, stream); - hingeLossSubtract(labels_pred, labels_pred, math_t(1), n_rows, stream); + hingeLossSubtract(labels_pred.data(), labels_pred.data(), math_t(1), n_rows, stream); - Stats::sum(loss, labels_pred, 1, n_rows, false, stream); + Stats::sum(loss, labels_pred.data(), 1, n_rows, false, stream); math_t *pen_val = NULL; @@ -159,9 +153,6 @@ void hingeLoss(math_t *input, int n_rows, int n_cols, 
CUDA_CHECK(cudaFree(pen_val)); } - if (labels_pred != NULL) - CUDA_CHECK(cudaFree(labels_pred)); - } /** @} */ diff --git a/ml-prims/src/functions/linearReg.h b/ml-prims/src/functions/linearReg.h index 408e222f28..0817db0278 100644 --- a/ml-prims/src/functions/linearReg.h +++ b/ml-prims/src/functions/linearReg.h @@ -50,22 +50,20 @@ void linearRegH(const math_t *input, int n_rows, int n_cols, template void linearRegLossGrads(math_t *input, int n_rows, int n_cols, const math_t *labels, const math_t *coef, math_t *grads, penalty pen, - math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, cudaStream_t stream) { + math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, + std::shared_ptr allocator, cudaStream_t stream) { - math_t *labels_pred = NULL; - allocate(labels_pred, n_rows); + device_buffer labels_pred(allocator, stream, n_rows); + device_buffer input_t(allocator, stream, n_rows * n_cols); - math_t *input_t = NULL; - allocate(input_t, n_rows * n_cols); + linearRegH(input, n_rows, n_cols, coef, labels_pred.data(), math_t(0), cublas_handle, stream); - linearRegH(input, n_rows, n_cols, coef, labels_pred, math_t(0), cublas_handle, stream); - - LinAlg::subtract(labels_pred, labels_pred, labels, n_rows, stream); + LinAlg::subtract(labels_pred.data(), labels_pred.data(), labels, n_rows, stream); // TODO: implement a matrixVectorBinaryMult that runs on rows rather than columns. 
- LinAlg::transpose(input, input_t, n_rows, n_cols, cublas_handle, stream); - Matrix::matrixVectorBinaryMult(input_t, labels_pred, n_cols, n_rows, false, true, stream); - LinAlg::transpose(input_t, input, n_cols, n_rows, cublas_handle, stream); + LinAlg::transpose(input, input_t.data(), n_rows, n_cols, cublas_handle, stream); + Matrix::matrixVectorBinaryMult(input_t.data(), labels_pred.data(), n_cols, n_rows, false, true, stream); + LinAlg::transpose(input_t.data(), input, n_cols, n_rows, cublas_handle, stream); Stats::mean(grads, input, n_cols, n_rows, false, false, stream); LinAlg::scalarMultiply(grads, grads, math_t(2), n_cols, stream); @@ -89,27 +87,22 @@ void linearRegLossGrads(math_t *input, int n_rows, int n_cols, CUDA_CHECK(cudaFree(pen_grads)); } - if (labels_pred != NULL) - CUDA_CHECK(cudaFree(labels_pred)); - - if (input_t != NULL) - CUDA_CHECK(cudaFree(input_t)); } template void linearRegLoss(math_t *input, int n_rows, int n_cols, const math_t *labels, const math_t *coef, math_t *loss, penalty pen, - math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, cudaStream_t stream) { + math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, + std::shared_ptr allocator, cudaStream_t stream) { - math_t *labels_pred = NULL; - allocate(labels_pred, n_rows); + device_buffer labels_pred(allocator, stream, n_rows); - linearRegH(input, n_rows, n_cols, coef, labels_pred, math_t(0), cublas_handle, stream); + linearRegH(input, n_rows, n_cols, coef, labels_pred.data(), math_t(0), cublas_handle, stream); - LinAlg::subtract(labels_pred, labels, labels_pred, n_rows, stream); - Matrix::power(labels_pred, n_rows, stream); - Stats::mean(loss, labels_pred, 1, n_rows, false, false, stream); + LinAlg::subtract(labels_pred.data(), labels, labels_pred.data(), n_rows, stream); + Matrix::power(labels_pred.data(), n_rows, stream); + Stats::mean(loss, labels_pred.data(), 1, n_rows, false, false, stream); math_t *pen_val = NULL; @@ -130,9 +123,6 @@ void linearRegLoss(math_t 
*input, int n_rows, int n_cols, CUDA_CHECK(cudaFree(pen_val)); } - if (labels_pred != NULL) - CUDA_CHECK(cudaFree(labels_pred)); - } /** @} */ diff --git a/ml-prims/src/functions/logisticReg.h b/ml-prims/src/functions/logisticReg.h index 6358557040..4540e3431a 100644 --- a/ml-prims/src/functions/logisticReg.h +++ b/ml-prims/src/functions/logisticReg.h @@ -52,22 +52,20 @@ void logisticRegH(const math_t *input, int n_rows, int n_cols, template void logisticRegLossGrads(math_t *input, int n_rows, int n_cols, const math_t *labels, const math_t *coef, math_t *grads, penalty pen, - math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, cudaStream_t stream) { + math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, + std::shared_ptr allocator, cudaStream_t stream) { - math_t *labels_pred = NULL; - allocate(labels_pred, n_rows); + device_buffer labels_pred(allocator, stream, n_rows); + device_buffer input_t(allocator, stream, n_rows * n_cols); - math_t *input_t = NULL; - allocate(input_t, n_rows * n_cols); + logisticRegH(input, n_rows, n_cols, coef, labels_pred.data(), math_t(0), cublas_handle, stream); - logisticRegH(input, n_rows, n_cols, coef, labels_pred, math_t(0), cublas_handle, stream); - - LinAlg::subtract(labels_pred, labels_pred, labels, n_rows, stream); + LinAlg::subtract(labels_pred.data(), labels_pred.data(), labels, n_rows, stream); // TODO: implement a matrixVectorBinaryMult that runs on rows rather than columns. 
- LinAlg::transpose(input, input_t, n_rows, n_cols, cublas_handle, stream); - Matrix::matrixVectorBinaryMult(input_t, labels_pred, n_cols, n_rows, false, true, stream); - LinAlg::transpose(input_t, input, n_cols, n_rows, cublas_handle, stream); + LinAlg::transpose(input, input_t.data(), n_rows, n_cols, cublas_handle, stream); + Matrix::matrixVectorBinaryMult(input_t.data(), labels_pred.data(), n_cols, n_rows, false, true, stream); + LinAlg::transpose(input_t.data(), input, n_cols, n_rows, cublas_handle, stream); Stats::mean(grads, input, n_cols, n_rows, false, false, stream); @@ -89,12 +87,6 @@ void logisticRegLossGrads(math_t *input, int n_rows, int n_cols, if (pen_grads != NULL) CUDA_CHECK(cudaFree(pen_grads)); } - - if (labels_pred != NULL) - CUDA_CHECK(cudaFree(labels_pred)); - - if (input_t != NULL) - CUDA_CHECK(cudaFree(input_t)); } template @@ -118,15 +110,15 @@ inline void logLoss(double *out, double *label, double *label_pred, int len, cud template void logisticRegLoss(math_t *input, int n_rows, int n_cols, math_t *labels, const math_t *coef, math_t *loss, penalty pen, - math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, cudaStream_t stream) { + math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, + std::shared_ptr allocator, cudaStream_t stream) { - math_t *labels_pred = NULL; - allocate(labels_pred, n_rows); + device_buffer labels_pred(allocator, stream, n_rows); - logisticRegH(input, n_rows, n_cols, coef, labels_pred, math_t(0), cublas_handle, stream); - logLoss(labels_pred, labels, labels_pred, n_rows, stream); + logisticRegH(input, n_rows, n_cols, coef, labels_pred.data(), math_t(0), cublas_handle, stream); + logLoss(labels_pred.data(), labels, labels_pred.data(), n_rows, stream); - Stats::mean(loss, labels_pred, 1, n_rows, false, false, stream); + Stats::mean(loss, labels_pred.data(), 1, n_rows, false, false, stream); math_t *pen_val = NULL; @@ -147,9 +139,6 @@ void logisticRegLoss(math_t *input, int n_rows, int n_cols, 
CUDA_CHECK(cudaFree(pen_val)); } - if (labels_pred != NULL) - CUDA_CHECK(cudaFree(labels_pred)); - } /** @} */ From 2982c3af51bad981786b61f9a0b76b6b7d1fa761 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Thu, 9 May 2019 03:02:52 -0700 Subject: [PATCH 044/156] resolved the compilation issue due to the previous merge --- ml-prims/src/matrix/reverse.h | 6 +++--- ml-prims/test/reverse.cu | 9 ++++++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/ml-prims/src/matrix/reverse.h b/ml-prims/src/matrix/reverse.h index 0ceec939af..7454aa5621 100644 --- a/ml-prims/src/matrix/reverse.h +++ b/ml-prims/src/matrix/reverse.h @@ -104,13 +104,13 @@ void reverseImpl(math_t *out, const math_t *in, int nrows, int ncols, * @param ncols number of cols in the input matrix * @param rowMajor input matrix is row major or not * @param alongRows whether to reverse along rows or not - * @param op the device-lambda to perform any unary operations on each element * @param stream cuda stream where to launch work + * @param op the device-lambda to perform any unary operations on each element */ template , int TPB = 256> void reverse(math_t *out, const math_t *in, int nrows, int ncols, - bool rowMajor, bool alongRows, Lambda op = Nop(), - cudaStream_t stream = 0) { + bool rowMajor, bool alongRows, cudaStream_t stream, + Lambda op = Nop()) { size_t bytes = (rowMajor? 
ncols : nrows) * sizeof(math_t); if (16 / sizeof(math_t) && bytes % 16 == 0) { reverseImpl( diff --git a/ml-prims/test/reverse.cu b/ml-prims/test/reverse.cu index 0d993ec10c..22c06be1e5 100644 --- a/ml-prims/test/reverse.cu +++ b/ml-prims/test/reverse.cu @@ -34,28 +34,31 @@ template class ReverseTest : public ::testing::TestWithParam> { protected: void SetUp() override { + CUDA_CHECK(cudaStreamCreate(&stream)); params = ::testing::TestWithParam>::GetParam(); Random::Rng r(params.seed); int len = params.nrows * params.ncols; allocate(in, len); allocate(out, len); - r.uniform(in, len, T(-1.0), T(1.0)); + r.uniform(in, len, T(-1.0), T(1.0), stream); // applying reverse twice should yield the same output! // this will in turn also verify the inplace mode of reverse method reverse(out, in, params.nrows, params.ncols, params.rowMajor, - params.alongRows); + params.alongRows, stream); reverse(out, out, params.nrows, params.ncols, params.rowMajor, - params.alongRows); + params.alongRows, stream); } void TearDown() override { CUDA_CHECK(cudaFree(in)); CUDA_CHECK(cudaFree(out)); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: ReverseInputs params; T *in, *out; + cudaStream_t stream; }; const std::vector> inputsf = { From 77eb2467e8ab3714df688d07f3d4387cd0859b92 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Thu, 9 May 2019 03:08:53 -0700 Subject: [PATCH 045/156] updated doxygen comment for reverse method --- ml-prims/src/matrix/reverse.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ml-prims/src/matrix/reverse.h b/ml-prims/src/matrix/reverse.h index 7454aa5621..74de862cce 100644 --- a/ml-prims/src/matrix/reverse.h +++ b/ml-prims/src/matrix/reverse.h @@ -94,7 +94,7 @@ void reverseImpl(math_t *out, const math_t *in, int nrows, int ncols, } /** - * @brief perform element-wise binary operation on the input arrays + * @brief Reversal of the input matrix along the specified dimension * @tparam math_t data-type upon which the math operation will be 
performed * @tparam Lambda the device-lambda performing the actual operation * @tparam TPB threads-per-block in the final kernel launched @@ -105,7 +105,8 @@ void reverseImpl(math_t *out, const math_t *in, int nrows, int ncols, * @param rowMajor input matrix is row major or not * @param alongRows whether to reverse along rows or not * @param stream cuda stream where to launch work - * @param op the device-lambda to perform any unary operations on each element + * @param op the device-lambda to perform an optional final unary operation on + * each element after the reverse */ template , int TPB = 256> void reverse(math_t *out, const math_t *in, int nrows, int ncols, From ef41c004cc0357161411842390abade396c38242 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Thu, 9 May 2019 03:10:05 -0700 Subject: [PATCH 046/156] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 265768cda4..77f9e31bda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ - PR #540: Use latest release version in update-version CI script - PR #552: Re-enable assert in kmeans tests with xfail as needed - PR #581: Add shared memory fast col major to row major function back with bound checks +- PR #592: More efficient matrix copy/reverse methods ## Bug Fixes From 85cdef63ddd09f2b57292979167eef2049f103d6 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 10:30:34 -0500 Subject: [PATCH 047/156] FEA Add building prims test to cuML cmake --- cpp/CMakeLists.txt | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 50dbc0c96f..aeeb2a6c47 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -50,7 +50,7 @@ option(BUILD_CUML_TESTS "Build cuML algorithm tests" ON) option(BUILD_PRIM_TESTS "Build ml-prim tests" OFF) -option(BUILD_EXAMPLES "Build C++ API usage examples" OFF) +option(BUILD_CUML_EXAMPLES "Build C++ API 
usage examples" OFF) set(BLAS_LIBRARIES "" CACHE STRING "Location of BLAS library") @@ -241,10 +241,10 @@ include_directories( add_subdirectory(${GTEST_DIR} ${PROJECT_BINARY_DIR}/googletest) -# Append source file in recursive manner, append header files to target for work with them in IDE file(GLOB_RECURSE ml_prims_header "src_prims/*.h" "src_prims/*.hpp") file(GLOB_RECURSE cuml_test_cuda_sources "test/sg/*.cu") file(GLOB_RECURSE cuml_mg_test_cuda_sources "test/mg/*.cu") +file(GLOB_RECURSE mlprims_test_cuda_sources "test/prims/*.cu") ################################################################################################### # - build libcuml++ shared library ------------------------------------------------------------------ @@ -286,7 +286,9 @@ target_link_libraries(${CUML_CPP_TARGET} ${CUML_LINK_LIBRARIES}) ################################################################################################### # - build ml_test executable ---------------------------------------------------------------------- -add_executable(ml_test ${cuml_test_cuda_sources} ${ml_prims_header}) +add_executable(ml_test + ${cuml_test_cuda_sources} + ${ml_prims_header}) target_link_libraries(ml_test ${GTEST_LIBNAME} @@ -306,7 +308,9 @@ target_link_libraries(ml_test ################################################################################################### # - build ml_mg_test executable ------------------------------------------------------------------- -add_executable(ml_mg_test ${cuml_mg_test_cuda_sources} ${ml_prims_header}) +add_executable(ml_mg_test + ${cuml_mg_test_cuda_sources} + ${ml_prims_header}) target_link_libraries(ml_mg_test ${GTEST_LIBNAME} @@ -323,12 +327,31 @@ target_link_libraries(ml_mg_test pthread ${ZLIB_LIBRARIES}) +################################################################################################### +# - build prims_test executable ---------------------------------------------------------------- + +set(MLPRIMS_LINK_LIBRARIES + 
${CUDA_cublas_LIBRARY} + ${CUDA_curand_LIBRARY} + ${CUDA_cusolver_LIBRARY} + ${CUDA_cusparse_LIBRARY} + pthread + ${ZLIB_LIBRARIES}) + +add_executable(prims_test + ${mlprims_test_cuda_sources} + ${ml_prims_header}) + +target_link_libraries(prims_test + ${GTEST_LIBNAME} + ${MLPRIMS_LINK_LIBRARIES}) + ################################################################################################### # - build examples ------------------------------------------------------------------------- -if (DISABLE_EXAMPLES OR ${BUILD_EXAMPLES}) +if (DISABLE_EXAMPLES OR ${BUILD_CUML_EXAMPLES}) add_subdirectory(examples) -endif(DISABLE_EXAMPLES OR ${BUILD_EXAMPLES}) +endif(DISABLE_EXAMPLES OR ${BUILD_CUML_EXAMPLES}) ################################################################################################### # - install targets ------------------------------------------------------------------------------- From bc755db165d1f64ab30cf6a55302c8b7cf520154 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 10:30:59 -0500 Subject: [PATCH 048/156] FIX Remove unused test/prim cmakelists --- cpp/test/prims/CMakeLists.txt | 84 ----------------------------------- 1 file changed, 84 deletions(-) delete mode 100644 cpp/test/prims/CMakeLists.txt diff --git a/cpp/test/prims/CMakeLists.txt b/cpp/test/prims/CMakeLists.txt deleted file mode 100644 index 9bf642c15b..0000000000 --- a/cpp/test/prims/CMakeLists.txt +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2018-2019, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -cmake_minimum_required(VERSION 3.8 FATAL_ERROR) -project(mlcommon_test LANGUAGES CXX CUDA) - -include_directories(${GTEST_DIR}/googletest/include) - -# single gpu unit-tests -# (please keep the filenames in alphabetical order) -add_executable(mlcommon_test - add.cu - binary_op.cu - ternary_op.cu - coalesced_reduction.cu - cuda_utils.cu - columnSort.cu - coo.cu - cov.cu - csr.cu - decoupled_lookback.cu - dist_adj.cu - dist_cos.cu - dist_euc_exp.cu - dist_euc_unexp.cu - dist_l1.cu - divide.cu - eig.cu - eltwise.cu - eltwise2d.cu - gather.cu - gemm.cu - grid_sync.cu - hinge.cu - kselection.cu - linearReg.cu - log.cu - logisticReg.cu - map_then_reduce.cu - math.cu - matrix.cu - matrix_vector_op.cu - mean.cu - mean_center.cu - minmax.cu - mvg.cu - multiply.cu - norm.cu - penalty.cu - permute.cu - power.cu - reduce.cu - reduce_rows_by_key.cu - rng.cu - rng_int.cu - rsvd.cu - score.cu - sigmoid.cu - sqrt.cu - stddev.cu - strided_reduction.cu - subtract.cu - sum.cu - svd.cu - transpose.cu - unary_op.cu - weighted_mean.cu - ) - -target_link_libraries(mlcommon_test - ${GTEST_LIBNAME} - ${MLPRIMS_LIBS}) From 4fd89953f481075884d249cff51bd229ff37f11c Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 10:32:16 -0500 Subject: [PATCH 049/156] FIX Move unused prim tests to 'test/old' folder --- .../{prims => old}/add_and_sub_dev_scalar.cu | 0 cpp/test/{prims => old}/vector_broadcast.cu | 0 cpp/test/prims/opg_distance.cu | 455 ------------------ 3 files changed, 455 deletions(-) rename cpp/test/{prims => old}/add_and_sub_dev_scalar.cu (100%) rename cpp/test/{prims => old}/vector_broadcast.cu (100%) delete mode 100644 cpp/test/prims/opg_distance.cu diff --git a/cpp/test/prims/add_and_sub_dev_scalar.cu b/cpp/test/old/add_and_sub_dev_scalar.cu similarity index 100% rename from cpp/test/prims/add_and_sub_dev_scalar.cu rename to 
cpp/test/old/add_and_sub_dev_scalar.cu diff --git a/cpp/test/prims/vector_broadcast.cu b/cpp/test/old/vector_broadcast.cu similarity index 100% rename from cpp/test/prims/vector_broadcast.cu rename to cpp/test/old/vector_broadcast.cu diff --git a/cpp/test/prims/opg_distance.cu b/cpp/test/prims/opg_distance.cu deleted file mode 100644 index f0e4998db5..0000000000 --- a/cpp/test/prims/opg_distance.cu +++ /dev/null @@ -1,455 +0,0 @@ -#include -#include "distance/distance.h" -#include "test_utils.h" -#include "random/rng.h" -#include "cuda_utils.h" -#include "nvToolsExt.h" - -#include -#include -#include -#include -#include - - -namespace MLCommon { -namespace Distance { - -// TODO(minseok): double check the result difference between CPU and GPU for the small matrx size -// TODO(minseok): it would be useful if we have a macro for the current device recovery -// TODO(minseok): consider non-power of two cases for m, n, and n_gpu - -template -__global__ void naiveDistanceKernel(Type* out, const Type* x, const Type* y, - int m, int n, int k, DistanceType type) { - int midx = threadIdx.x + blockIdx.x * blockDim.x; - int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if(midx >= m || nidx >= n) - return; - Type acc = Type(0); - for(int i=0; i -__global__ void naiveL1DistanceKernel( - Type* out, const Type* x, const Type* y, - int m, int n, int k) -{ - int midx = threadIdx.x + blockIdx.x * blockDim.x; - int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if(midx >= m || nidx >= n) { - return; - } - - Type acc = Type(0); - for(int i = 0; i < k; ++i) { - auto a = x[i + midx * k]; - auto b = y[i + nidx * k]; - auto diff = (a > b) ? 
(a - b) : (b - a); - acc += diff; - } - - out[midx * n + nidx] = acc; -} - -template -__global__ void naiveCosineDistanceKernel( - Type* out, const Type* x, const Type* y, - int m, int n, int k) -{ - int midx = threadIdx.x + blockIdx.x * blockDim.x; - int nidx = threadIdx.y + blockIdx.y * blockDim.y; - if(midx >= m || nidx >= n) { - return; - } - - Type acc_a = Type(0); - Type acc_b = Type(0); - Type acc_ab = Type(0); - - for(int i = 0; i < k; ++i) { - auto a = x[i + midx * k]; - auto b = y[i + nidx * k]; - - acc_a += a * a; - acc_b += b * b; - acc_ab += a * b; - } - - out[midx * n + nidx] = acc_ab / (sqrt(acc_a) * sqrt(acc_b)); -} - -template -void naiveDistance(Type* out, const Type* x, const Type* y, int m, int n, int k, - DistanceType type) { - static const dim3 TPB(16, 32, 1); - dim3 nblks(ceildiv(m, (int)TPB.x), ceildiv(n, (int)TPB.y), 1); - - switch (type) { - case EucUnexpandedL1: - naiveL1DistanceKernel<<>>(out, x, y, m, n, k); - break; - case EucUnexpandedL2Sqrt: - case EucUnexpandedL2: - case EucExpandedL2Sqrt: - case EucExpandedL2: - naiveDistanceKernel<<>>(out, x, y, m, n, k, type); - break; - case EucExpandedCosine: - naiveCosineDistanceKernel<<>>(out, x, y, m, n, k); - break; - default: - FAIL() << "should be here\n"; - } - CUDA_CHECK(cudaPeekAtLastError()); -} - -enum PartitionScheme { - XOnly, - YOnly, - Both -}; - -template -struct OpgDistanceInputs { - T tolerance; - int m, n, k; - DistanceType type; - unsigned long long int seed; - int n_gpu; - PartitionScheme scheme; -}; - -template -::std::ostream& operator<<(::std::ostream& os, const OpgDistanceInputs& dims) { - return os; -} - -template -struct Task { - T* d_X; - T* d_Y; - T* dist; - DistanceType type; - int m, n, k; -}; - - -template -class Worker { - public: - typedef cutlass::Shape<8, 128, 128> OutputTile_t; - Worker(int device_id); - ~Worker(); - void enqueue(Task* t); - void execute(); - Task* dequeue(); - ::testing::AssertionResult verify(T tolerance); - bool empty() const; - int 
getDeviceId() const; - private: - int device_id_; - T* workspace_; - size_t worksize_; - DistanceType type_; - std::list*> active_queue_; -}; - - -template -Worker::Worker(int device_id) - : device_id_(device_id), - workspace_(nullptr), - worksize_(0) { -} - -template -Worker::~Worker() { - if(workspace_) - CUDA_CHECK(cudaFree(workspace_)); -} - -template -void Worker::enqueue(Task* t) { - ASSERT(t != nullptr, "t == nullptr"); - - int current_device_id; - CUDA_CHECK(cudaGetDevice(¤t_device_id)); - - size_t new_worksize = 0; - - CUDA_CHECK(cudaSetDevice(device_id_)); - - distance,Task, OutputTile_t >(t->d_X, t->d_Y, t->m, t->n, t->k, - *t, *t, t->type, - nullptr, new_worksize); - if (new_worksize != 0) { - if(new_worksize > worksize_) { - if(worksize_ != 0) - CUDA_CHECK(cudaFree(workspace_)); - worksize_ = new_worksize; - workspace_ = nullptr; - allocate(workspace_, worksize_); - } - } - - active_queue_.push_back(t); - - CUDA_CHECK(cudaSetDevice(current_device_id)); -} - -template -void Worker::execute() { - int current_device_id; - CUDA_CHECK(cudaGetDevice(¤t_device_id)); - - CUDA_CHECK(cudaSetDevice(device_id_)); - - for(auto it = active_queue_.begin(); it != active_queue_.end(); ++it) { - Task* t = *it; - distance,Task, OutputTile_t >(t->d_X, t->d_Y, t->m, t->n, t->k, - *t, *t, t->type, - (void*)workspace_, worksize_); - } - - CUDA_CHECK(cudaSetDevice(current_device_id)); -} - -template -Task* Worker::dequeue() { - if(empty()) - return nullptr; - - Task* t = active_queue_.front(); - active_queue_.pop_front(); - return t; -} - -template -::testing::AssertionResult Worker::verify(T tolerance) { - int current_device_id; - CUDA_CHECK(cudaGetDevice(¤t_device_id)); - - CUDA_CHECK(cudaSetDevice(device_id_)); - - auto ret = ::testing::AssertionSuccess(); - for(auto it = active_queue_.begin(); it != active_queue_.end(); ++it) { - Task* t = *it; - T* dist_ref = nullptr; - allocate(dist_ref, t->m*t->n); - naiveDistance(dist_ref, t->d_X, t->d_Y, t->m, t->n, t->k, t->type); - 
auto ret = devArrMatch(dist_ref, t->dist, t->m, t->n, CompareApprox(tolerance)); - CUDA_CHECK(cudaFree(dist_ref)); - if(ret != ::testing::AssertionSuccess()) - break; - } - - CUDA_CHECK(cudaSetDevice(current_device_id)); - - return ret; -} - -template -bool Worker::empty() const { - return active_queue_.empty(); -} - -template -int Worker::getDeviceId() const { - return device_id_; -} - -void getNumberOfTiles(PartitionScheme scheme, - int m, int n, int k, int n_gpu, int& n_vertical_tiles, int& n_horizontal_tiles) { - switch(scheme) { - case XOnly: - n_vertical_tiles = n_gpu; - n_horizontal_tiles = 1; - break; - case YOnly: - n_vertical_tiles = 1; - n_horizontal_tiles = n_gpu; - break; - case Both: - n_vertical_tiles = std::max(1, m / 4096); - n_horizontal_tiles = std::max(1, n / 4096); - break; - default: - ASSERT(false, "Invalid PartitionScheme '%d'!", scheme); - } -} - - -template -void assignTasks(std::vector*>& workers, int n_gpu, - int m, int n, int k, DistanceType type, PartitionScheme scheme, unsigned long long int seed) { - ASSERT(workers.size() == n_gpu, "# workers(%d) != # GPUs(%d)", workers.size(), n_gpu); - - int current_device_id; - CUDA_CHECK(cudaGetDevice(¤t_device_id)); - - int n_vertical_tiles = 0, n_horizontal_tiles = 0; - getNumberOfTiles(scheme, m, n, k, n_gpu, n_vertical_tiles, n_horizontal_tiles); - - for(int y=0; y* worker = workers[id]; - ASSERT(id == worker->getDeviceId(), "id(%d) != deviceId(%d)", id, worker->getDeviceId()); - CUDA_CHECK(cudaSetDevice(worker->getDeviceId())); - - Task* task = new Task; - task->m = m / n_vertical_tiles; - task->n = n / n_horizontal_tiles; - task->k = k; - task->type = type; - int x_len = task->m*task->k; - int y_len = task->n*task->k; - int dist_len = task->m*task->n; - allocate(task->d_X, x_len); - allocate(task->d_Y, y_len); - allocate(task->dist, dist_len); - Random::Rng r(seed); - r.uniform(task->d_X, x_len, T(-1.0), T(1.0)); - r.uniform(task->d_Y, y_len, T(-1.0), T(1.0)); - worker->enqueue(task); - } - 
} - - CUDA_CHECK(cudaSetDevice(current_device_id)); -} - -template -void finalizeTasks(std::vector*>& workers) { - int current_device_id; - CUDA_CHECK(cudaGetDevice(¤t_device_id)); - - while(!workers.empty()) { - Worker* worker = workers.back(); - workers.pop_back(); - CUDA_CHECK(cudaSetDevice(worker->getDeviceId())); - while(!worker->empty()) { - Task* task = worker->dequeue(); - CUDA_CHECK(cudaFree(task->d_X)); - CUDA_CHECK(cudaFree(task->d_Y)); - CUDA_CHECK(cudaFree(task->dist)); - delete task; - } - delete worker; - } - - CUDA_CHECK(cudaSetDevice(current_device_id)); -} - -void syncAll(int n_gpu) { - int current_device_id; - CUDA_CHECK(cudaGetDevice(¤t_device_id)); - - for(int i=0; i -class OpgDistanceTest: public ::testing::TestWithParam > { -protected: - void SetUp() override { - // Get the parameters - params = ::testing::TestWithParam>::GetParam(); - int n_gpu = params.n_gpu; - - // Skip the test if # available GPUs is less than the specified one. - int avail_gpu; - CUDA_CHECK(cudaGetDeviceCount(&avail_gpu)); - if(avail_gpu < n_gpu) - GTEST_SKIP(); - - ASSERT(params.m > n_gpu, "Invalid m(%d)", params.m); - - // Initialize all GPU workers and assign tasks to them - for(int i=0; i(i)); - assignTasks(workers, n_gpu, - params.m, params.n, params.k, params.type, params.scheme, params.seed); - - - int n_rep = 1; - float elapsed = 0; - double time_min = 1e100; - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - - for(int r=0; rexecute(); - - syncAll(n_gpu); - - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&elapsed, start, stop); - double time = (double)elapsed / 1000.; - time_min = std::min(time, time_min); - } - - cudaEventDestroy(start); - cudaEventDestroy(stop); - } - - void TearDown() override { - finalizeTasks(workers); - } - -protected: - OpgDistanceInputs params; - std::vector* > workers; -}; - -const std::vector > inputsf = { - {0.001f, 1024, 1024, 1024, EucExpandedL2, 1234ULL, 8, XOnly}, - 
{0.001f, 2048, 2048, 2048, EucExpandedL2, 1234ULL, 8, XOnly}, - {0.001f, 4096, 4096, 4096, EucExpandedL2, 1234ULL, 8, XOnly}, - {0.001f, 8192, 8192, 8192, EucExpandedL2, 1234ULL, 8, XOnly}, - {0.001f, 16384, 16384, 16384, EucExpandedL2, 1234ULL, 8, XOnly}, - - {0.001f, 1024, 1024, 1024, EucExpandedL2, 1234ULL, 8, YOnly}, - {0.001f, 2048, 2048, 2048, EucExpandedL2, 1234ULL, 8, YOnly}, - {0.001f, 4096, 4096, 4096, EucExpandedL2, 1234ULL, 8, YOnly}, - {0.001f, 8192, 8192, 8192, EucExpandedL2, 1234ULL, 8, YOnly}, - {0.001f, 16384, 16384, 16384, EucExpandedL2, 1234ULL, 8, YOnly}, - - {0.001f, 1024, 1024, 1024, EucExpandedL2, 1234ULL, 8, Both}, - {0.001f, 2048, 2048, 2048, EucExpandedL2, 1234ULL, 8, Both}, - {0.001f, 4096, 4096, 4096, EucExpandedL2, 1234ULL, 8, Both}, - {0.001f, 8192, 8192, 8192, EucExpandedL2, 1234ULL, 8, Both}, - {0.001f, 16384, 16384, 16384, EucExpandedL2, 1234ULL, 8, Both}, -}; - -typedef OpgDistanceTest TestF; -TEST_P(TestF, Result) { - // verify the result - for(int i=0; iverify(params.tolerance)); - } -} - -INSTANTIATE_TEST_CASE_P(OpgDistanceTests, TestF, ::testing::ValuesIn(inputsf)); - -} // end namespace Distance -} // end namespace MLCommon From 15351c0066d163d450c384731607426c15f6fab0 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Thu, 9 May 2019 11:43:03 -0400 Subject: [PATCH 050/156] Used new version of matrixVecOp not to do unnecessary transpose operations. 
--- cuML/src/solver/sgd.h | 42 ++++++++++++++++++++++++++++ ml-prims/src/functions/hinge.h | 9 ++---- ml-prims/src/functions/linearReg.h | 8 +----- ml-prims/src/functions/logisticReg.h | 8 +----- 4 files changed, 46 insertions(+), 21 deletions(-) diff --git a/cuML/src/solver/sgd.h b/cuML/src/solver/sgd.h index a87ece412b..7dbf4d7b90 100644 --- a/cuML/src/solver/sgd.h +++ b/cuML/src/solver/sgd.h @@ -243,6 +243,27 @@ void sgdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, } +/** + * Make predictions + * @param cumlHandle_impl + * Reference of cumlHandle + * @param input + * pointer to an array in column-major format (size of n_rows, n_cols) + * @param n_rows + * n_samples or rows in input + * @param n_cols + * n_features or columns in X + * @param coef + * pointer to an array for coefficients (size of n_cols). Calculated in sgdFit function. + * @param intercept + * intercept value calculated in sgdFit function + * @param preds + * pointer to an array for predictions (size of n_rows). This will be filled once the function is executed. + * @param loss + * enum to use different loss functions. Only the linear regression loss function is supported right now. + * @param stream + * cuda stream + */ template void sgdPredict(const cumlHandle_impl& handle, const math_t *input, int n_rows, int n_cols, const math_t *coef, math_t intercept, math_t *preds, @@ -267,6 +288,27 @@ void sgdPredict(const cumlHandle_impl& handle, const math_t *input, int n_rows, } } +/** + * Make binary classifications + * @param cumlHandle_impl + * Reference of cumlHandle + * @param input + * pointer to an array in column-major format (size of n_rows, n_cols) + * @param n_rows + * n_samples or rows in input + * @param n_cols + * n_features or columns in X + * @param coef + * pointer to an array for coefficients (size of n_cols). Calculated in sgdFit function.
+ * @param intercept + * intercept value calculated in sgdFit function + * @param preds + * pointer to an array for predictions (size of n_rows). This will be filled once the function is executed. + * @param loss + * enum to use different loss functions. Only the linear regression loss function is supported right now. + * @param stream + * cuda stream + */ template void sgdPredictBinaryClass(const cumlHandle_impl& handle, const math_t *input, int n_rows, int n_cols, const math_t *coef, math_t intercept, diff --git a/ml-prims/src/functions/hinge.h b/ml-prims/src/functions/hinge.h index 56caf99f5c..dc3aeccac9 100644 --- a/ml-prims/src/functions/hinge.h +++ b/ml-prims/src/functions/hinge.h @@ -38,7 +38,7 @@ namespace Functions { template void hingeLossGradMult(math_t* data, const math_t* vec1, const math_t* vec2, idx_type n_row, idx_type n_col, cudaStream_t stream) { - LinAlg::matrixVectorOp(data, data, vec1, vec2, n_col, n_row, false, true, + LinAlg::matrixVectorOp(data, data, vec1, vec2, n_col, n_row, false, false, [] __device__ (math_t a, math_t b, math_t c) { if (c < math_t(1)) return -a * b; @@ -82,17 +82,12 @@ void hingeLossGrads(math_t *input, int n_rows, int n_cols, std::shared_ptr allocator, cudaStream_t stream) { device_buffer labels_pred(allocator, stream, n_rows); - device_buffer input_t(allocator, stream, n_rows * n_cols); LinAlg::gemm(input, n_rows, n_cols, coef, labels_pred.data(), n_rows, 1, CUBLAS_OP_N, CUBLAS_OP_N, cublas_handle, stream); LinAlg::eltwiseMultiply(labels_pred.data(), labels_pred.data(), labels, n_rows, stream); - - LinAlg::transpose(input, input_t.data(), n_rows, n_cols, cublas_handle, stream); - hingeLossGradMult(input_t.data(), labels, labels_pred.data(), n_cols, n_rows, stream); - LinAlg::transpose(input_t.data(), input, n_cols, n_rows, cublas_handle, stream); - + hingeLossGradMult(input, labels, labels_pred.data(), n_rows, n_cols, stream); Stats::mean(grads, input, n_cols, n_rows, false, false, stream); math_t *pen_grads = NULL; diff
--git a/ml-prims/src/functions/linearReg.h b/ml-prims/src/functions/linearReg.h index 0817db0278..9fd84ab9ba 100644 --- a/ml-prims/src/functions/linearReg.h +++ b/ml-prims/src/functions/linearReg.h @@ -54,16 +54,10 @@ void linearRegLossGrads(math_t *input, int n_rows, int n_cols, std::shared_ptr allocator, cudaStream_t stream) { device_buffer labels_pred(allocator, stream, n_rows); - device_buffer input_t(allocator, stream, n_rows * n_cols); linearRegH(input, n_rows, n_cols, coef, labels_pred.data(), math_t(0), cublas_handle, stream); - LinAlg::subtract(labels_pred.data(), labels_pred.data(), labels, n_rows, stream); - - // TODO: implement a matrixVectorBinaryMult that runs on rows rather than columns. - LinAlg::transpose(input, input_t.data(), n_rows, n_cols, cublas_handle, stream); - Matrix::matrixVectorBinaryMult(input_t.data(), labels_pred.data(), n_cols, n_rows, false, true, stream); - LinAlg::transpose(input_t.data(), input, n_cols, n_rows, cublas_handle, stream); + Matrix::matrixVectorBinaryMult(input, labels_pred.data(), n_rows, n_cols, false, false, stream); Stats::mean(grads, input, n_cols, n_rows, false, false, stream); LinAlg::scalarMultiply(grads, grads, math_t(2), n_cols, stream); diff --git a/ml-prims/src/functions/logisticReg.h b/ml-prims/src/functions/logisticReg.h index 4540e3431a..df4173a303 100644 --- a/ml-prims/src/functions/logisticReg.h +++ b/ml-prims/src/functions/logisticReg.h @@ -56,16 +56,10 @@ void logisticRegLossGrads(math_t *input, int n_rows, int n_cols, std::shared_ptr allocator, cudaStream_t stream) { device_buffer labels_pred(allocator, stream, n_rows); - device_buffer input_t(allocator, stream, n_rows * n_cols); logisticRegH(input, n_rows, n_cols, coef, labels_pred.data(), math_t(0), cublas_handle, stream); - LinAlg::subtract(labels_pred.data(), labels_pred.data(), labels, n_rows, stream); - - // TODO: implement a matrixVectorBinaryMult that runs on rows rather than columns. 
- LinAlg::transpose(input, input_t.data(), n_rows, n_cols, cublas_handle, stream); - Matrix::matrixVectorBinaryMult(input_t.data(), labels_pred.data(), n_cols, n_rows, false, true, stream); - LinAlg::transpose(input_t.data(), input, n_cols, n_rows, cublas_handle, stream); + Matrix::matrixVectorBinaryMult(input, labels_pred.data(), n_rows, n_cols, false, false, stream); Stats::mean(grads, input, n_cols, n_rows, false, false, stream); From 851146dd17c976ce13dd7aa762c9c359aacf21b2 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Tue, 7 May 2019 19:04:15 -0400 Subject: [PATCH 051/156] Added cumlhandle to coordinate descent and stochastic gradient descent --- cuML/src/solver/cd.h | 72 ++--- cuML/src/solver/sgd.h | 312 +++++++++++++-------- cuML/src/solver/solver.cu | 304 +++++++------------- cuML/src/solver/{solver_c.h => solver.hpp} | 36 ++- cuML/test/cd_test.cu | 27 +- cuML/test/sgd.cu | 67 ++--- ml-prims/src/functions/hinge.h | 43 +-- ml-prims/src/functions/linearReg.h | 42 +-- ml-prims/src/functions/logisticReg.h | 41 +-- python/cuml/solvers/cd.pyx | 209 +++++++------- python/cuml/solvers/sgd.pyx | 271 +++++++++--------- 11 files changed, 646 insertions(+), 778 deletions(-) rename cuML/src/solver/{solver_c.h => solver.hpp} (73%) diff --git a/cuML/src/solver/cd.h b/cuML/src/solver/cd.h index a0d6d2428d..c9d50d5477 100644 --- a/cuML/src/solver/cd.h +++ b/cuML/src/solver/cd.h @@ -75,25 +75,12 @@ using namespace MLCommon; * cublas handle * @param cusolver_handle * cusolver handle -*/ + */ template -void cdFit(math_t *input, - int n_rows, - int n_cols, - math_t *labels, - math_t *coef, - math_t *intercept, - bool fit_intercept, - bool normalize, - int epochs, - ML::loss_funct loss, - math_t alpha, - math_t l1_ratio, - bool shuffle, - math_t tol, - cudaStream_t stream, - cublasHandle_t cublas_handle, - cusolverDnHandle_t cusolver_handle) { +void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, + math_t *labels, math_t *coef, math_t *intercept, 
bool fit_intercept, + bool normalize, int epochs, ML::loss_funct loss, math_t alpha, + math_t l1_ratio, bool shuffle, math_t tol, cudaStream_t stream) { ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); @@ -102,6 +89,9 @@ void cdFit(math_t *input, ASSERT(loss == ML::loss_funct::SQRD_LOSS, "Parameter loss: Only SQRT_LOSS function is supported for now"); + cublasHandle_t cublas_handle = handle.getCublasHandle(); + cusolverDnHandle_t cusolver_handle = handle.getcusolverDnHandle(); + math_t *mu_input = nullptr; math_t *mu_labels = nullptr; math_t *norm2_input = nullptr; @@ -110,6 +100,9 @@ void cdFit(math_t *input, math_t *squared = nullptr; math_t *loss_value = nullptr; + //auto allocator = handle.getDeviceAllocator(); + //device_buffer components_all(allocator, stream, len); + allocate(loss_value, 1); allocate(pred, n_rows, true); allocate(residual, n_rows, true); @@ -124,13 +117,9 @@ void cdFit(math_t *input, allocate(norm2_input, n_cols); } - ///@todo: remove this cumlHandle and use the cumlHandle_impl - /// passed to this method instead!! 
- cumlHandle handle; - handle.setStream(stream); - GLM::preProcessData(handle.getImpl(), input, n_rows, n_cols, labels, intercept, - mu_input, mu_labels, norm2_input, fit_intercept, normalize, - stream); + GLM::preProcessData(handle, input, n_rows, n_cols, labels, + intercept, mu_input, mu_labels, norm2_input, fit_intercept, + normalize, stream); } std::vector ri(n_cols); @@ -166,15 +155,17 @@ void cdFit(math_t *input, math_t *squared_loc = squared + ci; math_t *input_col_loc = input + (ci * n_rows); - LinAlg::multiplyScalar(pred, input_col_loc, h_coef[ci], n_rows, stream); + LinAlg::multiplyScalar(pred, input_col_loc, h_coef[ci], n_rows, + stream); LinAlg::add(residual, residual, pred, n_rows, stream); LinAlg::gemm(input_col_loc, n_rows, 1, residual, coef_loc, 1, 1, - CUBLAS_OP_T, CUBLAS_OP_N, cublas_handle, stream); + CUBLAS_OP_T, CUBLAS_OP_N, cublas_handle, stream); if (l1_ratio > math_t(0.0)) Functions::softThres(coef_loc, coef_loc, alpha, 1, stream); - LinAlg::eltwiseDivideCheckZero(coef_loc, coef_loc, squared_loc, 1, stream); + LinAlg::eltwiseDivideCheckZero(coef_loc, coef_loc, squared_loc, 1, + stream); coef_prev = h_coef[ci]; updateHost(&(h_coef[ci]), coef_loc, 1, stream); @@ -186,7 +177,8 @@ void cdFit(math_t *input, if (abs(h_coef[ci]) > coef_max) coef_max = abs(h_coef[ci]); - LinAlg::multiplyScalar(pred, input_col_loc, h_coef[ci], n_rows, stream); + LinAlg::multiplyScalar(pred, input_col_loc, h_coef[ci], n_rows, + stream); LinAlg::subtract(residual, residual, pred, n_rows, stream); } @@ -205,13 +197,9 @@ void cdFit(math_t *input, } if (fit_intercept) { - ///@todo: remove this cumlHandle and use the cumlHandle_impl - /// passed to this method instead!! 
- cumlHandle handle; - handle.setStream(stream); - GLM::postProcessData(handle.getImpl(), input, n_rows, n_cols, labels, coef, - intercept, mu_input, mu_labels, norm2_input, - fit_intercept, normalize, stream); + GLM::postProcessData(handle, input, n_rows, n_cols, labels, + coef, intercept, mu_input, mu_labels, norm2_input, + fit_intercept, normalize, stream); if (mu_input != nullptr) CUDA_CHECK(cudaFree(mu_input)); @@ -259,11 +247,11 @@ void cdFit(math_t *input, * cuda stream * @param cublas_handle * cublas handle -*/ + */ template -void cdPredict(const math_t *input, int n_rows, int n_cols, const math_t *coef, - math_t intercept, math_t *preds, ML::loss_funct loss, cudaStream_t stream, - cublasHandle_t cublas_handle) { +void cdPredict(const cumlHandle_impl& handle, const math_t *input, int n_rows, + int n_cols, const math_t *coef, math_t intercept, math_t *preds, + ML::loss_funct loss, cudaStream_t stream) { ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); @@ -272,7 +260,9 @@ void cdPredict(const math_t *input, int n_rows, int n_cols, const math_t *coef, ASSERT(loss == ML::loss_funct::SQRD_LOSS, "Parameter loss: Only SQRT_LOSS function is supported for now"); - Functions::linearRegH(input, n_rows, n_cols, coef, preds, intercept, cublas_handle, stream); + cublasHandle_t cublas_handle = handle.getCublasHandle(); + Functions::linearRegH(input, n_rows, n_cols, coef, preds, intercept, + cublas_handle, stream); } diff --git a/cuML/src/solver/sgd.h b/cuML/src/solver/sgd.h index 97670556e1..7dbf4d7b90 100644 --- a/cuML/src/solver/sgd.h +++ b/cuML/src/solver/sgd.h @@ -35,68 +35,94 @@ #include #include #include "learning_rate.h" -#include "cuML.hpp" +#include "common/cumlHandle.hpp" namespace ML { namespace Solver { using namespace MLCommon; +/** + * Fits a linear, lasso, and elastic-net regression model using Stochastic Gradient Descent solver + * @param cumlHandle_impl + * Reference of cumlHandle + * @param input + * pointer to an array in
column-major format (size of n_rows, n_cols) + * @param n_rows + * n_samples or rows in input + * @param n_cols + * n_features or columns in X + * @param labels + * pointer to an array for labels (size of n_rows) + * @param coef + * pointer to an array for coefficients (size of n_cols). This will be filled with coefficients + * once the function is executed. + * @param intercept + * pointer to a scalar for intercept. This will be filled + * once the function is executed + * @param fit_intercept + * boolean parameter to control if the intercept will be fitted or not + * @param batch_size + * number of rows in the minibatch + * @param epochs + * number of iterations that the solver will run + * @param lr_type + * type of the learning rate function (i.e. OPTIMAL, CONSTANT, INVSCALING, ADAPTIVE) + * @param eta0 + * learning rate for constant lr_type. It's used to calculate learning rate function for other types of lr_type + * @param power_t + * power value in the INVSCALING lr_type + * @param loss + * enum to use different loss functions. + * @param penalty + * None, L1, L2, or Elastic-net penalty + * @param alpha + * alpha value in L1 + * @param l1_ratio + * ratio of alpha will be used for L1. (1 - l1_ratio) * alpha will be used for L2. + * @param shuffle + * boolean parameter to control whether coordinates will be picked randomly or not.
+ * @param tol + * tolerance to stop the solver + * @param n_iter_no_change + * solver stops if there is no update greater than tol after n_iter_no_change iterations + * @param stream + * cuda stream + */ template -void sgdFit(math_t *input, - int n_rows, - int n_cols, - math_t *labels, - math_t *coef, - math_t *intercept, - bool fit_intercept, - int batch_size, - int epochs, - ML::lr_type lr_type, - math_t eta0, - math_t power_t, - ML::loss_funct loss, - Functions::penalty penalty, - math_t alpha, - math_t l1_ratio, - bool shuffle, - math_t tol, - int n_iter_no_change, - cublasHandle_t cublas_handle, - cusolverDnHandle_t cusolver_handle, - cudaStream_t stream) { +void sgdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, + int n_cols, math_t *labels, math_t *coef, math_t *intercept, + bool fit_intercept, int batch_size, int epochs, ML::lr_type lr_type, + math_t eta0, math_t power_t, ML::loss_funct loss, + Functions::penalty penalty, math_t alpha, math_t l1_ratio, bool shuffle, + math_t tol, int n_iter_no_change, cudaStream_t stream) { ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two"); - math_t *mu_input = NULL; - math_t *mu_labels = NULL; - math_t *norm2_input = NULL; + cublasHandle_t cublas_handle = handle.getCublasHandle(); + + auto allocator = handle.getDeviceAllocator(); + device_buffer mu_input(allocator, stream, 0); + device_buffer mu_labels(allocator, stream, 0); + device_buffer norm2_input(allocator, stream, 0); - ///@todo: the below line should go away once we expose - /// cumlHandle in the interface of sgd - cumlHandle handle; if (fit_intercept) { - allocate(mu_input, n_cols); - allocate(mu_labels, 1); + mu_input.reserve(n_cols, stream); + mu_labels.reserve(1, stream); - GLM::preProcessData(handle.getImpl(), input, n_rows, n_cols, labels, intercept, mu_input, - mu_labels, norm2_input, fit_intercept, false, stream); + 
GLM::preProcessData(handle, input, n_rows, n_cols, labels, intercept, + mu_input.data(), mu_labels.data(), norm2_input.data(), + fit_intercept, false, stream); } - math_t *grads = NULL; - math_t *input_batch = NULL; - math_t *labels_batch = NULL; - math_t *loss_value = NULL; - int *indices = NULL; - - allocate(grads, n_cols, true); - allocate(indices, batch_size); - allocate(input_batch, batch_size * n_cols); - allocate(labels_batch, batch_size); - allocate(loss_value, 1); + device_buffer grads(allocator, stream, n_cols); + device_buffer indices(allocator, stream, batch_size); + device_buffer input_batch(allocator, stream, batch_size * n_cols); + device_buffer labels_batch(allocator, stream, batch_size); + device_buffer loss_value(allocator, stream, 1); math_t prev_loss_value = math_t(0); math_t curr_loss_value = math_t(0); @@ -133,29 +159,36 @@ void sgdFit(math_t *input, if (cbs == 0) break; - updateDevice(indices, &rand_indices[j], cbs, stream); - Matrix::copyRows(input, n_rows, n_cols, input_batch, indices, cbs, stream); - Matrix::copyRows(labels, n_rows, 1, labels_batch, indices, cbs, stream); + updateDevice(indices.data(), &rand_indices[j], cbs, stream); + Matrix::copyRows(input, n_rows, n_cols, input_batch.data(), + indices.data(), cbs, stream); + Matrix::copyRows(labels, n_rows, 1, labels_batch.data(), + indices.data(), cbs, stream); if (loss == ML::loss_funct::SQRD_LOSS) { - Functions::linearRegLossGrads(input_batch, cbs, n_cols, labels_batch, - coef, grads, penalty, alpha, l1_ratio, cublas_handle, stream); + Functions::linearRegLossGrads(input_batch.data(), cbs, n_cols, + labels_batch.data(), coef, grads.data(), penalty, alpha, + l1_ratio, cublas_handle, allocator, stream); } else if (loss == ML::loss_funct::LOG) { - Functions::logisticRegLossGrads(input_batch, cbs, n_cols, labels_batch, - coef, grads, penalty, alpha, l1_ratio, cublas_handle, stream); + Functions::logisticRegLossGrads(input_batch.data(), cbs, n_cols, + labels_batch.data(), coef, 
grads.data(), penalty, alpha, + l1_ratio, cublas_handle, allocator, stream); } else if (loss == ML::loss_funct::HINGE) { - Functions::hingeLossGrads(input_batch, cbs, n_cols, labels_batch, - coef, grads, penalty, alpha, l1_ratio, cublas_handle, stream); + Functions::hingeLossGrads(input_batch.data(), cbs, n_cols, + labels_batch.data(), coef, grads.data(), penalty, alpha, + l1_ratio, cublas_handle, allocator, stream); } else { ASSERT(false, "sgd.h: Other loss functions have not been implemented yet!"); } if (lr_type != ML::lr_type::ADAPTIVE) - learning_rate = calLearningRate(lr_type, eta0, power_t, alpha, t); + learning_rate = calLearningRate(lr_type, eta0, power_t, alpha, + t); - LinAlg::scalarMultiply(grads, grads, learning_rate, n_cols, stream); - LinAlg::subtract(coef, coef, grads, n_cols, stream); + LinAlg::scalarMultiply(grads.data(), grads.data(), learning_rate, + n_cols, stream); + LinAlg::subtract(coef, coef, grads.data(), n_cols, stream); j = j + cbs; t = t + 1; @@ -163,108 +196,145 @@ void sgdFit(math_t *input, if (tol > math_t(0)) { if (loss == ML::loss_funct::SQRD_LOSS) { - Functions::linearRegLoss(input, n_rows, n_cols, labels, coef, loss_value, - penalty, alpha, l1_ratio, cublas_handle, stream); + Functions::linearRegLoss(input, n_rows, n_cols, labels, coef, + loss_value.data(), penalty, alpha, l1_ratio, + cublas_handle, allocator, stream); } else if (loss == ML::loss_funct::LOG) { - Functions::logisticRegLoss(input, n_rows, n_cols, labels, coef, loss_value, - penalty, alpha, l1_ratio, cublas_handle, stream); + Functions::logisticRegLoss(input, n_rows, n_cols, labels, coef, + loss_value.data(), penalty, alpha, l1_ratio, + cublas_handle, allocator, stream); } else if (loss == ML::loss_funct::HINGE) { - Functions::hingeLoss(input, n_rows, n_cols, labels, coef, loss_value, - penalty, alpha, l1_ratio, cublas_handle, stream); + Functions::hingeLoss(input, n_rows, n_cols, labels, coef, + loss_value.data(), penalty, alpha, l1_ratio, + cublas_handle, 
allocator, stream); } - updateHost(&curr_loss_value, loss_value, 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); + updateHost(&curr_loss_value, loss_value.data(), 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); if (i > 0) { - if (curr_loss_value > (prev_loss_value - tol)) { - n_iter_no_change_curr = n_iter_no_change_curr + 1; - if (n_iter_no_change_curr > n_iter_no_change) { - if (lr_type == ML::lr_type::ADAPTIVE && learning_rate > math_t(1e-6)) { - learning_rate = learning_rate / math_t(5); - n_iter_no_change_curr = 0; - } else { - break; - } - } - } else { - n_iter_no_change_curr = 0; - } + if (curr_loss_value > (prev_loss_value - tol)) { + n_iter_no_change_curr = n_iter_no_change_curr + 1; + if (n_iter_no_change_curr > n_iter_no_change) { + if (lr_type == ML::lr_type::ADAPTIVE + && learning_rate > math_t(1e-6)) { + learning_rate = learning_rate / math_t(5); + n_iter_no_change_curr = 0; + } else { + break; + } + } + } else { + n_iter_no_change_curr = 0; + } } prev_loss_value = curr_loss_value; } } - if (grads != NULL) - CUDA_CHECK(cudaFree(grads)); - if (indices != NULL) - CUDA_CHECK(cudaFree(indices)); - if (input_batch != NULL) - CUDA_CHECK(cudaFree(input_batch)); - if (labels_batch != NULL) - CUDA_CHECK(cudaFree(labels_batch)); - if (loss_value != NULL) - CUDA_CHECK(cudaFree(loss_value)); - if (fit_intercept) { - GLM::postProcessData(handle.getImpl(), input, n_rows, n_cols, labels, coef, intercept, - mu_input, mu_labels, norm2_input, fit_intercept, false, - stream); - - if (mu_input != NULL) - CUDA_CHECK(cudaFree(mu_input)); - if (mu_labels != NULL) - CUDA_CHECK(cudaFree(mu_labels)); + GLM::postProcessData(handle, input, n_rows, n_cols, labels, coef, + intercept, mu_input.data(), mu_labels.data(), + norm2_input.data(), fit_intercept, false, stream); } else { *intercept = math_t(0); } } +/** + * Make predictions + * @param cumlHandle_impl + * Reference of cumlHandle + * @param input + * pointer to an array in column-major format (size of 
n_rows, n_cols) + * @param n_rows + * n_samples or rows in input + * @param n_cols + * n_features or columns in X + * @param coef + * pointer to an array for coefficients (size of n_cols). Calculated in sgdFit function. + * @param intercept + * intercept value calculated in sgdFit function + * @param preds + * pointer to an array for predictions (size of n_rows). This will be filled once the function is executed. + * @param loss + * enum to select the loss function. SQRD_LOSS, LOG and HINGE are supported. + * @param stream + * cuda stream + */ template -void sgdPredict(const math_t *input, int n_rows, int n_cols, const math_t *coef, - math_t intercept, math_t *preds, ML::loss_funct loss, cublasHandle_t cublas_handle, - cudaStream_t stream) { +void sgdPredict(const cumlHandle_impl& handle, const math_t *input, int n_rows, + int n_cols, const math_t *coef, math_t intercept, math_t *preds, + ML::loss_funct loss, cudaStream_t stream) { ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two"); + cublasHandle_t cublas_handle = handle.getCublasHandle(); + if (loss == ML::loss_funct::SQRD_LOSS) { - Functions::linearRegH(input, n_rows, n_cols, coef, preds, intercept, cublas_handle, stream); + Functions::linearRegH(input, n_rows, n_cols, coef, preds, intercept, + cublas_handle, stream); } else if (loss == ML::loss_funct::LOG) { - Functions::logisticRegH(input, n_rows, n_cols, coef, preds, intercept, cublas_handle, stream); + Functions::logisticRegH(input, n_rows, n_cols, coef, preds, intercept, + cublas_handle, stream); } else if (loss == ML::loss_funct::HINGE) { - Functions::hingeH(input, n_rows, n_cols, coef, preds, intercept, cublas_handle, stream); + Functions::hingeH(input, n_rows, n_cols, coef, preds, intercept, + cublas_handle, stream); } } +/** + * Make binary classifications + * @param cumlHandle_impl + * Reference of cumlHandle + * @param
input + * pointer to an array in column-major format (size of n_rows, n_cols) + * @param n_rows + * n_samples or rows in input + * @param n_cols + * n_features or columns in X + * @param coef + * pointer to an array for coefficients (size of n_cols). Calculated in sgdFit function. + * @param intercept + * intercept value calculated in sgdFit function + * @param preds + * pointer to an array for predictions (size of n_rows). This will be filled once the function is executed. + * @param loss + * enum to select the loss function. SQRD_LOSS, LOG and HINGE are supported. + * @param stream + * cuda stream + */ template -void sgdPredictBinaryClass(const math_t *input, int n_rows, int n_cols, const math_t *coef, - math_t intercept, math_t *preds, ML::loss_funct loss, cublasHandle_t cublas_handle, cudaStream_t stream) { +void sgdPredictBinaryClass(const cumlHandle_impl& handle, const math_t *input, + int n_rows, int n_cols, const math_t *coef, math_t intercept, + math_t *preds, ML::loss_funct loss, cudaStream_t stream) { - sgdPredict(input, n_rows, n_cols, coef, intercept, preds, loss, cublas_handle, stream); + sgdPredict(handle, input, n_rows, n_cols, coef, intercept, preds, loss, + stream); math_t scalar = math_t(1); if (loss == ML::loss_funct::SQRD_LOSS || loss == ML::loss_funct::LOG) { - LinAlg::unaryOp(preds, preds, n_rows, [scalar] __device__ (math_t in) { - if (in >= math_t(0.5)) - return math_t(1); - else - return math_t(0); - }, - stream); - } else if (loss == ML::loss_funct::HINGE) { - LinAlg::unaryOp(preds, preds, n_rows, [scalar] __device__ (math_t in) { - if (in >= math_t(0.0)) - return math_t(1); - else - return math_t(0); - }, - stream); - } + LinAlg::unaryOp(preds, preds, n_rows, [scalar] __device__ (math_t in) { + if (in >= math_t(0.5)) + return math_t(1); + else + return math_t(0); + }, + stream); + } else if (loss == ML::loss_funct::HINGE) { + LinAlg::unaryOp(preds, preds, n_rows, [scalar] __device__ (math_t in) { + if (in >=
math_t(0.0)) + return math_t(1); + else + return math_t(0); + }, + stream); +} } diff --git a/cuML/src/solver/solver.cu b/cuML/src/solver/solver.cu index 3d7298a696..ba270ffc74 100644 --- a/cuML/src/solver/solver.cu +++ b/cuML/src/solver/solver.cu @@ -16,17 +16,16 @@ #include "sgd.h" #include "cd.h" -#include "solver_c.h" +#include "solver.hpp" #include "ml_utils.h" -#include -#include namespace ML { namespace Solver { using namespace ML; -void sgdFit(float *input, +void sgdFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float *labels, @@ -88,45 +87,32 @@ void sgdFit(float *input, "glm.cu: this learning rate type is not supported."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - sgdFit(input, - n_rows, - n_cols, - labels, - coef, - intercept, - fit_intercept, - batch_size, - epochs, - learning_rate_type, - eta0, - power_t, - loss_funct, - pen, - alpha, - l1_ratio, - shuffle, - tol, - n_iter_no_change, - cublas_handle, - cusolver_handle, - stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + sgdFit(handle.getImpl(), + input, + n_rows, + n_cols, + labels, + coef, + intercept, + fit_intercept, + batch_size, + epochs, + learning_rate_type, + eta0, + power_t, + loss_funct, + pen, + alpha, + l1_ratio, + shuffle, + tol, + n_iter_no_change, + handle.getStream()); } -void sgdFit(double *input, +void sgdFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, @@ -186,45 +172,31 @@ void sgdFit(double *input, "glm.cu: this learning rate type is not supported."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - 
CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - sgdFit(input, - n_rows, - n_cols, - labels, - coef, - intercept, - fit_intercept, - batch_size, - epochs, - learning_rate_type, - eta0, - power_t, - loss_funct, - pen, - alpha, - l1_ratio, - shuffle, - tol, - n_iter_no_change, - cublas_handle, - cusolver_handle, - stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + sgdFit(handle.getImpl(), + input, + n_rows, + n_cols, + labels, + coef, + intercept, + fit_intercept, + batch_size, + epochs, + learning_rate_type, + eta0, + power_t, + loss_funct, + pen, + alpha, + l1_ratio, + shuffle, + tol, + n_iter_no_change, + handle.getStream()); } -void sgdPredict(const float *input, int n_rows, int n_cols, const float *coef, +void sgdPredict(cumlHandle& handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds, int loss) { ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; @@ -239,20 +211,11 @@ void sgdPredict(const float *input, int n_rows, int n_cols, const float *coef, "glm.cu: other functions are not supported yet."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - sgdPredict(input, n_rows, n_cols, coef, intercept, preds, loss_funct, cublas_handle, stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + sgdPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, loss_funct, handle.getStream()); } -void sgdPredict(const double *input, int n_rows, int n_cols, +void sgdPredict(cumlHandle& handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds, int loss) { ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; @@ -267,20 +230,11 @@ void sgdPredict(const 
double *input, int n_rows, int n_cols, "glm.cu: other functions are not supported yet."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - sgdPredict(input, n_rows, n_cols, coef, intercept, preds, loss_funct, cublas_handle, stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + sgdPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, loss_funct, handle.getStream()); } -void sgdPredictBinaryClass(const float *input, int n_rows, int n_cols, const float *coef, +void sgdPredictBinaryClass(cumlHandle& handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds, int loss) { ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; @@ -295,20 +249,11 @@ void sgdPredictBinaryClass(const float *input, int n_rows, int n_cols, const flo "glm.cu: other functions are not supported yet."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - sgdPredictBinaryClass(input, n_rows, n_cols, coef, intercept, preds, loss_funct, cublas_handle, stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + sgdPredictBinaryClass(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, loss_funct, handle.getStream()); } -void sgdPredictBinaryClass(const double *input, int n_rows, int n_cols, +void sgdPredictBinaryClass(cumlHandle& handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds, int loss) { ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; @@ -323,22 +268,12 @@ void sgdPredictBinaryClass(const double *input, int n_rows, int n_cols, "glm.cu: other functions are not supported yet."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - 
CUDA_CHECK(cudaStreamCreate(&stream)); - - sgdPredictBinaryClass(input, n_rows, n_cols, coef, intercept, preds, loss_funct, cublas_handle, stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - - // should probably do a stream sync before destroy - CUDA_CHECK(cudaStreamDestroy(stream)); + sgdPredictBinaryClass(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, loss_funct, handle.getStream()); } -void cdFit(float *input, +void cdFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float *labels, @@ -358,40 +293,26 @@ void cdFit(float *input, ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - cdFit(input, - n_rows, - n_cols, - labels, - coef, - intercept, - fit_intercept, - normalize, - epochs, - loss_funct, - alpha, - l1_ratio, - shuffle, - tol, - stream, - cublas_handle, - cusolver_handle); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); - + cdFit(handle.getImpl(), + input, + n_rows, + n_cols, + labels, + coef, + intercept, + fit_intercept, + normalize, + epochs, + loss_funct, + alpha, + l1_ratio, + shuffle, + tol, + handle.getStream()); } -void cdFit(double *input, +void cdFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, @@ -407,44 +328,30 @@ void cdFit(double *input, double tol) { ASSERT(loss == 0, - "Parameter loss: Only SQRT_LOSS function is supported for now"); + "Parameter loss: Only SQRT_LOSS function is supported for now"); ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - 
- cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - cdFit(input, - n_rows, - n_cols, - labels, - coef, - intercept, - fit_intercept, - normalize, - epochs, - loss_funct, - alpha, - l1_ratio, - shuffle, - tol, - stream, - cublas_handle, - cusolver_handle); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + cdFit(handle.getImpl(), + input, + n_rows, + n_cols, + labels, + coef, + intercept, + fit_intercept, + normalize, + epochs, + loss_funct, + alpha, + l1_ratio, + shuffle, + tol, + handle.getStream()); } -void cdPredict(const float *input, int n_rows, int n_cols, const float *coef, +void cdPredict(cumlHandle& handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds, int loss) { ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; @@ -455,20 +362,10 @@ void cdPredict(const float *input, int n_rows, int n_cols, const float *coef, "glm.cu: other functions are not supported yet."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - cdPredict(input, n_rows, n_cols, coef, intercept, preds, loss_funct, stream, cublas_handle); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); - + cdPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, loss_funct, handle.getStream()); } -void cdPredict(const double *input, int n_rows, int n_cols, +void cdPredict(cumlHandle& handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds, int loss) { ML::loss_funct loss_funct = ML::loss_funct::SQRD_LOSS; @@ -479,16 +376,7 @@ void cdPredict(const double *input, int n_rows, int n_cols, "glm.cu: other functions are not supported yet."); } - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - 
CUDA_CHECK(cudaStreamCreate(&stream)); - - cdPredict(input, n_rows, n_cols, coef, intercept, preds, loss_funct, stream, cublas_handle); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + cdPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, loss_funct, handle.getStream()); } } diff --git a/cuML/src/solver/solver_c.h b/cuML/src/solver/solver.hpp similarity index 73% rename from cuML/src/solver/solver_c.h rename to cuML/src/solver/solver.hpp index 750e36db23..b2cd32804f 100644 --- a/cuML/src/solver/solver_c.h +++ b/cuML/src/solver/solver.hpp @@ -14,11 +14,17 @@ * limitations under the License. */ +#pragma once + +#include "ml_utils.h" +#include "cuML.hpp" + + namespace ML { namespace Solver { - -void sgdFit(float *input, +void sgdFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float *labels, @@ -38,7 +44,8 @@ void sgdFit(float *input, float tol, int n_iter_no_change); -void sgdFit(double *input, +void sgdFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, @@ -58,20 +65,20 @@ void sgdFit(double *input, double tol, int n_iter_no_change); -void sgdPredict(const float *input, int n_rows, int n_cols, const float *coef, +void sgdPredict(cumlHandle& handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds, int loss); -void sgdPredict(const double *input, int n_rows, int n_cols, +void sgdPredict(cumlHandle& handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds, int loss); -void sgdPredictBinaryClass(const float *input, int n_rows, int n_cols, const float *coef, +void sgdPredictBinaryClass(cumlHandle& handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds, int loss); -void sgdPredictBinaryClass(const double *input, int n_rows, int n_cols, +void sgdPredictBinaryClass(cumlHandle& handle, const double *input, int n_rows, int n_cols, 
const double *coef, double intercept, double *preds, int loss); - -void cdFit(float *input, +void cdFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float *labels, @@ -86,7 +93,8 @@ void cdFit(float *input, bool shuffle, float tol); -void cdFit(double *input, +void cdFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, @@ -101,11 +109,11 @@ void cdFit(double *input, bool shuffle, double tol); -void cdPredict(const float *input, int n_rows, int n_cols, const float *coef, +void cdPredict(cumlHandle& handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds, int loss); -void cdPredict(const double *input, int n_rows, int n_cols, +void cdPredict(cumlHandle& handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds, int loss); -} -} +}; +}; // end namespace ML diff --git a/cuML/test/cd_test.cu b/cuML/test/cd_test.cu index 33743b45f4..4c655173d0 100644 --- a/cuML/test/cd_test.cu +++ b/cuML/test/cd_test.cu @@ -42,12 +42,6 @@ protected: params = ::testing::TestWithParam>::GetParam(); int len = params.n_row * params.n_col; - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - allocate(data, len); allocate(labels, params.n_row); allocate(coef, params.n_col, true); @@ -87,40 +81,39 @@ protected: ML::loss_funct loss = ML::loss_funct::SQRD_LOSS; intercept = T(0); - cdFit(data, params.n_row, params.n_col, labels, coef, &intercept, + cdFit(handle.getImpl(), data, params.n_row, params.n_col, labels, coef, &intercept, fit_intercept, normalize, epochs, loss, alpha, l1_ratio, shuffle, - tol, stream, cublas_handle, cusolver_handle); + tol, stream); fit_intercept = true; intercept2 = T(0); - cdFit(data, params.n_row, params.n_col, labels, coef2, &intercept2, + cdFit(handle.getImpl(), data, params.n_row, params.n_col, labels, 
coef2, &intercept2, fit_intercept, normalize, epochs, loss, alpha, l1_ratio, shuffle, - tol, stream, cublas_handle, cusolver_handle); + tol, stream); alpha = T(1.0); l1_ratio = T(0.5); fit_intercept = false; intercept = T(0); - cdFit(data, params.n_row, params.n_col, labels, coef3, &intercept, + cdFit(handle.getImpl(), data, params.n_row, params.n_col, labels, coef3, &intercept, fit_intercept, normalize, epochs, loss, alpha, l1_ratio, shuffle, - tol, stream, cublas_handle, cusolver_handle); + tol, stream); fit_intercept = true; normalize = true; intercept2 = T(0); - cdFit(data, params.n_row, params.n_col, labels, coef4, &intercept2, + cdFit(handle.getImpl(), data, params.n_row, params.n_col, labels, coef4, &intercept2, fit_intercept, normalize, epochs, loss, alpha, l1_ratio, shuffle, - tol, stream, cublas_handle, cusolver_handle); + tol, stream); - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); } void SetUp() override { CUDA_CHECK(cudaStreamCreate(&stream)); + handle.setStream(stream); lasso(); } @@ -146,7 +139,7 @@ protected: T *coef4, *coef4_ref; T intercept, intercept2; cudaStream_t stream; - + cumlHandle handle; }; const std::vector > inputsf2 = { { 0.01f, 4, 2 } }; diff --git a/cuML/test/sgd.cu b/cuML/test/sgd.cu index 34ae6605de..3ac3fd326d 100644 --- a/cuML/test/sgd.cu +++ b/cuML/test/sgd.cu @@ -29,15 +29,6 @@ protected: params = ::testing::TestWithParam>::GetParam(); int len = params.n_row * params.n_col; - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(data, len); allocate(labels, params.n_row); allocate(coef, params.n_col, true); @@ -71,21 +62,17 @@ protected: MLCommon::Functions::penalty pen = MLCommon::Functions::penalty::NONE; int n_iter_no_change = 10; - sgdFit(data, params.n_row, params.n_col, 
labels, coef, &intercept, + sgdFit(handle.getImpl(), data, params.n_row, params.n_col, labels, coef, &intercept, fit_intercept, params.batch_size, epochs, lr_type, lr, power_t, loss, pen, alpha, l1_ratio, shuffle, tol, n_iter_no_change, - cublas_handle, cusolver_handle, stream); + stream); fit_intercept = true; intercept2 = T(0); - sgdFit(data, params.n_row, params.n_col, labels, coef2, &intercept2, + sgdFit(handle.getImpl(), data, params.n_row, params.n_col, labels, coef2, &intercept2, fit_intercept, params.batch_size, epochs, ML::lr_type::CONSTANT, lr, power_t, loss, pen, alpha, l1_ratio, shuffle, tol, - n_iter_no_change, cublas_handle, cusolver_handle, stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); + n_iter_no_change, stream); } @@ -93,15 +80,6 @@ protected: params = ::testing::TestWithParam>::GetParam(); int len = params.n_row2 * params.n_col2; - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - T *coef_class; allocate(data_logreg, len); allocate(data_logreg_test, len); @@ -138,35 +116,22 @@ protected: MLCommon::Functions::penalty pen = MLCommon::Functions::penalty::NONE; int n_iter_no_change = 10; - sgdFit(data_logreg, params.n_row2, params.n_col2, labels_logreg, + sgdFit(handle.getImpl(), data_logreg, params.n_row2, params.n_col2, labels_logreg, coef_class, &intercept_class, fit_intercept, params.batch_size, epochs, lr_type, lr, power_t, loss, pen, alpha, l1_ratio, shuffle, tol, - n_iter_no_change, cublas_handle, cusolver_handle, stream); + n_iter_no_change, stream); - sgdPredictBinaryClass(data_logreg_test, params.n_row2, params.n_col2, - coef_class, intercept_class, pred_log, loss, cublas_handle, stream); + sgdPredictBinaryClass(handle.getImpl(), 
data_logreg_test, params.n_row2, params.n_col2, + coef_class, intercept_class, pred_log, loss, stream); CUDA_CHECK(cudaFree(coef_class)); - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); - } void svmTest() { params = ::testing::TestWithParam>::GetParam(); int len = params.n_row2 * params.n_col2; - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cusolverDnHandle_t cusolver_handle = NULL; - CUSOLVER_CHECK(cusolverDnCreate(&cusolver_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - T *coef_class; allocate(data_svmreg, len); allocate(data_svmreg_test, len); @@ -203,23 +168,21 @@ protected: MLCommon::Functions::penalty pen = MLCommon::Functions::penalty::L2; int n_iter_no_change = 10; - sgdFit(data_svmreg, params.n_row2, params.n_col2, labels_svmreg, + sgdFit(handle.getImpl(), data_svmreg, params.n_row2, params.n_col2, labels_svmreg, coef_class, &intercept_class, fit_intercept, params.batch_size, epochs, lr_type, lr, power_t, loss, pen, alpha, l1_ratio, shuffle, tol, - n_iter_no_change, cublas_handle, cusolver_handle, stream); + n_iter_no_change, stream); - sgdPredictBinaryClass(data_svmreg_test, params.n_row2, params.n_col2, - coef_class, intercept_class, pred_svm, loss, cublas_handle, stream); + sgdPredictBinaryClass(handle.getImpl(), data_svmreg_test, params.n_row2, params.n_col2, + coef_class, intercept_class, pred_svm, loss, stream); CUDA_CHECK(cudaFree(coef_class)); - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUSOLVER_CHECK(cusolverDnDestroy(cusolver_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); - } void SetUp() override { + CUDA_CHECK(cudaStreamCreate(&stream)); + handle.setStream(stream); linearRegressionTest(); logisticRegressionTest(); svmTest(); @@ -252,6 +215,8 @@ protected: T *data_svmreg, *data_svmreg_test, *labels_svmreg; T *pred_svm, *pred_svm_ref, *pred_log, *pred_log_ref; T intercept, 
intercept2; + cudaStream_t stream; + cumlHandle handle; }; diff --git a/ml-prims/src/functions/hinge.h b/ml-prims/src/functions/hinge.h index ad1ef29ea7..08f4128c1a 100644 --- a/ml-prims/src/functions/hinge.h +++ b/ml-prims/src/functions/hinge.h @@ -1,3 +1,4 @@ + /* * Copyright (c) 2018, NVIDIA CORPORATION. * @@ -38,7 +39,7 @@ namespace Functions { template void hingeLossGradMult(math_t* data, const math_t* vec1, const math_t* vec2, idx_type n_row, idx_type n_col, cudaStream_t stream) { - LinAlg::matrixVectorOp(data, data, vec1, vec2, n_col, n_row, false, true, + LinAlg::matrixVectorOp(data, data, vec1, vec2, n_col, n_row, false, false, [] __device__ (math_t a, math_t b, math_t c) { if (c < math_t(1)) return -a * b; @@ -78,22 +79,16 @@ void hingeH(const math_t *input, idx_type n_rows, idx_type n_cols, template void hingeLossGrads(math_t *input, int n_rows, int n_cols, const math_t *labels, const math_t *coef, math_t *grads, penalty pen, - math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, cudaStream_t stream) { + math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, + std::shared_ptr allocator, cudaStream_t stream) { - math_t *labels_pred = NULL; - allocate(labels_pred, n_rows); - math_t *input_t = NULL; - allocate(input_t, n_rows * n_cols); + device_buffer labels_pred(allocator, stream, n_rows); - LinAlg::gemm(input, n_rows, n_cols, coef, labels_pred, n_rows, 1, CUBLAS_OP_N, + LinAlg::gemm(input, n_rows, n_cols, coef, labels_pred.data(), n_rows, 1, CUBLAS_OP_N, CUBLAS_OP_N, cublas_handle, stream); - LinAlg::eltwiseMultiply(labels_pred, labels_pred, labels, n_rows, stream); - - LinAlg::transpose(input, input_t, n_rows, n_cols, cublas_handle, stream); - hingeLossGradMult(input_t, labels, labels_pred, n_cols, n_rows, stream); - LinAlg::transpose(input_t, input, n_cols, n_rows, cublas_handle, stream); - + LinAlg::eltwiseMultiply(labels_pred.data(), labels_pred.data(), labels, n_rows, stream); + hingeLossGradMult(input, labels, labels_pred.data(), 
n_rows, n_cols, stream); Stats::mean(grads, input, n_cols, n_rows, false, false, stream); math_t *pen_grads = NULL; @@ -115,30 +110,25 @@ void hingeLossGrads(math_t *input, int n_rows, int n_cols, CUDA_CHECK(cudaFree(pen_grads)); } - if (labels_pred != NULL) - CUDA_CHECK(cudaFree(labels_pred)); - - if (input_t != NULL) - CUDA_CHECK(cudaFree(input_t)); } template void hingeLoss(math_t *input, int n_rows, int n_cols, const math_t *labels, const math_t *coef, math_t *loss, penalty pen, - math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, cudaStream_t stream) { + math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, + std::shared_ptr allocator, cudaStream_t stream) { - math_t *labels_pred = NULL; - allocate(labels_pred, n_rows); + device_buffer labels_pred(allocator, stream, n_rows); - LinAlg::gemm(input, n_rows, n_cols, coef, labels_pred, n_rows, 1, CUBLAS_OP_N, + LinAlg::gemm(input, n_rows, n_cols, coef, labels_pred.data(), n_rows, 1, CUBLAS_OP_N, CUBLAS_OP_N, cublas_handle, stream); - LinAlg::eltwiseMultiply(labels_pred, labels_pred, labels, n_rows, stream); + LinAlg::eltwiseMultiply(labels_pred.data(), labels_pred.data(), labels, n_rows, stream); - hingeLossSubtract(labels_pred, labels_pred, math_t(1), n_rows, stream); + hingeLossSubtract(labels_pred.data(), labels_pred.data(), math_t(1), n_rows, stream); - Stats::sum(loss, labels_pred, 1, n_rows, false, stream); + Stats::sum(loss, labels_pred.data(), 1, n_rows, false, stream); math_t *pen_val = NULL; @@ -159,9 +149,6 @@ void hingeLoss(math_t *input, int n_rows, int n_cols, CUDA_CHECK(cudaFree(pen_val)); } - if (labels_pred != NULL) - CUDA_CHECK(cudaFree(labels_pred)); - } /** @} */ diff --git a/ml-prims/src/functions/linearReg.h b/ml-prims/src/functions/linearReg.h index 408e222f28..9fd84ab9ba 100644 --- a/ml-prims/src/functions/linearReg.h +++ b/ml-prims/src/functions/linearReg.h @@ -50,22 +50,14 @@ void linearRegH(const math_t *input, int n_rows, int n_cols, template void 
linearRegLossGrads(math_t *input, int n_rows, int n_cols, const math_t *labels, const math_t *coef, math_t *grads, penalty pen, - math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, cudaStream_t stream) { + math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, + std::shared_ptr allocator, cudaStream_t stream) { - math_t *labels_pred = NULL; - allocate(labels_pred, n_rows); + device_buffer labels_pred(allocator, stream, n_rows); - math_t *input_t = NULL; - allocate(input_t, n_rows * n_cols); - - linearRegH(input, n_rows, n_cols, coef, labels_pred, math_t(0), cublas_handle, stream); - - LinAlg::subtract(labels_pred, labels_pred, labels, n_rows, stream); - - // TODO: implement a matrixVectorBinaryMult that runs on rows rather than columns. - LinAlg::transpose(input, input_t, n_rows, n_cols, cublas_handle, stream); - Matrix::matrixVectorBinaryMult(input_t, labels_pred, n_cols, n_rows, false, true, stream); - LinAlg::transpose(input_t, input, n_cols, n_rows, cublas_handle, stream); + linearRegH(input, n_rows, n_cols, coef, labels_pred.data(), math_t(0), cublas_handle, stream); + LinAlg::subtract(labels_pred.data(), labels_pred.data(), labels, n_rows, stream); + Matrix::matrixVectorBinaryMult(input, labels_pred.data(), n_rows, n_cols, false, false, stream); Stats::mean(grads, input, n_cols, n_rows, false, false, stream); LinAlg::scalarMultiply(grads, grads, math_t(2), n_cols, stream); @@ -89,27 +81,22 @@ void linearRegLossGrads(math_t *input, int n_rows, int n_cols, CUDA_CHECK(cudaFree(pen_grads)); } - if (labels_pred != NULL) - CUDA_CHECK(cudaFree(labels_pred)); - - if (input_t != NULL) - CUDA_CHECK(cudaFree(input_t)); } template void linearRegLoss(math_t *input, int n_rows, int n_cols, const math_t *labels, const math_t *coef, math_t *loss, penalty pen, - math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, cudaStream_t stream) { + math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, + std::shared_ptr allocator, cudaStream_t stream) { - 
math_t *labels_pred = NULL; - allocate(labels_pred, n_rows); + device_buffer labels_pred(allocator, stream, n_rows); - linearRegH(input, n_rows, n_cols, coef, labels_pred, math_t(0), cublas_handle, stream); + linearRegH(input, n_rows, n_cols, coef, labels_pred.data(), math_t(0), cublas_handle, stream); - LinAlg::subtract(labels_pred, labels, labels_pred, n_rows, stream); - Matrix::power(labels_pred, n_rows, stream); - Stats::mean(loss, labels_pred, 1, n_rows, false, false, stream); + LinAlg::subtract(labels_pred.data(), labels, labels_pred.data(), n_rows, stream); + Matrix::power(labels_pred.data(), n_rows, stream); + Stats::mean(loss, labels_pred.data(), 1, n_rows, false, false, stream); math_t *pen_val = NULL; @@ -130,9 +117,6 @@ void linearRegLoss(math_t *input, int n_rows, int n_cols, CUDA_CHECK(cudaFree(pen_val)); } - if (labels_pred != NULL) - CUDA_CHECK(cudaFree(labels_pred)); - } /** @} */ diff --git a/ml-prims/src/functions/logisticReg.h b/ml-prims/src/functions/logisticReg.h index 6358557040..df4173a303 100644 --- a/ml-prims/src/functions/logisticReg.h +++ b/ml-prims/src/functions/logisticReg.h @@ -52,22 +52,14 @@ void logisticRegH(const math_t *input, int n_rows, int n_cols, template void logisticRegLossGrads(math_t *input, int n_rows, int n_cols, const math_t *labels, const math_t *coef, math_t *grads, penalty pen, - math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, cudaStream_t stream) { + math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, + std::shared_ptr allocator, cudaStream_t stream) { - math_t *labels_pred = NULL; - allocate(labels_pred, n_rows); + device_buffer labels_pred(allocator, stream, n_rows); - math_t *input_t = NULL; - allocate(input_t, n_rows * n_cols); - - logisticRegH(input, n_rows, n_cols, coef, labels_pred, math_t(0), cublas_handle, stream); - - LinAlg::subtract(labels_pred, labels_pred, labels, n_rows, stream); - - // TODO: implement a matrixVectorBinaryMult that runs on rows rather than columns. 
- LinAlg::transpose(input, input_t, n_rows, n_cols, cublas_handle, stream); - Matrix::matrixVectorBinaryMult(input_t, labels_pred, n_cols, n_rows, false, true, stream); - LinAlg::transpose(input_t, input, n_cols, n_rows, cublas_handle, stream); + logisticRegH(input, n_rows, n_cols, coef, labels_pred.data(), math_t(0), cublas_handle, stream); + LinAlg::subtract(labels_pred.data(), labels_pred.data(), labels, n_rows, stream); + Matrix::matrixVectorBinaryMult(input, labels_pred.data(), n_rows, n_cols, false, false, stream); Stats::mean(grads, input, n_cols, n_rows, false, false, stream); @@ -89,12 +81,6 @@ void logisticRegLossGrads(math_t *input, int n_rows, int n_cols, if (pen_grads != NULL) CUDA_CHECK(cudaFree(pen_grads)); } - - if (labels_pred != NULL) - CUDA_CHECK(cudaFree(labels_pred)); - - if (input_t != NULL) - CUDA_CHECK(cudaFree(input_t)); } template @@ -118,15 +104,15 @@ inline void logLoss(double *out, double *label, double *label_pred, int len, cud template void logisticRegLoss(math_t *input, int n_rows, int n_cols, math_t *labels, const math_t *coef, math_t *loss, penalty pen, - math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, cudaStream_t stream) { + math_t alpha, math_t l1_ratio, cublasHandle_t cublas_handle, + std::shared_ptr allocator, cudaStream_t stream) { - math_t *labels_pred = NULL; - allocate(labels_pred, n_rows); + device_buffer labels_pred(allocator, stream, n_rows); - logisticRegH(input, n_rows, n_cols, coef, labels_pred, math_t(0), cublas_handle, stream); - logLoss(labels_pred, labels, labels_pred, n_rows, stream); + logisticRegH(input, n_rows, n_cols, coef, labels_pred.data(), math_t(0), cublas_handle, stream); + logLoss(labels_pred.data(), labels, labels_pred.data(), n_rows, stream); - Stats::mean(loss, labels_pred, 1, n_rows, false, false, stream); + Stats::mean(loss, labels_pred.data(), 1, n_rows, false, false, stream); math_t *pen_val = NULL; @@ -147,9 +133,6 @@ void logisticRegLoss(math_t *input, int n_rows, int n_cols, 
CUDA_CHECK(cudaFree(pen_val)); } - if (labels_pred != NULL) - CUDA_CHECK(cudaFree(labels_pred)); - } /** @} */ diff --git a/python/cuml/solvers/cd.pyx b/python/cuml/solvers/cd.pyx index b89f17f568..7aa9cf541e 100644 --- a/python/cuml/solvers/cd.pyx +++ b/python/cuml/solvers/cd.pyx @@ -13,10 +13,6 @@ # limitations under the License. # -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 import ctypes import cudf @@ -28,11 +24,13 @@ from libcpp cimport bool from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free -from cuml.common.base import Base +import cuml +from cuml.common.handle cimport cumlHandle -cdef extern from "solver/solver_c.h" namespace "ML::Solver": +cdef extern from "solver/solver.hpp" namespace "ML::Solver": - cdef void cdFit(float *input, + cdef void cdFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float *labels, @@ -45,10 +43,11 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": float alpha, float l1_ratio, bool shuffle, - float tol) + float tol) except + - - cdef void cdFit(double *input, + + cdef void cdFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, @@ -61,112 +60,98 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": double alpha, double l1_ratio, bool shuffle, - double tol) - - cdef void cdPredict(const float *input, - int n_rows, - int n_cols, + double tol) except + + + cdef void cdPredict(cumlHandle& handle, + const float *input, + int n_rows, + int n_cols, const float *coef, - float intercept, + float intercept, float *preds, - int loss) + int loss) except + - cdef void cdPredict(const double *input, - int n_rows, + cdef void cdPredict(cumlHandle& handle, + const double *input, + int n_rows, int n_cols, - const double *coef, - double intercept, + const double *coef, + double intercept, double *preds, - int loss) + int loss) except + -class CD(Base): +class CD(cuml.Base): """ - 
Coordinate Descent (CD) is a very common optimization algorithm that minimizes along + Coordinate Descent (CD) is a very common optimization algorithm that minimizes along coordinate directions to find the minimum of a function. - cuML's CD algorithm accepts a numpy matrix or a cuDF DataFrame as the input dataset. The CD algorithm currently works with linear regression and ridge, lasso, and elastic-net penalties. - Examples --------- - .. code-block:: python - import numpy as np import cudf from cuml.solvers import CD as cumlCD - cd = cumlCD(alpha=0.0) - X = cudf.DataFrame() X['col1'] = np.array([1,1,2,2], dtype = np.float32) X['col2'] = np.array([1,2,2,3], dtype = np.float32) - y = cudf.Series( np.array([6.0, 8.0, 9.0, 11.0], dtype = np.float32) ) - reg = cd.fit(X,y) print("Coefficients:") print(reg.coef_) print("intercept:") print(reg.intercept_) - X_new = cudf.DataFrame() X_new['col1'] = np.array([3,2], dtype = np.float32) X_new['col2'] = np.array([5,5], dtype = np.float32) preds = cd.predict(X_new) - print(preds) - Output: - .. code-block:: python - Coefficients: - 0 1.0019531 1 1.9980469 - Intercept: 3.0 - Preds: - 0 15.997 1 14.995 - - + Parameters ----------- - loss : 'squared_loss' (Only 'squared_loss' is supported right now) + loss : 'squared_loss' (Only 'squared_loss' is supported right now) 'squared_loss' uses linear regression alpha: float (default = 0.0001) The constant value which decides the degree of regularization. 'alpha = 0' is equivalent to an ordinary least square, solved by the LinearRegression object. l1_ratio: float (default = 0.15) - The ElasticNet mixing parameter, with 0 <= l1_ratio <= 1. For l1_ratio = 0 the penalty is an L2 penalty. + The ElasticNet mixing parameter, with 0 <= l1_ratio <= 1. For l1_ratio = 0 the penalty is an L2 penalty. For l1_ratio = 1 it is an L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. 
fit_intercept : boolean (default = True) If True, the model tries to correct for the global mean of y. - If False, the model expects that you have centered the data. + If False, the model expects that you have centered the data. max_iter : int (default = 1000) The number of times the model should iterate through the entire dataset during training (default = 1000) tol : float (default = 1e-3) - The tolerance for the optimization: if the updates are smaller than tol, solver stops. + The tolerance for the optimization: if the updates are smaller than tol, solver stops. shuffle : boolean (default = True) - If set to ‘True’, a random coefficient is updated every iteration rather than looping over features sequentially by default. + If set to ‘True’, a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to ‘True’) often leads to significantly faster convergence especially when tol is higher than 1e-4. - + """ - - def __init__(self, loss='squared_loss', alpha=0.0001, l1_ratio=0.15, - fit_intercept=True, normalize=False, max_iter=1000, tol=1e-3, shuffle=True): - + + def __init__(self, loss='squared_loss', alpha=0.0001, l1_ratio=0.15, + fit_intercept=True, normalize=False, max_iter=1000, tol=1e-3, shuffle=True, + handle=None): + if loss in ['squared_loss']: self.loss = self._get_loss_int(loss) else: msg = "loss {!r} is not supported" raise NotImplementedError(msg.format(loss)) + super(CD, self).__init__(handle=handle, verbose=False) self.alpha = alpha self.l1_ratio = l1_ratio self.fit_intercept = fit_intercept @@ -201,15 +186,12 @@ class CD(Base): def fit(self, X, y): """ Fit the model with X and y. 
- Parameters ---------- X : cuDF DataFrame Dense matrix (floats or doubles) of shape (n_samples, n_features) - y: cuDF DataFrame Dense vector (floats or doubles) of shape (n_samples, 1) - """ cdef uintptr_t X_ptr @@ -229,14 +211,14 @@ class CD(Base): msg = "X matrix must be a cuDF dataframe or Numpy ndarray" raise TypeError(msg) - X_ptr = self._get_dev_array_ptr(X_m) + X_ptr = self._get_ctype_ptr(X_m) cdef uintptr_t y_ptr if (isinstance(y, cudf.Series)): - y_ptr = self._get_cudf_column_ptr(y) + y_ptr = self._get_column_ptr(y) elif (isinstance(y, np.ndarray)): y_m = cuda.to_device(y) - y_ptr = self._get_dev_array_ptr(y_m) + y_ptr = self._get_ctype_ptr(y_m) else: msg = "y vector must be a cuDF series or Numpy ndarray" raise TypeError(msg) @@ -244,62 +226,64 @@ class CD(Base): self.n_alpha = 1 self.coef_ = cudf.Series(np.zeros(self.n_cols, dtype=self.gdf_datatype)) - cdef uintptr_t coef_ptr = self._get_cudf_column_ptr(self.coef_) + cdef uintptr_t coef_ptr = self._get_column_ptr(self.coef_) - cdef float c_intercept1 + cdef float c_intercept1 cdef double c_intercept2 + cdef cumlHandle* handle_ = self.handle.getHandle() if self.gdf_datatype.type == np.float32: - cdFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept1, - self.fit_intercept, - self.normalize, - self.max_iter, - self.loss, - self.alpha, - self.l1_ratio, - self.shuffle, - self.tol) + cdFit(handle_[0], + X_ptr, + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept1, + self.fit_intercept, + self.normalize, + self.max_iter, + self.loss, + self.alpha, + self.l1_ratio, + self.shuffle, + self.tol) self.intercept_ = c_intercept1 else: - cdFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept2, - self.fit_intercept, - self.normalize, - self.max_iter, - self.loss, - self.alpha, - self.l1_ratio, - self.shuffle, - self.tol) - + cdFit(handle_[0], + X_ptr, + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept2, + self.fit_intercept, + self.normalize, 
+ self.max_iter, + self.loss, + self.alpha, + self.l1_ratio, + self.shuffle, + self.tol) + self.intercept_ = c_intercept2 + self.handle.sync() + return self def predict(self, X): """ Predicts the y for X. - Parameters ---------- X : cuDF DataFrame Dense matrix (floats or doubles) of shape (n_samples, n_features) - Returns ---------- y: cuDF DataFrame Dense vector (floats or doubles) of shape (n_samples, 1) - """ cdef uintptr_t X_ptr @@ -319,28 +303,33 @@ class CD(Base): msg = "X matrix format not supported" raise TypeError(msg) - X_ptr = self._get_dev_array_ptr(X_m) + X_ptr = self._get_ctype_ptr(X_m) - cdef uintptr_t coef_ptr = self._get_cudf_column_ptr(self.coef_) + cdef uintptr_t coef_ptr = self._get_column_ptr(self.coef_) preds = cudf.Series(np.zeros(n_rows, dtype=pred_datatype)) - cdef uintptr_t preds_ptr = self._get_cudf_column_ptr(preds) + cdef uintptr_t preds_ptr = self._get_column_ptr(preds) + cdef cumlHandle* handle_ = self.handle.getHandle() if pred_datatype.type == np.float32: - cdPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + cdPredict(handle_[0], + X_ptr, + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) else: - cdPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + cdPredict(handle_[0], + X_ptr, + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) + + self.handle.sync() del(X_m) diff --git a/python/cuml/solvers/sgd.pyx b/python/cuml/solvers/sgd.pyx index b38cebe416..4ac2bba9c5 100644 --- a/python/cuml/solvers/sgd.pyx +++ b/python/cuml/solvers/sgd.pyx @@ -13,10 +13,6 @@ # limitations under the License. 
# -# cython: profile=False -# distutils: language = c++ -# cython: embedsignature = True -# cython: language_level = 3 import ctypes import cudf @@ -28,11 +24,13 @@ from libcpp cimport bool from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free -from cuml.common.base import Base +import cuml +from cuml.common.handle cimport cumlHandle -cdef extern from "solver/solver_c.h" namespace "ML::Solver": +cdef extern from "solver/solver.hpp" namespace "ML::Solver": - cdef void sgdFit(float *input, + cdef void sgdFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float *labels, @@ -50,10 +48,11 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": float l1_ratio, bool shuffle, float tol, - int n_iter_no_change) + int n_iter_no_change) except + - - cdef void sgdFit(double *input, + + cdef void sgdFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, @@ -71,58 +70,57 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": double l1_ratio, bool shuffle, double tol, - int n_iter_no_change) - - cdef void sgdPredict(const float *input, - int n_rows, - int n_cols, + int n_iter_no_change) except + + + cdef void sgdPredict(cumlHandle& handle, + const float *input, + int n_rows, + int n_cols, const float *coef, - float intercept, + float intercept, float *preds, - int loss) + int loss) except + - cdef void sgdPredict(const double *input, - int n_rows, + cdef void sgdPredict(cumlHandle& handle, + const double *input, + int n_rows, int n_cols, - const double *coef, - double intercept, + const double *coef, + double intercept, double *preds, - int loss) - - cdef void sgdPredictBinaryClass(const float *input, - int n_rows, - int n_cols, + int loss) except + + + cdef void sgdPredictBinaryClass(cumlHandle& handle, + const float *input, + int n_rows, + int n_cols, const float *coef, - float intercept, + float intercept, float *preds, - int loss) + int loss) except + - cdef void sgdPredictBinaryClass(const 
double *input, - int n_rows, + cdef void sgdPredictBinaryClass(cumlHandle& handle, + const double *input, + int n_rows, int n_cols, - const double *coef, - double intercept, + const double *coef, + double intercept, double *preds, - int loss) + int loss) except + -class SGD(Base): +class SGD(cuml.Base): """ Stochastic Gradient Descent is a very common machine learning algorithm where one optimizes some cost function via gradient steps. This makes SGD very attractive for large problems when the exact solution is hard or even impossible to find. - cuML's SGD algorithm accepts a numpy matrix or a cuDF DataFrame as the input dataset. The SGD algorithm currently works with linear regression, ridge regression and SVM models. - Examples --------- - .. code-block:: python - import numpy as np import cudf from cuml.solvers import SGD as cumlSGD - X = cudf.DataFrame() X['col1'] = np.array([1,1,2,2], dtype = np.float32) X['col2'] = np.array([1,2,2,3], dtype = np.float32) @@ -130,32 +128,28 @@ class SGD(Base): pred_data = cudf.DataFrame() pred_data['col1'] = np.asarray([3, 2], dtype=datatype) pred_data['col2'] = np.asarray([5, 5], dtype=datatype) - cu_sgd = cumlSGD(learning_rate=lrate, eta0=0.005, epochs=2000, fit_intercept=True, batch_size=2, tol=0.0, penalty=penalty, loss=loss) - cu_sgd.fit(X, y) cu_pred = cu_sgd.predict(pred_data).to_array() print(" cuML intercept : ", cu_sgd.intercept_) print(" cuML coef : ", cu_sgd.coef_) print("cuML predictions : ", cu_pred) - Output: - .. 
code-block:: python - + cuML intercept : 0.004561662673950195 cuML coef : 0 0.9834546 1 0.010128272 dtype: float32 cuML predictions : [3.0055666 2.0221121] - - + + Parameters ----------- loss : 'hinge', 'log', 'squared_loss' (default = 'squared_loss') - 'hinge' uses linear SVM + 'hinge' uses linear SVM 'log' uses logistic regression 'squared_loss' uses linear regression penalty: 'none', 'l1', 'l2', 'elasticnet' (default = 'none') @@ -167,11 +161,11 @@ class SGD(Base): The constant value which decides the degree of regularization fit_intercept : boolean (default = True) If True, the model tries to correct for the global mean of y. - If False, the model expects that you have centered the data. + If False, the model expects that you have centered the data. epochs : int (default = 1000) The number of times the model should iterate through the entire dataset during training (default = 1000) tol : float (default = 1e-3) - The training process will stop if current_loss > previous_loss - tol + The training process will stop if current_loss > previous_loss - tol shuffle : boolean (default = True) True, shuffles the training data after each epoch False, does not shuffle the training data after each epoch @@ -186,15 +180,15 @@ class SGD(Base): The old learning rate is generally divide by 5 n_iter_no_change : int (default = 5) the number of epochs to train without any imporvement in the model - Notes ------ For additional docs, see `scikitlearn's OLS """ - - def __init__(self, loss='squared_loss', penalty='none', alpha=0.0001, l1_ratio=0.15, - fit_intercept=True, epochs=1000, tol=1e-3, shuffle=True, learning_rate='constant', eta0=0.0, power_t=0.5, batch_size=32, n_iter_no_change=5): - + + def __init__(self, loss='squared_loss', penalty='none', alpha=0.0001, l1_ratio=0.15, + fit_intercept=True, epochs=1000, tol=1e-3, shuffle=True, learning_rate='constant', eta0=0.0, + power_t=0.5, batch_size=32, n_iter_no_change=5, handle=None): + if loss in ['hinge', 'log', 'squared_loss']: 
self.loss = self._get_loss_int(loss) else: @@ -207,6 +201,7 @@ class SGD(Base): msg = "penalty {!r} is not supported" raise TypeError(msg.format(penalty)) + super(SGD, self).__init__(handle=handle, verbose=False) self.alpha = alpha self.l1_ratio = l1_ratio self.fit_intercept = fit_intercept @@ -215,7 +210,7 @@ class SGD(Base): self.shuffle = shuffle self.eta0 = eta0 self.power_t = power_t - + if learning_rate in ['optimal', 'constant', 'invscaling', 'adaptive']: self.learning_rate = learning_rate @@ -232,7 +227,7 @@ class SGD(Base): raise ValueError("alpha must be > 0 since " "learning_rate is 'optimal'. alpha is used " "to compute the optimal learning rate.") - + elif learning_rate == 'constant': self.lr_type = 1 self.lr = eta0 @@ -271,18 +266,24 @@ class SGD(Base): 'elasticnet': 3 }[penalty] + def _get_ctype_ptr(self, obj): + # The manner to access the pointers in the gdf's might change, so + # encapsulating access in the following 3 methods. They might also be + # part of future gdf versions. + return obj.device_ctypes_pointer.value + + def _get_column_ptr(self, obj): + return self._get_ctype_ptr(obj._column._data.to_gpu_array()) + def fit(self, X, y): """ Fit the model with X and y. 
- Parameters ---------- X : cuDF DataFrame Dense matrix (floats or doubles) of shape (n_samples, n_features) - y: cuDF DataFrame Dense vector (floats or doubles) of shape (n_samples, 1) - """ cdef uintptr_t X_ptr @@ -302,14 +303,14 @@ class SGD(Base): msg = "X matrix must be a cuDF dataframe or Numpy ndarray" raise TypeError(msg) - X_ptr = self._get_dev_array_ptr(X_m) + X_ptr = self._get_ctype_ptr(X_m) cdef uintptr_t y_ptr if (isinstance(y, cudf.Series)): - y_ptr = self._get_cudf_column_ptr(y) + y_ptr = self._get_column_ptr(y) elif (isinstance(y, np.ndarray)): y_m = cuda.to_device(y) - y_ptr = self._get_dev_array_ptr(y_m) + y_ptr = self._get_ctype_ptr(y_m) else: msg = "y vector must be a cuDF series or Numpy ndarray" raise TypeError(msg) @@ -317,72 +318,74 @@ class SGD(Base): self.n_alpha = 1 self.coef_ = cudf.Series(np.zeros(self.n_cols, dtype=self.gdf_datatype)) - cdef uintptr_t coef_ptr = self._get_cudf_column_ptr(self.coef_) + cdef uintptr_t coef_ptr = self._get_column_ptr(self.coef_) - cdef float c_intercept1 + cdef float c_intercept1 cdef double c_intercept2 - + cdef cumlHandle* handle_ = self.handle.getHandle() + if self.gdf_datatype.type == np.float32: - sgdFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept1, - self.fit_intercept, - self.batch_size, - self.epochs, - self.lr_type, - self.eta0, - self.power_t, - self.loss, - self.penalty, - self.alpha, - self.l1_ratio, - self.shuffle, - self.tol, + sgdFit(handle_[0], + X_ptr, + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept1, + self.fit_intercept, + self.batch_size, + self.epochs, + self.lr_type, + self.eta0, + self.power_t, + self.loss, + self.penalty, + self.alpha, + self.l1_ratio, + self.shuffle, + self.tol, self.n_iter_no_change) self.intercept_ = c_intercept1 else: - sgdFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept2, - self.fit_intercept, - self.batch_size, - self.epochs, - self.lr_type, - self.eta0, - self.power_t, - self.loss, - 
self.penalty, - self.alpha, - self.l1_ratio, - self.shuffle, - self.tol, + sgdFit(handle_[0], + X_ptr, + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept2, + self.fit_intercept, + self.batch_size, + self.epochs, + self.lr_type, + self.eta0, + self.power_t, + self.loss, + self.penalty, + self.alpha, + self.l1_ratio, + self.shuffle, + self.tol, self.n_iter_no_change) - + self.intercept_ = c_intercept2 + self.handle.sync() + return self def predict(self, X): """ Predicts the y for X. - Parameters ---------- X : cuDF DataFrame Dense matrix (floats or doubles) of shape (n_samples, n_features) - Returns ---------- y: cuDF DataFrame Dense vector (floats or doubles) of shape (n_samples, 1) - """ cdef uintptr_t X_ptr @@ -402,28 +405,34 @@ class SGD(Base): msg = "X matrix format not supported" raise TypeError(msg) - X_ptr = self._get_dev_array_ptr(X_m) + X_ptr = self._get_ctype_ptr(X_m) - cdef uintptr_t coef_ptr = self._get_cudf_column_ptr(self.coef_) + cdef uintptr_t coef_ptr = self._get_column_ptr(self.coef_) preds = cudf.Series(np.zeros(n_rows, dtype=pred_datatype)) - cdef uintptr_t preds_ptr = self._get_cudf_column_ptr(preds) + cdef uintptr_t preds_ptr = self._get_column_ptr(preds) + + cdef cumlHandle* handle_ = self.handle.getHandle() if pred_datatype.type == np.float32: - sgdPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + sgdPredict(handle_[0], + X_ptr, + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) else: - sgdPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + sgdPredict(handle_[0], + X_ptr, + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) + + self.handle.sync() del(X_m) @@ -432,17 +441,14 @@ class SGD(Base): def predictClass(self, X): """ Predicts the y for X. 
- Parameters ---------- X : cuDF DataFrame Dense matrix (floats or doubles) of shape (n_samples, n_features) - Returns ---------- y: cuDF DataFrame Dense vector (floats or doubles) of shape (n_samples, 1) - """ cdef uintptr_t X_ptr @@ -464,12 +470,14 @@ class SGD(Base): X_ptr = self._get_ctype_ptr(X_m) - cdef uintptr_t coef_ptr = self._get_cudf_column_ptr(self.coef_) + cdef uintptr_t coef_ptr = self._get_column_ptr(self.coef_) preds = cudf.Series(np.zeros(n_rows, dtype=pred_datatype)) - cdef uintptr_t preds_ptr = self._get_cudf_column_ptr(preds) - + cdef uintptr_t preds_ptr = self._get_column_ptr(preds) + cdef cumlHandle* handle_ = self.handle.getHandle() + if pred_datatype.type == np.float32: - sgdPredictBinaryClass(X_ptr, + sgdPredictBinaryClass(handle_[0], + X_ptr, n_rows, n_cols, coef_ptr, @@ -477,7 +485,8 @@ class SGD(Base): preds_ptr, self.loss) else: - sgdPredictBinaryClass(X_ptr, + sgdPredictBinaryClass(handle_[0], + X_ptr, n_rows, n_cols, coef_ptr, @@ -485,6 +494,8 @@ class SGD(Base): preds_ptr, self.loss) + self.handle.sync() + del(X_m) return preds From 6352c7c8c3d4ff0eb9072f89bba2b415708bd9a2 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Wed, 8 May 2019 19:23:57 -0400 Subject: [PATCH 052/156] cumlhandle added to cd. Also "except +" added to cd and sgd cython. 
--- cuML/src/solver/cd.h | 78 ++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 54 deletions(-) diff --git a/cuML/src/solver/cd.h b/cuML/src/solver/cd.h index c9d50d5477..25aecd534c 100644 --- a/cuML/src/solver/cd.h +++ b/cuML/src/solver/cd.h @@ -41,6 +41,8 @@ using namespace MLCommon; /** * Fits a linear, lasso, and elastic-net regression model using Coordinate Descent solver + * @param cumlHandle_impl + * Reference of cumlHandle * @param input * pointer to an array in column-major format (size of n_rows, n_cols) * @param n_rows @@ -71,10 +73,6 @@ using namespace MLCommon; * tolerance to stop the solver * @param stream * cuda stream - * @param cublas_handle - * cublas handle - * @param cusolver_handle - * cusolver handle */ template void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, @@ -92,33 +90,25 @@ void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, cublasHandle_t cublas_handle = handle.getCublasHandle(); cusolverDnHandle_t cusolver_handle = handle.getcusolverDnHandle(); - math_t *mu_input = nullptr; - math_t *mu_labels = nullptr; - math_t *norm2_input = nullptr; - math_t *pred = nullptr; - math_t *residual = nullptr; - math_t *squared = nullptr; - math_t *loss_value = nullptr; - - //auto allocator = handle.getDeviceAllocator(); - //device_buffer components_all(allocator, stream, len); - - allocate(loss_value, 1); - allocate(pred, n_rows, true); - allocate(residual, n_rows, true); - allocate(squared, n_cols, true); + auto allocator = handle.getDeviceAllocator(); + device_buffer pred(allocator, stream, n_rows); + device_buffer residual(allocator, stream, n_rows); + device_buffer squared(allocator, stream, n_cols); + device_buffer mu_input(allocator, stream, 0); + device_buffer mu_labels(allocator, stream, 0); + device_buffer norm2_input(allocator, stream, 0); std::vector h_coef(n_cols, math_t(0)); if (fit_intercept) { - allocate(mu_input, n_cols); - allocate(mu_labels, 1); + 
mu_input.reserve(n_cols, stream); + mu_labels.reserve(1, stream); if (normalize) { - allocate(norm2_input, n_cols); + norm2_input.reserve(n_cols, stream); } GLM::preProcessData(handle, input, n_rows, n_cols, labels, - intercept, mu_input, mu_labels, norm2_input, fit_intercept, + intercept, mu_input.data(), mu_labels.data(), norm2_input.data(), fit_intercept, normalize, stream); } @@ -131,14 +121,14 @@ void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, if (normalize) { math_t scalar = math_t(1.0) + l2_alpha; - Matrix::setValue(squared, squared, scalar, n_cols, stream); + Matrix::setValue(squared.data(), squared.data(), scalar, n_cols, stream); } else { - LinAlg::colNorm(squared, input, n_cols, n_rows, LinAlg::L2Norm, false, + LinAlg::colNorm(squared.data(), input, n_cols, n_rows, LinAlg::L2Norm, false, stream); - LinAlg::addScalar(squared, squared, l2_alpha, n_cols, stream); + LinAlg::addScalar(squared.data(), squared.data(), l2_alpha, n_cols, stream); } - copy(residual, labels, n_rows, stream); + copy(residual.data(), labels, n_rows, stream); for (int i = 0; i < epochs; i++) { if (i > 0 && shuffle) { @@ -152,13 +142,13 @@ void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, for (int j = 0; j < n_cols; j++) { int ci = ri[j]; math_t *coef_loc = coef + ci; - math_t *squared_loc = squared + ci; + math_t *squared_loc = squared.data() + ci; math_t *input_col_loc = input + (ci * n_rows); - LinAlg::multiplyScalar(pred, input_col_loc, h_coef[ci], n_rows, + LinAlg::multiplyScalar(pred.data(), input_col_loc, h_coef[ci], n_rows, stream); - LinAlg::add(residual, residual, pred, n_rows, stream); - LinAlg::gemm(input_col_loc, n_rows, 1, residual, coef_loc, 1, 1, + LinAlg::add(residual.data(), residual.data(), pred.data(), n_rows, stream); + LinAlg::gemm(input_col_loc, n_rows, 1, residual.data(), coef_loc, 1, 1, CUBLAS_OP_T, CUBLAS_OP_N, cublas_handle, stream); if (l1_ratio > math_t(0.0)) @@ -177,9 +167,9 @@ void 
cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, if (abs(h_coef[ci]) > coef_max) coef_max = abs(h_coef[ci]); - LinAlg::multiplyScalar(pred, input_col_loc, h_coef[ci], n_rows, + LinAlg::multiplyScalar(pred.data(), input_col_loc, h_coef[ci], n_rows, stream); - LinAlg::subtract(residual, residual, pred, n_rows, stream); + LinAlg::subtract(residual.data(), residual.data(), pred.data(), n_rows, stream); } bool flag_continue = true; @@ -198,33 +188,13 @@ void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, if (fit_intercept) { GLM::postProcessData(handle, input, n_rows, n_cols, labels, - coef, intercept, mu_input, mu_labels, norm2_input, + coef, intercept, mu_input.data(), mu_labels.data(), norm2_input.data(), fit_intercept, normalize, stream); - if (mu_input != nullptr) - CUDA_CHECK(cudaFree(mu_input)); - if (mu_labels != nullptr) - CUDA_CHECK(cudaFree(mu_labels)); - if (normalize) { - if (norm2_input != nullptr) - cudaFree(norm2_input); - } } else { *intercept = math_t(0); } - if (pred != nullptr) - CUDA_CHECK(cudaFree(pred)); - - if (residual != nullptr) - CUDA_CHECK(cudaFree(residual)); - - if (squared != nullptr) - CUDA_CHECK(cudaFree(squared)); - - if (loss_value != nullptr) - CUDA_CHECK(cudaFree(loss_value)); - } /** From 37fcbd8dccb9d2c1790837a01238103e8d3ea837 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Wed, 8 May 2019 22:00:21 -0400 Subject: [PATCH 053/156] cumlhandle added to sgd --- cuML/src/glm/preprocess.h | 58 +++++++++++++++++++--------------- ml-prims/src/functions/hinge.h | 1 - 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/cuML/src/glm/preprocess.h b/cuML/src/glm/preprocess.h index c5cfb7997c..8a903825af 100644 --- a/cuML/src/glm/preprocess.h +++ b/cuML/src/glm/preprocess.h @@ -32,11 +32,13 @@ namespace GLM { using namespace MLCommon; template -void preProcessData(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, math_t *labels, - math_t *intercept, math_t 
*mu_input, math_t *mu_labels, math_t *norm2_input, - bool fit_intercept, bool normalize, cudaStream_t stream) { - auto cublas_handle = handle.getCublasHandle(); - auto cusolver_handle = handle.getcusolverDnHandle(); +void preProcessData(const cumlHandle_impl& handle, math_t *input, int n_rows, + int n_cols, math_t *labels, math_t *intercept, math_t *mu_input, + math_t *mu_labels, math_t *norm2_input, bool fit_intercept, + bool normalize, cudaStream_t stream) { + + auto cublas_handle = handle.getCublasHandle(); + auto cusolver_handle = handle.getcusolverDnHandle(); ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); @@ -45,50 +47,54 @@ void preProcessData(const cumlHandle_impl& handle, math_t *input, int n_rows, in if (fit_intercept) { Stats::mean(mu_input, input, n_cols, n_rows, false, false, stream); - Stats::meanCenter(input, input, mu_input, n_cols, n_rows, false, true, stream); + Stats::meanCenter(input, input, mu_input, n_cols, n_rows, false, true, + stream); Stats::mean(mu_labels, labels, 1, n_rows, false, false, stream); - Stats::meanCenter(labels, labels, mu_labels, 1, n_rows, false, true, stream); + Stats::meanCenter(labels, labels, mu_labels, 1, n_rows, false, true, + stream); if (normalize) { LinAlg::colNorm(norm2_input, input, n_cols, n_rows, LinAlg::L2Norm, false, - stream, - []__device__(math_t v){ return MLCommon::mySqrt(v); }); - Matrix::matrixVectorBinaryDivSkipZero(input, norm2_input, n_rows, n_cols, false, true, stream, true); + stream, + []__device__(math_t v) {return MLCommon::mySqrt(v);}); + Matrix::matrixVectorBinaryDivSkipZero(input, norm2_input, n_rows, + n_cols, false, true, stream, true); } } } template -void postProcessData(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, math_t *labels, math_t *coef, - math_t *intercept, math_t *mu_input, math_t *mu_labels, math_t *norm2_input, +void postProcessData(const cumlHandle_impl& handle, math_t *input, int n_rows, + int n_cols, math_t *labels, 
math_t *coef, math_t *intercept, + math_t *mu_input, math_t *mu_labels, math_t *norm2_input, bool fit_intercept, bool normalize, cudaStream_t stream) { - auto cublas_handle = handle.getCublasHandle(); - auto cusolver_handle = handle.getcusolverDnHandle(); + + auto cublas_handle = handle.getCublasHandle(); + auto cusolver_handle = handle.getcusolverDnHandle(); ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two"); - math_t *d_intercept; - allocate(d_intercept, 1); + auto allocator = handle.getDeviceAllocator(); + device_buffer d_intercept(allocator, stream, 1); if (normalize) { - Matrix::matrixVectorBinaryMult(input, norm2_input, n_rows, n_cols, false, true, stream); - Matrix::matrixVectorBinaryDivSkipZero(coef, norm2_input, 1, n_cols, - false, true, stream, true); + Matrix::matrixVectorBinaryMult(input, norm2_input, n_rows, n_cols, + false, true, stream); + Matrix::matrixVectorBinaryDivSkipZero(coef, norm2_input, 1, n_cols, + false, true, stream, true); } - LinAlg::gemm(mu_input, 1, n_cols, coef, d_intercept, 1, 1, - CUBLAS_OP_N, CUBLAS_OP_N, cublas_handle, stream); + LinAlg::gemm(mu_input, 1, n_cols, coef, d_intercept.data(), 1, 1, + CUBLAS_OP_N, CUBLAS_OP_N, cublas_handle, stream); - LinAlg::subtract(d_intercept, mu_labels, d_intercept, 1, stream); - updateHost(intercept, d_intercept, 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - if (d_intercept != NULL) - cudaFree(d_intercept); + LinAlg::subtract(d_intercept.data(), mu_labels, d_intercept.data(), 1, + stream); + updateHost(intercept, d_intercept.data(), 1, stream); Stats::meanAdd(input, input, mu_input, n_cols, n_rows, false, true, stream); Stats::meanAdd(labels, labels, mu_labels, 1, n_rows, false, true, stream); diff --git a/ml-prims/src/functions/hinge.h b/ml-prims/src/functions/hinge.h index 08f4128c1a..dc3aeccac9 100644 --- a/ml-prims/src/functions/hinge.h +++ 
b/ml-prims/src/functions/hinge.h @@ -1,4 +1,3 @@ - /* * Copyright (c) 2018, NVIDIA CORPORATION. * From 799f766291591d0fd797963b30730cf883f5a237 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Thu, 9 May 2019 12:27:51 -0400 Subject: [PATCH 054/156] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a5846b2ae2..c9d2ee485f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - PR #482: Introduce cumlHandle for pca and tsvd - PR #573: Remove use of unnecessary cuDF column and series copies +- PR #579: Introduce cumlHandle for cd and sgd, and propagate C++ errors in cython level for cd and sgd ## Bug Fixes - PR #584: Added missing virtual destructor to deviceAllocator and hostAllocator From 1656ed497fac8a1ee1591867a947b1191e398479 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 12:35:41 -0500 Subject: [PATCH 055/156] FEA Add build cuml and different test flags --- cpp/CMakeLists.txt | 300 ++++++++++++++++++++++++++------------------- 1 file changed, 173 insertions(+), 127 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index aeeb2a6c47..cb3f6b7639 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -14,9 +14,10 @@ # limitations under the License. 
#============================================================================= - set (CMAKE_FIND_NO_INSTALL_PREFIX TRUE FORCE) -cmake_minimum_required(VERSION 3.13 FATAL_ERROR) + +cmake_minimum_required(VERSION 3.12 FATAL_ERROR) + project(CUML VERSION 0.8.0 LANGUAGES CXX CUDA) ################################################################################################### @@ -37,20 +38,23 @@ endif() ################################################################################################### # - User Options --------------------------------------------------------------------------------- -set(CMAKE_IGNORE_PATH "${CMAKE_INSTALL_DIR}/lib" CACHE STRING - "Ignore any libs added implicitly from the CMAKE_INSTALL_DIR") +option(BUILD_CUML_CPP_LIBRARY "Build libcuml++ shared library" ON) + +option(BUILD_CUML_TESTS "Build cuML algorithm tests" ON) -option(LINEINFO "Enable lineinfo in nvcc" OFF) +option(BUILD_CUML_MG_TESTS "Build cuML multigpu algorithm tests" ON) -option(KERNELINFO "Enable kernel resource usage info" OFF) +option(BUILD_PRIM_TESTS "Build ml-prim tests" ON) -option(DEBUG "Get a debug build" OFF) +option(BUILD_CUML_EXAMPLES "Build C++ API usage examples" ON) -option(BUILD_CUML_TESTS "Build cuML algorithm tests" ON) +option(CMAKE_CXX11_ABI "Enable the GLIBCXX11 ABI" ON) + +option(DISABLE_OPENMP "Disable OpenMP" OFF) -option(BUILD_PRIM_TESTS "Build ml-prim tests" OFF) +option(KERNEL_INFO "Enable kernel resource usage info" OFF) -option(BUILD_CUML_EXAMPLES "Build C++ API usage examples" OFF) +option(LINE_INFO "Enable lineinfo in nvcc" OFF) set(BLAS_LIBRARIES "" CACHE STRING "Location of BLAS library") @@ -58,15 +62,14 @@ set(BLAS_LIBRARIES "" CACHE STRING set(GPU_ARCHS "" CACHE STRING "List of GPU architectures (semicolon-separated) to be compiled for") -if(NOT "${GPU_ARCHS}") - set(GPU_ARCHS "60") - if((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9)) - set(GPU_ARCHS "${GPU_ARCHS};70") - endif() - if((CUDA_VERSION_MAJOR EQUAL 10) OR 
(CUDA_VERSION_MAJOR GREATER 10)) - set(GPU_ARCHS "${GPU_ARCHS};75") - endif() -endif() +set(CMAKE_IGNORE_PATH "${CMAKE_INSTALL_DIR}/lib" CACHE STRING + "Ignore any libs added implicitly from the CMAKE_INSTALL_DIR") + +# Bulding cuml_test or cuml_mg_test executables forces building libcuml++ +if(BUILD_CUML_TESTS OR BUILD_CUML_MG_TESTS) + set(BUILD_CUML_CPP_LIBRARY ON) +endif(BUILD_CUML_TESTS OR BUILD_CUML_MG_TESTS) + ################################################################################################### # - Requirements ---------------------------------------------------------------------------------- @@ -75,6 +78,9 @@ find_package(CUDA 9.0 REQUIRED) if (NOT DISABLE_OPENMP OR NOT ${DISABLE_OPENMP}) find_package(OpenMP) + if(OPENMP_FOUND) + message(STATUS "OpenMP found in ${OPENMP_INCLUDE_DIRS}") + endif(OPENMP_FOUND) endif(NOT DISABLE_OPENMP OR NOT ${DISABLE_OPENMP}) find_package(ZLIB REQUIRED) @@ -102,20 +108,26 @@ endif() # endif() ################################################################################################### -# - Submodules ------------------------------------------------------------------------------------ +# - External Dependencies-------------------------------------------------------------------------- set(GTEST_DIR ${PROJECT_SOURCE_DIR}/external/ml-prims/external/googletest/googletest CACHE STRING "Path to the googletest repo") + set(GTEST_LIBNAME "gtest_main" CACHE STRING "Name of the googletest library") + set(FAISS_DIR ${PROJECT_SOURCE_DIR}/external/faiss CACHE STRING "Path to FAISS source directory") + set(MLPRIMS_DIR ${PROJECT_SOURCE_DIR}/src_prims/ CACHE STRING "Path to the ml-prims repo") + set(CUB_DIR ${PROJECT_SOURCE_DIR}/external/ml-prims/external/cub CACHE STRING "Path to cub repo") + set(CUTLASS_DIR ${PROJECT_SOURCE_DIR}/external/ml-prims/external/cutlass CACHE STRING "Path to the cutlass repo") + set(CUDA_nvgraph_LIBRARY ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvgraph.so CACHE STRING "Path to nvGraph lib") @@ -123,6 
+135,7 @@ set(CUDA_nvgraph_LIBRARY ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvgraph.so CACHE STRI ################################################################################################### # - Compiler Options ----------------------------------------------------------------------------- +# TODO: Update to c++14, github issue set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -135,19 +148,29 @@ endif(OPENMP_FOUND) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++11") -if(LINEINFO) +if(LINE_INFO) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") -endif() +endif(LINE_INFO) -if(KERNELINFO) +if(KERNEL_INFO) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas=-v") -endif() +endif(KERNEL_INFO) -if(DEBUG) +if(CMAKE_BUILD_TYPE MATCHES Debug) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G -g") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") endif() +if(NOT "${GPU_ARCHS}") + set(GPU_ARCHS "60") + if((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9)) + set(GPU_ARCHS "${GPU_ARCHS};70") + endif() + if((CUDA_VERSION_MAJOR EQUAL 10) OR (CUDA_VERSION_MAJOR GREATER 10)) + set(GPU_ARCHS "${GPU_ARCHS};75") + endif() +endif() + foreach(arch ${GPU_ARCHS}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${arch},code=sm_${arch}") endforeach() @@ -166,65 +189,66 @@ if(CMAKE_COMPILER_IS_GNUCXX) endif(NOT CMAKE_CXX11_ABI) endif(CMAKE_COMPILER_IS_GNUCXX) - -## end of other compiler options - ################################################################################################### # - FAISS Build ---------------------------------------------------------------------------------- -# Configuration of faiss for the correct architectures - -# TODO: Update faiss submodule and use new flags -file(READ ${FAISS_DIR}/makefile.inc.in CONFIG_FILE) - string(REPLACE "-Xcudafe --diag_suppress=unrecognized_attribute" - "--disable-warnings" - CONFIG_FILE ${CONFIG_FILE}) - string(REPLACE 
"compute_35,code=\"compute_35\"" "compute_60,code=\"sm_60\"" - CONFIG_FILE ${CONFIG_FILE}) - string(REPLACE "compute_52,code=\"compute_52\"" "compute_61,code=\"sm_61\"" - CONFIG_FILE ${CONFIG_FILE}) - string(REPLACE "compute_60,code=\"compute_60\"" "compute_70,code=\"sm_70\"" - CONFIG_FILE ${CONFIG_FILE}) - -if((CUDA_VERSION_MAJOR EQUAL 9)) - # Do not generate Turing on 9.2 - string(REPLACE "-gencode arch=compute_61,code=\"compute_61\" " "" - CONFIG_FILE ${CONFIG_FILE}) +if(BUILD_CUML_CPP_LIBRARY) -endif() -if((CUDA_VERSION_MAJOR EQUAL 10) OR (CUDA_VERSION_MAJOR GREATER 10)) - string(REPLACE "compute_61,code=\"compute_61\"" "compute_70,code=\"sm_75\"" - CONFIG_FILE ${CONFIG_FILE}) - file(WRITE ${FAISS_DIR}/makefile.inc.in "${CONFIG_FILE}") -endif() + # Configuration of faiss for the correct architectures -file(WRITE ${FAISS_DIR}/makefile.inc.in "${CONFIG_FILE}") + # TODO: Update faiss submodule and use new flags + file(READ ${FAISS_DIR}/makefile.inc.in CONFIG_FILE) + string(REPLACE "-Xcudafe --diag_suppress=unrecognized_attribute" + "--disable-warnings" + CONFIG_FILE ${CONFIG_FILE}) + string(REPLACE "compute_35,code=\"compute_35\"" "compute_60,code=\"sm_60\"" + CONFIG_FILE ${CONFIG_FILE}) + string(REPLACE "compute_52,code=\"compute_52\"" "compute_61,code=\"sm_61\"" + CONFIG_FILE ${CONFIG_FILE}) + string(REPLACE "compute_60,code=\"compute_60\"" "compute_70,code=\"sm_70\"" + CONFIG_FILE ${CONFIG_FILE}) + if((CUDA_VERSION_MAJOR EQUAL 9)) + # Do not generate Turing on 9.2 + string(REPLACE "-gencode arch=compute_61,code=\"compute_61\" " "" + CONFIG_FILE ${CONFIG_FILE}) -include (ExternalProject) -ExternalProject_Add(faiss - SOURCE_DIR ${FAISS_DIR} - CONFIGURE_COMMAND LIBS=-pthread CPPFLAGS=-w LDFLAGS=-L${CMAKE_INSTALL_PREFIX}/lib ${FAISS_DIR}/configure --prefix=${CMAKE_CURRENT_BINARY_DIR}/faiss --with-blas=${BLAS_LIBRARIES} --with-cuda=${CUDA_TOOLKIT_ROOT_DIR} --quiet - PREFIX ${CMAKE_CURRENT_BINARY_DIR}/faiss/ - BUILD_COMMAND $(MAKE) - INSTALL_COMMAND $(MAKE) -s 
install > /dev/null || $(MAKE) && cd gpu && $(MAKE) -s install > /dev/null || $(MAKE) - BUILD_IN_SOURCE 1 - PREFIX=${CMAKE_CURRENT_BINARY_DIR}/faiss -) + endif() + if((CUDA_VERSION_MAJOR EQUAL 10) OR (CUDA_VERSION_MAJOR GREATER 10)) + string(REPLACE "compute_61,code=\"compute_61\"" "compute_70,code=\"sm_75\"" + CONFIG_FILE ${CONFIG_FILE}) + file(WRITE ${FAISS_DIR}/makefile.inc.in "${CONFIG_FILE}") + endif() -ExternalProject_Get_Property(faiss install_dir) + file(WRITE ${FAISS_DIR}/makefile.inc.in "${CONFIG_FILE}") -################################################################################################### -# - include paths --------------------------------------------------------------------------------- -add_library(faisslib STATIC IMPORTED) -add_library(gpufaisslib STATIC IMPORTED) + include (ExternalProject) + ExternalProject_Add(faiss + SOURCE_DIR ${FAISS_DIR} + CONFIGURE_COMMAND LIBS=-pthread CPPFLAGS=-w LDFLAGS=-L${CMAKE_INSTALL_PREFIX}/lib ${FAISS_DIR}/configure --prefix=${CMAKE_CURRENT_BINARY_DIR}/faiss --with-blas=${BLAS_LIBRARIES} --with-cuda=${CUDA_TOOLKIT_ROOT_DIR} --quiet + PREFIX ${CMAKE_CURRENT_BINARY_DIR}/faiss/ + BUILD_COMMAND $(MAKE) + INSTALL_COMMAND $(MAKE) -s install > /dev/null || $(MAKE) && cd gpu && $(MAKE) -s install > /dev/null || $(MAKE) + BUILD_IN_SOURCE 1 + PREFIX=${CMAKE_CURRENT_BINARY_DIR}/faiss + ) + + ExternalProject_Get_Property(faiss install_dir) + + add_library(faisslib STATIC IMPORTED) + add_library(gpufaisslib STATIC IMPORTED) -add_dependencies(faisslib faiss) -add_dependencies(gpufaisslib faiss) + add_dependencies(faisslib faiss) + add_dependencies(gpufaisslib faiss) -set_property(TARGET faisslib PROPERTY IMPORTED_LOCATION ${FAISS_DIR}/libfaiss.a) -set_property(TARGET gpufaisslib PROPERTY IMPORTED_LOCATION ${FAISS_DIR}/gpu/libgpufaiss.a) + set_property(TARGET faisslib PROPERTY IMPORTED_LOCATION ${FAISS_DIR}/libfaiss.a) + set_property(TARGET gpufaisslib PROPERTY IMPORTED_LOCATION ${FAISS_DIR}/gpu/libgpufaiss.a) + 
+endif(BUILD_CUML_CPP_LIBRARY) + +################################################################################################### +# - include paths --------------------------------------------------------------------------------- include_directories( src @@ -233,8 +257,6 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/faiss/include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${GTEST_DIR}/include - # ${MLPRIMS_DIR}/src - # ${MLPRIMS_DIR}/test ${CUTLASS_DIR} ${CUB_DIR} ${ZLIB_INCLUDE_DIRS$}) @@ -242,72 +264,83 @@ include_directories( add_subdirectory(${GTEST_DIR} ${PROJECT_BINARY_DIR}/googletest) file(GLOB_RECURSE ml_prims_header "src_prims/*.h" "src_prims/*.hpp") -file(GLOB_RECURSE cuml_test_cuda_sources "test/sg/*.cu") -file(GLOB_RECURSE cuml_mg_test_cuda_sources "test/mg/*.cu") -file(GLOB_RECURSE mlprims_test_cuda_sources "test/prims/*.cu") ################################################################################################### # - build libcuml++ shared library ------------------------------------------------------------------ -set(CUML_CPP_TARGET "cuml++") -add_library(${CUML_CPP_TARGET} SHARED - src/pca/pca.cu - src/tsvd/tsvd.cu - src/dbscan/dbscan.cu - src/kmeans/kmeans.cu - src/glm/glm.cu - src/knn/knn.cu - src/kalman_filter/lkf_py.cu - src/common/cumlHandle.cpp - src/common/cuml_api.cpp - src/umap/umap.cu - src/solver/solver.cu - src/metrics/metrics.cu - src/decisiontree/decisiontree.cu - src/randomforest/randomforest.cu) - -set(CUML_LINK_LIBRARIES - ${CUDA_cublas_LIBRARY} - ${CUDA_curand_LIBRARY} - ${CUDA_cusolver_LIBRARY} - ${CUDA_CUDART_LIBRARY} - ${CUDA_cusparse_LIBRARY} - ${CUDA_nvgraph_LIBRARY} - ${ZLIB_LIBRARIES} - gpufaisslib - faisslib) - -if(OPENMP_FOUND) - set(CUML_LINK_LIBRARIES ${CUML_LINK_LIBRARIES} OpenMP::OpenMP_CXX pthread) -endif(OPENMP_FOUND) - -target_link_libraries(${CUML_CPP_TARGET} ${CUML_LINK_LIBRARIES}) +if(BUILD_CUML_CPP_LIBRARY) + + set(CUML_CPP_TARGET "cuml++") + add_library(${CUML_CPP_TARGET} SHARED + 
src/common/cumlHandle.cpp + src/common/cuml_api.cpp + src/dbscan/dbscan.cu + src/decisiontree/decisiontree.cu + src/glm/glm.cu + src/kalman_filter/lkf_py.cu + src/kmeans/kmeans.cu + src/knn/knn.cu + src/metrics/metrics.cu + src/pca/pca.cu + src/randomforest/randomforest.cu + src/solver/solver.cu + src/tsvd/tsvd.cu + src/umap/umap.cu) + + set(CUML_LINK_LIBRARIES + ${CUDA_cublas_LIBRARY} + ${CUDA_curand_LIBRARY} + ${CUDA_cusolver_LIBRARY} + ${CUDA_CUDART_LIBRARY} + ${CUDA_cusparse_LIBRARY} + ${CUDA_nvgraph_LIBRARY} + ${ZLIB_LIBRARIES} + gpufaisslib + faisslib) + + if(OPENMP_FOUND) + set(CUML_LINK_LIBRARIES ${CUML_LINK_LIBRARIES} OpenMP::OpenMP_CXX pthread) + endif(OPENMP_FOUND) + + target_link_libraries(${CUML_CPP_TARGET} ${CUML_LINK_LIBRARIES}) + +endif(BUILD_CUML_CPP_LIBRARY) ################################################################################################### # - build ml_test executable ---------------------------------------------------------------------- -add_executable(ml_test - ${cuml_test_cuda_sources} - ${ml_prims_header}) +if(BUILD_CUML_TESTS) -target_link_libraries(ml_test - ${GTEST_LIBNAME} - ${CUDA_cublas_LIBRARY} - ${CUDA_curand_LIBRARY} - ${CUDA_cusolver_LIBRARY} - ${CUDA_cusparse_LIBRARY} - ${CUDA_CUDART_LIBRARY} - gpufaisslib - ${CUDA_cusparse_LIBRARY} - ${CUDA_nvgraph_LIBRARY} - faisslib - ${CUML_CPP_TARGET} - pthread - ${ZLIB_LIBRARIES}) + file(GLOB_RECURSE cuml_test_cuda_sources "test/sg/*.cu") + + add_executable(ml_test + ${cuml_test_cuda_sources} + ${ml_prims_header}) + + target_link_libraries(ml_test + ${GTEST_LIBNAME} + ${CUDA_cublas_LIBRARY} + ${CUDA_curand_LIBRARY} + ${CUDA_cusolver_LIBRARY} + ${CUDA_cusparse_LIBRARY} + ${CUDA_CUDART_LIBRARY} + gpufaisslib + ${CUDA_cusparse_LIBRARY} + ${CUDA_nvgraph_LIBRARY} + faisslib + ${CUML_CPP_TARGET} + pthread + ${ZLIB_LIBRARIES}) + +endif(BUILD_CUML_TESTS) ################################################################################################### # - build ml_mg_test executable 
------------------------------------------------------------------- +if(BUILD_CUML_MG_TESTS) + +file(GLOB_RECURSE cuml_mg_test_cuda_sources "test/mg/*.cu") + add_executable(ml_mg_test ${cuml_mg_test_cuda_sources} ${ml_prims_header}) @@ -327,9 +360,13 @@ target_link_libraries(ml_mg_test pthread ${ZLIB_LIBRARIES}) +endif(BUILD_CUML_MG_TESTS) + ################################################################################################### # - build prims_test executable ---------------------------------------------------------------- +file(GLOB_RECURSE mlprims_test_cuda_sources "test/prims/*.cu") + set(MLPRIMS_LINK_LIBRARIES ${CUDA_cublas_LIBRARY} ${CUDA_curand_LIBRARY} @@ -357,3 +394,12 @@ endif(DISABLE_EXAMPLES OR ${BUILD_CUML_EXAMPLES}) # - install targets ------------------------------------------------------------------------------- install(TARGETS ${CUML_CPP_TARGET} DESTINATION lib) + +################################################################################################### +# - doxygen targets ------------------------------------------------------------------------------- + +# include(cmake/doxygen.cmake) +# add_doxygen_target(IN_DOXYFILE ${MLPRIMS_DIR}/Doxyfile.in +# OUT_DOXYFILE ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile +# CWD ${CMAKE_CURRENT_BINARY_DIR}) + From d6451ce266b6a8d6924ce6ebdd523e1d669e0a77 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 9 May 2019 15:03:46 -0400 Subject: [PATCH 056/156] Getting very close to being finished unit tests for prims. 
--- cuML/src/dbscan/adjgraph/algo.h | 19 +++- cuML/src/dbscan/adjgraph/pack.h | 3 + cuML/src/dbscan/adjgraph/runner.h | 4 +- cuML/src/dbscan/runner.h | 2 +- ml-prims/src/sparse/csr.h | 154 +++++++++++++++++------------- ml-prims/test/csr.cu | 129 +++++++++++++++++++++++++ 6 files changed, 235 insertions(+), 76 deletions(-) diff --git a/cuML/src/dbscan/adjgraph/algo.h b/cuML/src/dbscan/adjgraph/algo.h index e71697b188..78ffef9948 100644 --- a/cuML/src/dbscan/adjgraph/algo.h +++ b/cuML/src/dbscan/adjgraph/algo.h @@ -55,12 +55,23 @@ void launcher(const ML::cumlHandle_impl& handle, Pack data, Type batchSize int minPts = data.minPts; int *vd = data.vd; - MLCommon::Sparse::csr_adj_graph_batched(data.ex_scan, data.N, batchSize, - data.adj, data.adj_graph, - [core_pts, minPts, vd] __device__ (Type row, Type start_idx) { + std::cout << MLCommon::arr2Str(data.ex_scan, batchSize, "ex_scan", stream) << std::endl; + std::cout << MLCommon::arr2Str(data.adj, batchSize*data.N, "adj", stream) << std::endl; + + MLCommon::Sparse::csr_adj_graph_batched( + data.ex_scan, + data.N, + data.adjnnz, + batchSize, + data.adj, + data.adj_graph, + stream, + [core_pts, minPts, vd] __device__ (Type row, Type start_idx, Type stop_idx) { // fuse the operation of core points construction core_pts[row] = (vd[row] >= minPts); - }, stream); + }); + + std::cout << MLCommon::arr2Str(data.adj_graph, data.adjnnz, "adj_graph", stream) << std::endl; CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cuML/src/dbscan/adjgraph/pack.h b/cuML/src/dbscan/adjgraph/pack.h index cab9f07e3d..d6da455f84 100644 --- a/cuML/src/dbscan/adjgraph/pack.h +++ b/cuML/src/dbscan/adjgraph/pack.h @@ -31,6 +31,9 @@ struct Pack { bool *adj; /** the adjacency graph */ Type *adj_graph; + + Type adjnnz; + /** exculusive scan generated from vd */ Type *ex_scan; /** array to store whether a vertex is core poType or not */ diff --git a/cuML/src/dbscan/adjgraph/runner.h b/cuML/src/dbscan/adjgraph/runner.h index 8c184a5061..865607b221 100644 
--- a/cuML/src/dbscan/adjgraph/runner.h +++ b/cuML/src/dbscan/adjgraph/runner.h @@ -25,9 +25,9 @@ namespace Dbscan { namespace AdjGraph { template -void run(const ML::cumlHandle_impl& handle, bool* adj, int* vd, Type* adj_graph, Type* ex_scan, Type N, +void run(const ML::cumlHandle_impl& handle, bool* adj, int* vd, Type* adj_graph, Type adjnnz, Type* ex_scan, Type N, Type minpts, bool* core_pts, int algo, Type batchSize, cudaStream_t stream) { - Pack data = {vd, adj, adj_graph, ex_scan, core_pts, N, minpts}; + Pack data = {vd, adj, adj_graph, adjnnz, ex_scan, core_pts, N, minpts}; switch(algo) { case 0: Naive::launcher(handle, data, batchSize, stream); diff --git a/cuML/src/dbscan/runner.h b/cuML/src/dbscan/runner.h index 078cce9457..d38feaf745 100644 --- a/cuML/src/dbscan/runner.h +++ b/cuML/src/dbscan/runner.h @@ -119,7 +119,7 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Type N, Type D, Type_f adj_graph.resize(adjlen, stream); } - AdjGraph::run(handle, adj, vd, adj_graph.data(), ex_scan, N, minPts, core_pts, + AdjGraph::run(handle, adj, vd, adj_graph.data(), adjlen, ex_scan, N, minPts, core_pts, algoAdj, nPoints, stream); MLCommon::Sparse::weak_cc_batched( diff --git a/ml-prims/src/sparse/csr.h b/ml-prims/src/sparse/csr.h index 21914732ba..169de278ab 100644 --- a/ml-prims/src/sparse/csr.h +++ b/ml-prims/src/sparse/csr.h @@ -248,7 +248,7 @@ __global__ void csr_row_normalize_l1_kernel( * @param result: l1 normalized data array * @param stream: cuda stream to use */ -template +template void csr_row_normalize_l1( int* const ia, // csr row ex_scan (sorted by row) T* const vals, int nnz, // array of values and number of non-zeros @@ -263,7 +263,7 @@ void csr_row_normalize_l1( m, result); } -template +template __global__ void csr_row_normalize_max_kernel( int *ia, // csr row ind array (sorted by row) T *vals, int nnz, // array of values and number of non-zeros @@ -312,7 +312,7 @@ __global__ void csr_row_normalize_max_kernel( * @param stream: cuda stream 
to use */ -template +template void csr_row_normalize_max( int* const ia, // csr row ind array (sorted by row) T* const vals, int nnz, // array of values and number of non-zeros @@ -339,7 +339,7 @@ __device__ int get_stop_idx(T row, int m, int nnz, T *ind) { return stop_idx; } -template +template __global__ void csr_to_coo_kernel(int *row_ind, int m, int *coo_rows, int nnz) { // row-based matrix 1 thread per row @@ -370,7 +370,7 @@ void csr_to_coo(int *row_ind, int m, int *coo_rows, int nnz, } -template +template __global__ void csr_add_calc_row_counts_kernel( int *a_ind, int *a_indptr, T *a_val, int nnz1, int *b_ind, int *b_indptr, T *b_val, int nnz2, @@ -428,7 +428,7 @@ __global__ void csr_add_calc_row_counts_kernel( } -template +template __global__ void csr_add_kernel( int *a_ind, int *a_indptr, T *a_val, int nnz1, int *b_ind, int *b_indptr, T *b_val, int nnz2, @@ -493,7 +493,7 @@ __global__ void csr_add_kernel( * @param out_ind: output row_ind array * @param stream: cuda stream to use */ -template +template size_t csr_add_calc_inds( int* const a_ind, int* const a_indptr, T* const a_val, int nnz1, int* const b_ind, int* const b_indptr, T* const b_val, int nnz2, @@ -543,7 +543,7 @@ size_t csr_add_calc_inds( * @param c_val: output data array * @param stream: cuda stream to use */ -template +template void csr_add_finalize( int* const a_ind, int* const a_indptr, T* const a_val, int nnz1, int* const b_ind, int* const b_indptr, T* const b_val, int nnz2, @@ -561,13 +561,14 @@ void csr_add_finalize( CUDA_CHECK(cudaPeekAtLastError()); } -template -__global__ void csr_row_op_batched_kernel(T* const row_ind, T total_rows, - T batchSize, Lambda op) { +template void> +__global__ void csr_row_op_kernel(T* const row_ind, T n_rows, + T nnz, Lambda op) { T row = blockIdx.x*TPB_X + threadIdx.x; - if(row < batchSize) { + if(row < n_rows) { T start_idx = row_ind[row]; - op(row, start_idx); + T stop_idx = row < n_rows-1 ? 
row_ind[row+1] : nnz; + op(row, start_idx, stop_idx); } } @@ -579,33 +580,19 @@ __global__ void csr_row_op_batched_kernel(T* const row_ind, T total_rows, * @param row_ind the CSR row_ind array to perform parallel operations over * @param total_rows total number vertices in graph * @param batchSize size of row_ind - * @param op custom row operation functor + * @param op custom row operation functor accepting the row and beginning index. * @param stream cuda stream to use */ -template -void csr_row_op_batched(T* const row_ind, T total_rows, T batchSize, +templatevoid> +void csr_row_op(T* const row_ind, T n_rows, T nnz, Lambda op, cudaStream_t stream) { - dim3 grid(MLCommon::ceildiv(batchSize, TPB_X), 1, 1); + dim3 grid(MLCommon::ceildiv(n_rows, TPB_X), 1, 1); dim3 blk(TPB_X, 1, 1); + csr_row_op_kernel<<>> + (row_ind, n_rows, nnz, op); - csr_row_op_batched_kernel<<>> - (row_ind, total_rows, batchSize, op); -} - -/** - * @brief Perform a custom row operation on a CSR matrix. - * @tparam T numerical type of row_ind array - * @tparam TPB_X number of threads per block to use for underlying kernel - * @tparam Lambda type of custom operation function - * @param row_ind the CSR row_ind array to perform parallel operations over - * @param n_rows total number vertices in graph (size of row_ind) - * @param op custom row operation functor - * @param stream cuda stream to use - */ -template -void csr_row_op(T* const row_ind, T n_rows, Lambda op, cudaStream_t stream) { - csr_row_op_batched(row_ind, n_rows, n_rows, op, stream); + CUDA_CHECK(cudaPeekAtLastError()); } /** @@ -617,31 +604,36 @@ void csr_row_op(T* const row_ind, T n_rows, Lambda op, cudaStream_t stream) { * @param row_ind the input CSR row_ind array * @param total_rows number of vertices in graph * @param batchSize number of vertices in current batch - * @param adj an adjacency array + * @param adj an adjacency array (size batchSize * total_rows) * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph * @param 
stream cuda stream to use */ - -template -void csr_adj_graph_batched(T* const row_ind, T total_rows, T batchSize, - bool* const adj, T *row_ind_ptr, Lambda fused_op, cudaStream_t stream) { - - csr_row_op_batched(row_ind, total_rows, batchSize, +templatevoid> +void csr_adj_graph_batched(T* const row_ind, T total_rows, T nnz, T batchSize, + bool* const adj, T *row_ind_ptr, cudaStream_t stream, Lambda fused_op) { + csr_row_op(row_ind, batchSize, nnz, [fused_op, adj, total_rows, row_ind_ptr, batchSize] __device__ - (T row, T start_idx) { - - fused_op(row, start_idx); - int k = 0; - for(T i=0; ivoid> +void csr_adj_graph_batched(T* const row_ind, T total_rows, T nnz, T batchSize, + bool* const adj, T *row_ind_ptr, cudaStream_t stream) { + csr_adj_graph_batched(row_ind, total_rows, nnz, batchSize, adj, + row_ind_ptr, stream, [] __device__ (T row, T start_idx, T stop_idx) {}); +} + /** * @brief Constructs an adjacency graph CSR row_ind_ptr array from a * a row_ind array and adjacency array. @@ -653,14 +645,15 @@ void csr_adj_graph_batched(T* const row_ind, T total_rows, T batchSize, * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph * @param stream cuda stream to use */ -template -void csr_adj_graph(T* const row_ind, T n_rows, - bool* const adj, T *row_ind_ptr, cudaStream_t stream) { - csr_adj_graph_batched(row_ind, n_rows, n_rows, adj, - row_ind_ptr, stream); +templatevoid> +void csr_adj_graph(T* const row_ind, T total_rows, T nnz, + bool* const adj, T *row_ind_ptr, cudaStream_t stream, Lambda fused_op) { + + csr_adj_graph_batched(row_ind, total_rows, nnz, total_rows, adj, + row_ind_ptr, stream, fused_op); } -template +template class WeakCCState { public: @@ -693,7 +686,7 @@ class WeakCCState { } }; -template +template __global__ void weak_cc_label_device( Type *labels, Type *row_ind, Type *row_ind_ptr, Type nnz, @@ -732,7 +725,7 @@ __global__ void weak_cc_label_device( } -template +template __global__ void weak_cc_init_label_kernel(Type *labels, int 
startVertexId, int batchSize, Type MAX_LABEL, Lambda filter_op) { /** F1 and F2 in the paper correspond to fa and xa */ @@ -744,7 +737,7 @@ __global__ void weak_cc_init_label_kernel(Type *labels, int startVertexId, int b } } -template +template __global__ void weak_cc_init_all_kernel(Type *labels, bool *fa, bool *xa, Type N, Type MAX_LABEL) { int tid = threadIdx.x + blockIdx.x*TPB_X; @@ -755,7 +748,7 @@ __global__ void weak_cc_init_all_kernel(Type *labels, bool *fa, bool *xa, } } -template +template void weak_cc_label_batched(Type *labels, Type* const row_ind, Type* const row_ind_ptr, Type nnz, Type N, WeakCCState *state, @@ -771,6 +764,7 @@ void weak_cc_label_batched(Type *labels, weak_cc_init_label_kernel<<>>(labels, startVertexId, batchSize, MAX_LABEL, filter_op); + CUDA_CHECK(cudaPeekAtLastError()); do { CUDA_CHECK( cudaMemsetAsync(state->m, false, sizeof(bool), stream) ); weak_cc_label_device<<>>( @@ -778,6 +772,7 @@ void weak_cc_label_batched(Type *labels, row_ind, row_ind_ptr, nnz, state->fa, state->xa, state->m, startVertexId, batchSize); + CUDA_CHECK(cudaPeekAtLastError()); //** swapping F1 and F2 MLCommon::updateHost(host_fa, state->fa, N, stream); @@ -812,23 +807,33 @@ void weak_cc_label_batched(Type *labels, * @param filter_op an optional filtering function to determine which points * should get considered for labeling. 
*/ -template +templatebool> void weak_cc_batched(Type *labels, Type* const row_ind, Type* const row_ind_ptr, Type nnz, Type N, Type startVertexId, Type batchSize, - WeakCCState *state, cudaStream_t stream, - Lambda filter_op = [] __device__ (int tid) {return true;}) { + WeakCCState *state, cudaStream_t stream, Lambda filter_op) { dim3 blocks(ceildiv(N, TPB_X)); dim3 threads(TPB_X); Type MAX_LABEL = std::numeric_limits::max(); - if(startVertexId == 0) + if(startVertexId == 0) { weak_cc_init_all_kernel<<>> (labels, state->fa, state->xa, N, MAX_LABEL); + CUDA_CHECK(cudaPeekAtLastError()); + } weak_cc_label_batched(labels, row_ind, row_ind_ptr, nnz, N, state, startVertexId, batchSize, stream, filter_op); } +template +void weak_cc_batched(Type *labels, Type* const row_ind, Type* const row_ind_ptr, + Type nnz, Type N, Type startVertexId, Type batchSize, + WeakCCState *state, cudaStream_t stream) { + + weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, startVertexId, batchSize, + state, stream, [] __device__ (int tid) {return true;}); +} + /** * @brief Compute weakly connected components. Note that the resulting labels * may not be taken from a monotonically increasing set (eg. numbers may be @@ -847,18 +852,29 @@ void weak_cc_batched(Type *labels, Type* const row_ind, Type* const row_ind_ptr * @param filter_op an optional filtering function to determine which points * should get considered for labeling. 
*/ -template +templatebool> void weak_cc(Type *labels, Type* const row_ind, Type* const row_ind_ptr, - Type nnz, Type N, cudaStream_t stream, - Lambda filter_op = [] __device__ (int tid) {return true;}) { + Type nnz, Type N, cudaStream_t stream, Lambda filter_op) { - WeakCCState state; + WeakCCState state(N); weak_cc_batched( labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, filter_op); } +template +void weak_cc(Type *labels, Type* const row_ind, Type* const row_ind_ptr, + Type nnz, Type N, cudaStream_t stream) { + + WeakCCState state(N); + weak_cc_batched( + labels, row_ind, row_ind_ptr, + nnz, N, 0, N, stream, + [](Type t){return true;}); +} + + }; }; diff --git a/ml-prims/test/csr.cu b/ml-prims/test/csr.cu index c3230d3709..5af18e649b 100644 --- a/ml-prims/test/csr.cu +++ b/ml-prims/test/csr.cu @@ -223,6 +223,135 @@ TEST_P(CSRSum, Result) { CUDA_CHECK(cudaFree(result_val)); } +typedef CSRTest CSRRowOpTest; +TEST_P(CSRRowOpTest, Result) { + + cudaStream_t stream; + cudaStreamCreate(&stream); + + int *ex_scan; + float *result, *verify; + + int ex_scan_h[4] = {0, 4, 8, 9 }; + + float verify_h[10] = { 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0 }; + + allocate(verify, 10); + allocate(ex_scan, 4); + allocate(result, 10, true); + + updateDevice(ex_scan, *&ex_scan_h, 4, stream); + updateDevice(verify, *&verify_h, 10, stream); + + csr_row_op(ex_scan, 4, 10, + [result] __device__ (int row, int start_idx, int stop_idx) { + for(int i = start_idx; i < stop_idx; i++ ) + result[i] = row; + }, stream); + + std::cout << MLCommon::arr2Str(result, 10, "result", stream) << std::endl; + + ASSERT_TRUE(devArrMatch(verify, result, 10, Compare())); + + cudaStreamDestroy(stream); + + CUDA_CHECK(cudaFree(ex_scan)); + CUDA_CHECK(cudaFree(verify)); + CUDA_CHECK(cudaFree(result)); +} + +typedef CSRTest AdjGraphTest; +TEST_P(AdjGraphTest, Result) { + + cudaStream_t stream; + cudaStreamCreate(&stream); + + int *row_ind, *result, *verify; + bool *adj; + + int row_ind_h[3] = {0, 3, 6 }; 
+ bool adj_h[18] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + int verify_h[9] = { 0, 1, 2, 0, 1, 2, 0, 1, 2 }; + + allocate(row_ind, 3); + allocate(adj, 18); + allocate(result, 9, true); + allocate(verify, 9); + + updateDevice(row_ind, *&row_ind_h, 3, stream); + updateDevice(adj, *&adj_h, 18, stream); + updateDevice(verify, *&verify_h, 9, stream); + + csr_adj_graph_batched(row_ind, 6, 9, 3, adj, result, stream); + + CUDA_CHECK(cudaDeviceSynchronize()); + + ASSERT_TRUE(devArrMatch(verify, result, 9, Compare())); + + cudaStreamDestroy(stream); + + CUDA_CHECK(cudaFree(row_ind)); + CUDA_CHECK(cudaFree(adj)); + CUDA_CHECK(cudaFree(verify)); + CUDA_CHECK(cudaFree(result)); +} + +typedef CSRTest WeakCCTest; +TEST_P(WeakCCTest, Result) { + + cudaStream_t stream; + cudaStreamCreate(&stream); + + int *row_ind, *row_ind_ptr, *result, *verify; + bool *adj; + + int row_ind_h[3] = {0, 3, 6 }; + int row_ind_ptr_h[9] = { 0, 1, 2, 0, 1, 2, 0, 1, 2 }; + int verify_h[6] = { 0, 1, 2, 0, 1, 2 }; + + allocate(row_ind, 3); + allocate(row_ind_ptr, 9); + allocate(result, 9, true); + allocate(verify, 9); + + updateDevice(row_ind, *&row_ind_h, 3, stream); + updateDevice(row_ind_ptr, *&row_ind_ptr_h, 9, stream); + updateDevice(verify, *&verify_h, 6, stream); + + WeakCCState state(6); +// + + std::cout << "Running..." 
<< std::endl; + weak_cc_batched(result, row_ind, row_ind_ptr, 9, 6, 0, 3, &state, stream); + + std::cout << "Result so far" << std::endl; + + std::cout << MLCommon::arr2Str(result, 6, "labels", stream) << std::endl; + + CUDA_CHECK(cudaDeviceSynchronize()); + + ASSERT_TRUE(devArrMatch(verify, result, 9, Compare())); + + cudaStreamDestroy(stream); + + CUDA_CHECK(cudaFree(row_ind)); + CUDA_CHECK(cudaFree(adj)); + CUDA_CHECK(cudaFree(verify)); + CUDA_CHECK(cudaFree(result)); +} + +INSTANTIATE_TEST_CASE_P(CSRTests, WeakCCTest, + ::testing::ValuesIn(inputsf)); + + +INSTANTIATE_TEST_CASE_P(CSRTests, AdjGraphTest, + ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(CSRTests, CSRRowOpTest, + ::testing::ValuesIn(inputsf)); + + INSTANTIATE_TEST_CASE_P(CSRTests, CSRToCOO, ::testing::ValuesIn(inputsf)); From 9d8032b784dc13f44bb743d616ad6e30a4ba720d Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 9 May 2019 15:12:04 -0400 Subject: [PATCH 057/156] Tests passing for CSR --- ml-prims/test/csr.cu | 51 ++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/ml-prims/test/csr.cu b/ml-prims/test/csr.cu index 5af18e649b..72f24f29e6 100644 --- a/ml-prims/test/csr.cu +++ b/ml-prims/test/csr.cu @@ -21,6 +21,7 @@ #include "random/rng.h" #include "test_utils.h" +#include #include namespace MLCommon { @@ -61,13 +62,8 @@ TEST_P(CSRToCOO, Result) { csr_to_coo<32>(ex_scan, 4, result, 10, stream); - std::cout << MLCommon::arr2Str(result, 10, "result", stream) << std::endl; - ASSERT_TRUE(devArrMatch(verify, result, 10, Compare(), stream)); - - std::cout << "Verified!" 
<< std::endl; - delete ex_scan_h; delete verify_h; @@ -105,8 +101,6 @@ TEST_P(CSRRowNormalizeMax, Result) { csr_row_normalize_max<32, float>(ex_scan, in_vals, 10, 4, result, stream); - std::cout << MLCommon::arr2Str(result, 10, "result", stream) << std::endl; - ASSERT_TRUE(devArrMatch(verify, result, 10, Compare())); cudaStreamDestroy(stream); @@ -249,8 +243,6 @@ TEST_P(CSRRowOpTest, Result) { result[i] = row; }, stream); - std::cout << MLCommon::arr2Str(result, 10, "result", stream) << std::endl; - ASSERT_TRUE(devArrMatch(verify, result, 10, Compare())); cudaStreamDestroy(stream); @@ -285,8 +277,6 @@ TEST_P(AdjGraphTest, Result) { csr_adj_graph_batched(row_ind, 6, 9, 3, adj, result, stream); - CUDA_CHECK(cudaDeviceSynchronize()); - ASSERT_TRUE(devArrMatch(verify, result, 9, Compare())); cudaStreamDestroy(stream); @@ -304,39 +294,48 @@ TEST_P(WeakCCTest, Result) { cudaStreamCreate(&stream); int *row_ind, *row_ind_ptr, *result, *verify; - bool *adj; - int row_ind_h[3] = {0, 3, 6 }; - int row_ind_ptr_h[9] = { 0, 1, 2, 0, 1, 2, 0, 1, 2 }; - int verify_h[6] = { 0, 1, 2, 0, 1, 2 }; + int row_ind_h1[3] = {0, 3, 6 }; + int row_ind_ptr_h1[9] = { 0, 1, 2, 0, 1, 2, 0, 1, 2 }; + int verify_h1[6] = { 1, 1, 1, 2147483647, 2147483647, 2147483647 }; + + int row_ind_h2[3] = {0, 2, 4 }; + int row_ind_ptr_h2[5] = { 3, 4, 3, 4, 5 }; + int verify_h2[6] = { 1, 1, 1, 5, 5, 5 }; allocate(row_ind, 3); allocate(row_ind_ptr, 9); allocate(result, 9, true); allocate(verify, 9); - updateDevice(row_ind, *&row_ind_h, 3, stream); - updateDevice(row_ind_ptr, *&row_ind_ptr_h, 9, stream); - updateDevice(verify, *&verify_h, 6, stream); - WeakCCState state(6); -// - std::cout << "Running..." 
<< std::endl; + /** + * Run batch #1 + */ + updateDevice(row_ind, *&row_ind_h1, 3, stream); + updateDevice(row_ind_ptr, *&row_ind_ptr_h1, 9, stream); + updateDevice(verify, *&verify_h1, 6, stream); + weak_cc_batched(result, row_ind, row_ind_ptr, 9, 6, 0, 3, &state, stream); - std::cout << "Result so far" << std::endl; + ASSERT_TRUE(devArrMatch(verify, result, 6, Compare())); - std::cout << MLCommon::arr2Str(result, 6, "labels", stream) << std::endl; + /** + * Run batch #2 + */ + updateDevice(row_ind, *&row_ind_h2, 3, stream); + updateDevice(row_ind_ptr, *&row_ind_ptr_h2, 5, stream); + updateDevice(verify, *&verify_h2, 6, stream); - CUDA_CHECK(cudaDeviceSynchronize()); + weak_cc_batched(result, row_ind, row_ind_ptr, 5, 6, 4, 3, &state, stream); - ASSERT_TRUE(devArrMatch(verify, result, 9, Compare())); + ASSERT_TRUE(devArrMatch(verify, result, 6, Compare())); cudaStreamDestroy(stream); CUDA_CHECK(cudaFree(row_ind)); - CUDA_CHECK(cudaFree(adj)); + CUDA_CHECK(cudaFree(row_ind_ptr)); CUDA_CHECK(cudaFree(verify)); CUDA_CHECK(cudaFree(result)); } From 96377c016d136bbabfadc7fbd3b11c4e396fcf46 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Thu, 9 May 2019 15:27:49 -0400 Subject: [PATCH 058/156] Adding default fused_op to epsilon neighborhood. 
Prepping for testing --- cuML/src/dbscan/vertexdeg/algo.h | 6 ++---- ml-prims/src/distance/distance.h | 9 +++++---- ml-prims/test/dist_eps.h | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 8 deletions(-) create mode 100644 ml-prims/test/dist_eps.h diff --git a/cuML/src/dbscan/vertexdeg/algo.h b/cuML/src/dbscan/vertexdeg/algo.h index 20468b61fb..b5e6980aaa 100644 --- a/cuML/src/dbscan/vertexdeg/algo.h +++ b/cuML/src/dbscan/vertexdeg/algo.h @@ -68,15 +68,13 @@ void launcher(const ML::cumlHandle_impl& handle, Pack data, int startVe MLCommon::Distance::epsilon_neighborhood (data.x, data.x+startVertexId*k, data.adj, m, n, k, eps2, - (void*)workspace.data(), workspaceSize, - + (void*)workspace.data(), workspaceSize, stream, [vd, n] __device__ (int global_c_idx, bool in_neigh) { // fused construction of vertex degree int batch_vertex = global_c_idx - (n * (global_c_idx / n)); atomicAdd(vd+batch_vertex, in_neigh); atomicAdd(vd+n, in_neigh); - }, - stream + } ); CUDA_CHECK(cudaPeekAtLastError()); diff --git a/ml-prims/src/distance/distance.h b/ml-prims/src/distance/distance.h index ec1e66fd94..507c613577 100644 --- a/ml-prims/src/distance/distance.h +++ b/ml-prims/src/distance/distance.h @@ -233,13 +233,14 @@ void distance(InType *x, InType *y, OutType *dist, int m, int n, int k, * * @param stream cuda stream */ -template +templatevoid> size_t epsilon_neighborhood(T *a, T *b, bool *adj, int m, int n, int k, T eps, - void *workspace, size_t worksize, Lambda fused_op, cudaStream_t stream) { + void *workspace, size_t worksize, cudaStream_t stream, + Lambda fused_op = [] __device__(int o, bool t){}) { auto epsilon_op = [n, eps, fused_op] __device__ (T val, int global_c_idx) { - int acc = val <= eps; + bool acc = val <= eps; fused_op(global_c_idx, acc); - return bool(acc); + return acc; }; distance diff --git a/ml-prims/test/dist_eps.h b/ml-prims/test/dist_eps.h new file mode 100644 index 0000000000..9ef3f8fe4e --- /dev/null +++ b/ml-prims/test/dist_eps.h @@ -0,0 +1,15 
@@ +/* + * dist_eps.h + * + * Created on: May 9, 2019 + * Author: cjnolet + */ + +#ifndef DIST_EPS_H_ +#define DIST_EPS_H_ + + + + + +#endif /* DIST_EPS_H_ */ From 832ebdc608b2613b7ab19f2bb834d975d2de7a02 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 14:53:13 -0500 Subject: [PATCH 059/156] DOC BUILD.md updated with new flags and build process --- BUILD.md | 56 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/BUILD.md b/BUILD.md index 08f514ec18..a1ed2fe6c3 100644 --- a/BUILD.md +++ b/BUILD.md @@ -12,7 +12,9 @@ To install cuML from source, ensure the dependencies are met: 6. gcc (>=5.4.0) 7. BLAS - Any BLAS compatible with cmake's [FindBLAS](https://cmake.org/cmake/help/v3.12/module/FindBLAS.html). Note that the blas has to be installed to the same folder system as cmake, for example if using conda installed cmake, the blas implementation should also be installed in the conda environment. -### Installing from Source: +## Installing from Source: + +### Typical Process Once dependencies are present, follow the steps below: @@ -21,9 +23,9 @@ Once dependencies are present, follow the steps below: $ git clone --recurse-submodules https://github.com/rapidsai/cuml.git ``` -2. Build and install `libcuml` (the C++/CUDA library containing the cuML algorithms), starting from the repository root folder: +2. Build and install `libcuml++` (C++/CUDA library containing the cuML algorithms), starting from the repository root folder: ```bash -$ cd cuML +$ cd cpp $ mkdir build $ cd build $ export CUDA_BIN_PATH=$CUDA_HOME # (optional env variable if cuda binary is not in the PATH. Default CUDA_HOME=/path/to/cuda/) @@ -47,7 +49,7 @@ The configuration script will print the BLAS found on the search path. If the ve If using conda and a conda installed cmake, the `openblas` conda package is recommended and can be explicitly specified for `blas` and `lapack`: ```bash -cmake .. 
-DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DBLAS_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.so -DLAPACK_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.so +cmake .. -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DBLAS_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.so ``` Additionally, to reduce compile times, you can specify a GPU compute capability to compile for, for example for Volta GPUs: @@ -56,8 +58,9 @@ Additionally, to reduce compile times, you can specify a GPU compute capability $ cmake .. -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DGPU_ARCHS="70" ``` +There are many options to configure the build process, see the [customizing build section](#custom-build-options). -3. Build `libcuml`: +3. Build `libcuml++`: ```bash $ make -j @@ -67,12 +70,16 @@ $ make install To run tests (optional): ```bash -$ ./ml_test +$ ./ml_test # Single GPU algorithm tests +$ ./ml_mg_test # Multi GPU algorithm tests +$ ./prims_test # ML Primitive function tests ``` If you want a list of the available tests: ```bash -$ ./ml_test --gtest_list_tests +$ ./ml_test --gtest_list_tests # Single GPU algorithm tests +$ ./ml_mg_test --gtest_list_tests # Multi GPU algorithm tests +$ ./prims_test --gtest_list_tests # ML Primitive function tests ``` 4. Build the `cuml` python package: @@ -85,12 +92,12 @@ $ python setup.py build_ext --inplace To run Python tests (optional): ```bash -$ py.test -v +$ pytest -v ``` If you want a list of the available tests: ```bash -$ py.test cuML/test --collect-only +$ pytest cuML/test --collect-only ``` 5. Finally, install the Python package to your Python path: @@ -99,20 +106,27 @@ $ py.test cuML/test --collect-only $ python setup.py install ``` -6. You can also build and run tests for the machine learning primitive header only library located in the `ml-prims` folder. From the repository root: +### Custom Build Options -```bash -$ cd ml-prims -$ mkdir build -$ cd build -$ cmake .. 
-DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DGPU_ARCHS="70" # specifying GPU_ARCH is optional, but significantly reduces compile time -$ make -j -``` +cuML's cmake has the following configurable flags available: -To run the ml-prim tests: -```bash -$./test/mlcommon_test -``` + + +| Flag | Possible Values | Default Value | Behavior | +| --- | --- | --- | --- | +| BLAS_LIBRARIES | path/to/blas_lib | "" | Optional variable allowing to manually specify location of BLAS library. | +| BUILD_CUML_CPP_LIBRARY | [ON, OFF] | ON | Enable/disable building libcuml++ shared library. If either BUILD_CUML_TESTS or BUILD_CUML_MG_TESTS are set to ON, this variable is forced to be ON | +| BUILD_CUML_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_test`. | +| BUILD_CUML_MG_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_mg_test`. | +| BUILD_PRIM_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `prims_test`. | +| BUILD_CUML_EXAMPLES | [ON, OFF] | ON | Enable/disable building cuML C++ API usage examples. | +| CMAKE_CXX11_ABI | [ON, OFF] | ON | Enable/disable the GLIBCXX11 ABI | +| DISABLE_OPENMP | [ON, OFF] | OFF | Set to `ON` to disable OpenMP | +| GPU_ARCHS | List of GPU architectures, semicolon-separated | 60;70;75 | List of GPU architectures that all artifacts are compiled for. | +| KERNEL_INFO | [ON, OFF] | OFF | Enable/disable kernel resource usage info in nvcc. | +| LINE_INFO | [ON, OFF] | OFF | Enable/disable lineinfo in nvcc. 
| + + From 26c82aa074144d87a980ada1fcd2a284e9c0ffdd Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 15:45:18 -0500 Subject: [PATCH 060/156] FIX Move submodules to /thirdparty folder --- .gitmodules | 8 ++++---- thirdparty/{ml-prims => }/cub | 0 thirdparty/cuml/ml-prims | 1 - thirdparty/{ml-prims => }/cutlass | 0 thirdparty/{cuml => }/faiss | 0 thirdparty/{ml-prims => }/googletest | 0 6 files changed, 4 insertions(+), 5 deletions(-) rename thirdparty/{ml-prims => }/cub (100%) delete mode 120000 thirdparty/cuml/ml-prims rename thirdparty/{ml-prims => }/cutlass (100%) rename thirdparty/{cuml => }/faiss (100%) rename thirdparty/{ml-prims => }/googletest (100%) diff --git a/.gitmodules b/.gitmodules index 6c4581998b..e8800d3c1e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,18 +1,18 @@ [submodule "thirdparty/cuml/faiss"] - path = thirdparty/cuml/faiss + path = thirdparty/faiss url = https://github.com/facebookresearch/faiss ignore = dirty [submodule "external/googletest"] path = thirdparty/cuml/googletest url = https://github.com/google/googletest [submodule "ml-prims/external/cutlass"] - path = thirdparty/ml-prims/cutlass + path = thirdparty/cutlass url = https://github.com/NVIDIA/cutlass [submodule "ml-prims/external/cub"] - path = thirdparty/ml-prims/cub + path = thirdparty/cub url = https://github.com/NVlabs/cub [submodule "ml-prims/external/googletest"] - path = thirdparty/ml-prims/googletest + path = thirdparty/googletest url = https://github.com/google/googletest [submodule "thirdparty/h2o4gpu"] path = thirdparty/h2o4gpu diff --git a/thirdparty/ml-prims/cub b/thirdparty/cub similarity index 100% rename from thirdparty/ml-prims/cub rename to thirdparty/cub diff --git a/thirdparty/cuml/ml-prims b/thirdparty/cuml/ml-prims deleted file mode 120000 index 92127ef8db..0000000000 --- a/thirdparty/cuml/ml-prims +++ /dev/null @@ -1 +0,0 @@ -../../ml-prims/ \ No newline at end of file diff --git a/thirdparty/ml-prims/cutlass b/thirdparty/cutlass 
similarity index 100% rename from thirdparty/ml-prims/cutlass rename to thirdparty/cutlass diff --git a/thirdparty/cuml/faiss b/thirdparty/faiss similarity index 100% rename from thirdparty/cuml/faiss rename to thirdparty/faiss diff --git a/thirdparty/ml-prims/googletest b/thirdparty/googletest similarity index 100% rename from thirdparty/ml-prims/googletest rename to thirdparty/googletest From bd5c20feb4a0723bd0ccfddd7e48634cc5ff39d3 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 16:45:57 -0500 Subject: [PATCH 061/156] FIX Correct submodule path --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index e8800d3c1e..08e5ab13bc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,4 +1,4 @@ -[submodule "thirdparty/cuml/faiss"] +[submodule "thirdparty/faiss"] path = thirdparty/faiss url = https://github.com/facebookresearch/faiss ignore = dirty From be343b62f1d0ff709e9a307f5e1c8ef483fd9128 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 17:09:34 -0500 Subject: [PATCH 062/156] FIX Correct more submodule paths --- .gitmodules | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.gitmodules b/.gitmodules index 08e5ab13bc..50d5f45f96 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,16 +2,16 @@ path = thirdparty/faiss url = https://github.com/facebookresearch/faiss ignore = dirty -[submodule "external/googletest"] +[submodule "thirdparty/cuml/googletest"] path = thirdparty/cuml/googletest url = https://github.com/google/googletest -[submodule "ml-prims/external/cutlass"] +[submodule "thirdparty/cutlass"] path = thirdparty/cutlass url = https://github.com/NVIDIA/cutlass -[submodule "ml-prims/external/cub"] +[submodule "thirdparty/cub"] path = thirdparty/cub url = https://github.com/NVlabs/cub -[submodule "ml-prims/external/googletest"] +[submodule "thirdparty/googletest"] path = thirdparty/googletest url = https://github.com/google/googletest [submodule 
"thirdparty/h2o4gpu"] From 048a8a7802129bd17948bd1551824525d1ec2cc7 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 17:12:59 -0500 Subject: [PATCH 063/156] FIX Remove cuml googletest submodule --- .gitmodules | 3 --- thirdparty/cuml/googletest | 1 - 2 files changed, 4 deletions(-) delete mode 160000 thirdparty/cuml/googletest diff --git a/.gitmodules b/.gitmodules index 50d5f45f96..2a58b3f13d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,9 +2,6 @@ path = thirdparty/faiss url = https://github.com/facebookresearch/faiss ignore = dirty -[submodule "thirdparty/cuml/googletest"] - path = thirdparty/cuml/googletest - url = https://github.com/google/googletest [submodule "thirdparty/cutlass"] path = thirdparty/cutlass url = https://github.com/NVIDIA/cutlass diff --git a/thirdparty/cuml/googletest b/thirdparty/cuml/googletest deleted file mode 160000 index bc2d0935b7..0000000000 --- a/thirdparty/cuml/googletest +++ /dev/null @@ -1 +0,0 @@ -Subproject commit bc2d0935b74917be0821bfd834472ed9cc4a3b5b From 1e483f3fb68e2db0aa29c94c6e1a5aed8e3c82fc Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 17:15:55 -0500 Subject: [PATCH 064/156] FIX Remove h204gpu submodule --- .gitmodules | 3 --- thirdparty/h2o4gpu | 1 - 2 files changed, 4 deletions(-) delete mode 160000 thirdparty/h2o4gpu diff --git a/.gitmodules b/.gitmodules index 2a58b3f13d..875a1d43c0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -11,6 +11,3 @@ [submodule "thirdparty/googletest"] path = thirdparty/googletest url = https://github.com/google/googletest -[submodule "thirdparty/h2o4gpu"] - path = thirdparty/h2o4gpu - url = https://github.com/h2oai/h2o4gpu diff --git a/thirdparty/h2o4gpu b/thirdparty/h2o4gpu deleted file mode 160000 index 7670008364..0000000000 --- a/thirdparty/h2o4gpu +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 76700083643ec227818c5fd29659bd426a9d06c0 From e2743660f3bb3d26209780537b596eea794b0a58 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre 
Date: Thu, 9 May 2019 17:20:56 -0500 Subject: [PATCH 065/156] DOC Update build table font size --- BUILD.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/BUILD.md b/BUILD.md index a1ed2fe6c3..04723e5555 100644 --- a/BUILD.md +++ b/BUILD.md @@ -110,9 +110,6 @@ $ python setup.py install cuML's cmake has the following configurable flags available: - - - | Flag | Possible Values | Default Value | Behavior | | --- | --- | --- | --- | | BLAS_LIBRARIES | path/to/blas_lib | "" | Optional variable allowing to manually specify location of BLAS library. | @@ -127,6 +124,5 @@ cuML's cmake has the following configurable flags available: | KERNEL_INFO | [ON, OFF] | OFF | Enable/disable kernel resource usage info in nvcc. | | LINE_INFO | [ON, OFF] | OFF | Enable/disable lineinfo in nvcc. | - From b8ccdbe4e91efc57e38aa95a43489aab82ed28c5 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 17:39:27 -0500 Subject: [PATCH 066/156] FIX fix symbolic link --- cpp/external | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/external b/cpp/external index 7cebe27a22..dcef490fbc 120000 --- a/cpp/external +++ b/cpp/external @@ -1 +1 @@ -../thirdparty/cuml/ \ No newline at end of file +../thirdparty/ \ No newline at end of file From 207e6c04d1498b0c6bed973c46b06317715d431f Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Thu, 9 May 2019 18:42:05 -0400 Subject: [PATCH 067/156] Cython side of cumlhandle added for ols and ridge. 
--- cuML/src/glm/glm.cu | 51 ++++------ cuML/src/glm/glm.hpp | 16 ++-- cuML/src/glm/ols.h | 39 ++++---- cuML/src/glm/ridge.h | 25 +++++ .../cuml/linear_model/linear_regression.pyx | 96 +++++++++++-------- python/cuml/linear_model/ridge.pyx | 37 +++++-- 6 files changed, 158 insertions(+), 106 deletions(-) diff --git a/cuML/src/glm/glm.cu b/cuML/src/glm/glm.cu index c0531f9b0e..f7ce075b0b 100644 --- a/cuML/src/glm/glm.cu +++ b/cuML/src/glm/glm.cu @@ -25,76 +25,61 @@ namespace GLM { using namespace MLCommon; -void olsFit(float *input, int n_rows, int n_cols, float *labels, float *coef, +void olsFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, float *labels, float *coef, float *intercept, bool fit_intercept, bool normalize, int algo) { - cumlHandle handle; + olsFit(handle.getImpl(), input, n_rows, n_cols, labels, coef, intercept, fit_intercept, normalize, handle.getStream(), algo); - ///@todo this should go away after cumlHandle exposure in the interface - CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); + } -void olsFit(double *input, int n_rows, int n_cols, double *labels, double *coef, +void olsFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, double *labels, double *coef, double *intercept, bool fit_intercept, bool normalize, int algo) { - cumlHandle handle; + olsFit(handle.getImpl(), input, n_rows, n_cols, labels, coef, intercept, fit_intercept, normalize, handle.getStream(), algo); - ///@todo this should go away after cumlHandle exposure in the interface - CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } -void olsPredict(const float *input, int n_rows, int n_cols, const float *coef, +void olsPredict(const cumlHandle &handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds) { - cumlHandle handle; - olsPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, + + olsPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, 
handle.getStream()); - ///@todo this should go away after cumlHandle exposure in the interface - CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } -void olsPredict(const double *input, int n_rows, int n_cols, const double *coef, +void olsPredict(const cumlHandle &handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds) { - cumlHandle handle; + olsPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, handle.getStream()); - ///@todo this should go away after cumlHandle exposure in the interface - CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } -void ridgeFit(float *input, int n_rows, int n_cols, float *labels, float *alpha, +void ridgeFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, float *labels, float *alpha, int n_alpha, float *coef, float *intercept, bool fit_intercept, bool normalize, int algo) { - cumlHandle handle; + ridgeFit(handle.getImpl(), input, n_rows, n_cols, labels, alpha, n_alpha, coef, intercept, fit_intercept, normalize, handle.getStream(), algo); - ///@todo this should go away after cumlHandle exposure in the interface - CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } -void ridgeFit(double *input, int n_rows, int n_cols, double *labels, +void ridgeFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, double *labels, double *alpha, int n_alpha, double *coef, double *intercept, bool fit_intercept, bool normalize, int algo) { - cumlHandle handle; + ridgeFit(handle.getImpl(), input, n_rows, n_cols, labels, alpha, n_alpha, coef, intercept, fit_intercept, normalize, handle.getStream(), algo); - ///@todo this should go away after cumlHandle exposure in the interface - CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } -void ridgePredict(const float *input, int n_rows, int n_cols, const float *coef, +void ridgePredict(const cumlHandle &handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, 
float *preds) { - cumlHandle handle; + ridgePredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, handle.getStream()); - ///@todo this should go away after cumlHandle exposure in the interface - CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } -void ridgePredict(const double *input, int n_rows, int n_cols, const double *coef, +void ridgePredict(const cumlHandle &handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds) { - cumlHandle handle; + ridgePredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, handle.getStream()); - ///@todo this should go away after cumlHandle exposure in the interface - CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } void qnFit(const cumlHandle &cuml_handle, float *X, float *y, int N, int D, diff --git a/cuML/src/glm/glm.hpp b/cuML/src/glm/glm.hpp index 4fc61785d5..f28352077b 100644 --- a/cuML/src/glm/glm.hpp +++ b/cuML/src/glm/glm.hpp @@ -32,9 +32,9 @@ namespace GLM { * @param algo specifies which solver to use (0: SVD, 1: Eigendecomposition, 2: QR-decomposition) * @{ */ -void olsFit(float *input, int n_rows, int n_cols, float *labels, float *coef, +void olsFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, float *labels, float *coef, float *intercept, bool fit_intercept, bool normalize, int algo = 0); -void olsFit(double *input, int n_rows, int n_cols, double *labels, double *coef, +void olsFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, double *labels, double *coef, double *intercept, bool fit_intercept, bool normalize, int algo = 0); /** @} */ @@ -54,11 +54,11 @@ void olsFit(double *input, int n_rows, int n_cols, double *labels, double *coef, * @param algo specifies which solver to use (0: SVD, 1: Eigendecomposition) * @{ */ -void ridgeFit(float *input, int n_rows, int n_cols, float *labels, float *alpha, +void ridgeFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, float 
*labels, float *alpha, int n_alpha, float *coef, float *intercept, bool fit_intercept, bool normalize, int algo = 0); -void ridgeFit(double *input, int n_rows, int n_cols, double *labels, +void ridgeFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, double *labels, double *alpha, int n_alpha, double *coef, double *intercept, bool fit_intercept, bool normalize, int algo = 0); /** @} */ @@ -73,15 +73,15 @@ void ridgeFit(double *input, int n_rows, int n_cols, double *labels, * @param preds device pointer to store predictions of size n_rows * @{ */ -void olsPredict(const float *input, int n_rows, int n_cols, const float *coef, +void olsPredict(const cumlHandle &handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds); -void olsPredict(const double *input, int n_rows, int n_cols, const double *coef, +void olsPredict(const cumlHandle &handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds); -void ridgePredict(const float *input, int n_rows, int n_cols, const float *coef, +void ridgePredict(const cumlHandle &handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds); -void ridgePredict(const double *input, int n_rows, int n_cols, +void ridgePredict(const cumlHandle &handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds); /** @} */ diff --git a/cuML/src/glm/ols.h b/cuML/src/glm/ols.h index 41c4cbfa68..89acb6af2b 100644 --- a/cuML/src/glm/ols.h +++ b/cuML/src/glm/ols.h @@ -27,22 +27,6 @@ #include #include #include -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - #include #include "preprocess.h" #include "common/cumlHandle.hpp" @@ -53,6 +37,19 @@ namespace GLM { using namespace MLCommon; +/** + * @defgroup Functions fit an ordinary least squares model + * @param input device pointer to feature matrix n_rows x n_cols + * @param n_rows number of rows of the feature matrix + * @param n_cols number of columns of the feature matrix + * @param labels device pointer to label vector of length n_rows + * @param coef device pointer to hold the solution for weights of size n_cols + * @param intercept device pointer to hold the solution for bias term of size 1 + * @param fit_intercept if true, fit intercept + * @param normalize if true, normalize data to zero mean, unit variance + * @param algo specifies which solver to use (0: SVD, 1: Eigendecomposition, 2: QR-decomposition) + * @{ + */ template void olsFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, math_t *labels, math_t *coef, math_t *intercept, bool fit_intercept, bool normalize, @@ -106,6 +103,16 @@ void olsFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols } +/** + * @defgroup Functions to make predictions with a fitted ordinary least squares and ridge regression model + * @param input device pointer to feature matrix n_rows x n_cols + * @param n_rows number of rows of the feature matrix + * @param n_cols number of columns of the feature matrix + * @param coef weights of the model + * @param intercept bias term of the model + * @param preds device pointer to store predictions of size n_rows + * @{ + */ 
template void olsPredict(const cumlHandle_impl& handle, const math_t *input, int n_rows, int n_cols, const math_t *coef, math_t intercept, math_t *preds, cudaStream_t stream) { diff --git a/cuML/src/glm/ridge.h b/cuML/src/glm/ridge.h index caf9fd0b13..5308e02e6f 100644 --- a/cuML/src/glm/ridge.h +++ b/cuML/src/glm/ridge.h @@ -125,6 +125,21 @@ void ridgeEig(const cumlHandle_impl& handle, math_t *A, int n_rows, int n_cols, CUDA_CHECK(cudaFree(S)); } +/** + * @defgroup Functions fit a ridge regression model (l2 regularized least squares) + * @param input device pointer to feature matrix n_rows x n_cols + * @param n_rows number of rows of the feature matrix + * @param n_cols number of columns of the feature matrix + * @param labels device pointer to label vector of length n_rows + * @param alpha device pointer to parameters of the l2 regularizer + * @param n_alpha number of regularization parameters + * @param coef device pointer to hold the solution for weights of size n_cols + * @param intercept device pointer to hold the solution for bias term of size 1 + * @param fit_intercept if true, fit intercept + * @param normalize if true, normalize data to zero mean, unit variance + * @param algo specifies which solver to use (0: SVD, 1: Eigendecomposition) + * @{ + */ template void ridgeFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, math_t *labels, math_t *alpha, int n_alpha, math_t *coef, math_t *intercept, @@ -184,6 +199,16 @@ void ridgeFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_co } +/** + * @defgroup Functions to make predictions with a fitted ordinary least squares and ridge regression model + * @param input device pointer to feature matrix n_rows x n_cols + * @param n_rows number of rows of the feature matrix + * @param n_cols number of columns of the feature matrix + * @param coef weights of the model + * @param intercept bias term of the model + * @param preds device pointer to store predictions of size n_rows + * 
@{ + */ template void ridgePredict(const cumlHandle_impl& handle, const math_t *input, int n_rows, int n_cols, const math_t *coef, math_t intercept, math_t *preds, diff --git a/python/cuml/linear_model/linear_regression.pyx b/python/cuml/linear_model/linear_regression.pyx index dd34fbc6d6..1fd9f72060 100644 --- a/python/cuml/linear_model/linear_regression.pyx +++ b/python/cuml/linear_model/linear_regression.pyx @@ -31,40 +31,45 @@ from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free from cuml.common.base import Base +from cuml.common.handle cimport cumlHandle cdef extern from "glm/glm.hpp" namespace "ML::GLM": - cdef void olsFit(float *input, + cdef void olsFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float *labels, float *coef, float *intercept, bool fit_intercept, - bool normalize, int algo) + bool normalize, int algo) except + - cdef void olsFit(double *input, + cdef void olsFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, double *coef, double *intercept, bool fit_intercept, - bool normalize, int algo) + bool normalize, int algo) except + - cdef void olsPredict(const float *input, + cdef void olsPredict(cumlHandle& handle, + const float *input, int n_rows, int n_cols, const float *coef, float intercept, - float *preds) + float *preds) except + - cdef void olsPredict(const double *input, + cdef void olsPredict(cumlHandle& handle, + const double *input, int n_rows, int n_cols, const double *coef, double intercept, - double *preds) + double *preds) except + class LinearRegression(Base): @@ -166,7 +171,7 @@ class LinearRegression(Base): # New link: https://github.com/rapidsai/cuml/blob/master/python/notebooks/linear_regression_demo.ipynb - def __init__(self, algorithm='eig', fit_intercept=True, normalize=False): + def __init__(self, algorithm='eig', fit_intercept=True, normalize=False, handle=None): """ Initializes the linear regression class. 
@@ -178,6 +183,7 @@ class LinearRegression(Base): normalize: boolean. For more information, see `scikitlearn's OLS `_. """ + super(LinearRegression, self).__init__(handle=handle, verbose=False) self.coef_ = None self.intercept_ = None self.fit_intercept = fit_intercept @@ -256,31 +262,38 @@ class LinearRegression(Base): cdef float c_intercept1 cdef double c_intercept2 + cdef cumlHandle* handle_ = self.handle.getHandle() + if self.gdf_datatype.type == np.float32: - olsFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept1, - self.fit_intercept, - self.normalize, - self.algo) + olsFit(handle_[0], + X_ptr, + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept1, + self.fit_intercept, + self.normalize, + self.algo) self.intercept_ = c_intercept1 else: - olsFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept2, - self.fit_intercept, - self.normalize, - self.algo) + olsFit(handle_[0], + X_ptr, + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept2, + self.fit_intercept, + self.normalize, + self.algo) self.intercept_ = c_intercept2 + + self.handle.sync() + return self @@ -322,21 +335,26 @@ class LinearRegression(Base): cdef uintptr_t coef_ptr = self._get_cudf_column_ptr(self.coef_) preds = cudf.Series(np.zeros(n_rows, dtype=pred_datatype)) cdef uintptr_t preds_ptr = self._get_cudf_column_ptr(preds) + cdef cumlHandle* handle_ = self.handle.getHandle() if pred_datatype.type == np.float32: - olsPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr) + olsPredict(handle_[0], + X_ptr, + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr) else: - olsPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr) + olsPredict(handle_[0], + X_ptr, + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr) + + self.handle.sync() del(X_m) diff --git a/python/cuml/linear_model/ridge.pyx b/python/cuml/linear_model/ridge.pyx index a45f58d7c4..8d5ec49070 100644 --- 
a/python/cuml/linear_model/ridge.pyx +++ b/python/cuml/linear_model/ridge.pyx @@ -31,10 +31,12 @@ from libc.stdlib cimport calloc, malloc, free from cuml.metrics.base import RegressorMixin from cuml.common.base import Base +from cuml.common.handle cimport cumlHandle cdef extern from "glm/glm.hpp" namespace "ML::GLM": - cdef void ridgeFit(float *input, + cdef void ridgeFit(cumlHandle& handle, + float *input, int n_rows, int n_cols, float *labels, @@ -46,7 +48,8 @@ cdef extern from "glm/glm.hpp" namespace "ML::GLM": bool normalize, int algo) - cdef void ridgeFit(double *input, + cdef void ridgeFit(cumlHandle& handle, + double *input, int n_rows, int n_cols, double *labels, @@ -58,14 +61,16 @@ cdef extern from "glm/glm.hpp" namespace "ML::GLM": bool normalize, int algo) - cdef void ridgePredict(const float *input, + cdef void ridgePredict(cumlHandle& handle, + const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds) - cdef void ridgePredict(const double *input, + cdef void ridgePredict(cumlHandle& handle, + const double *input, int n_rows, int n_cols, const double *coef, @@ -178,7 +183,7 @@ class Ridge(Base, RegressorMixin): # New link : https://github.com/rapidsai/notebooks/blob/master/cuml/ridge_regression_demo.ipynb - def __init__(self, alpha=1.0, solver='eig', fit_intercept=True, normalize=False): + def __init__(self, alpha=1.0, solver='eig', fit_intercept=True, normalize=False, handle=None): """ Initializes the linear ridge regression class. @@ -190,7 +195,8 @@ class Ridge(Base, RegressorMixin): normalize: boolean. For more information, see `scikitlearn's OLS `_. 
""" - # self._check_alpha(alpha) + self._check_alpha(alpha) + super(Ridge, self).__init__(handle=handle, verbose=False) self.alpha = alpha self.coef_ = None self.intercept_ = None @@ -290,9 +296,12 @@ class Ridge(Base, RegressorMixin): cdef double c_intercept2 cdef float c_alpha1 cdef double c_alpha2 + cdef cumlHandle* handle_ = self.handle.getHandle() + if self.gdf_datatype.type == np.float32: c_alpha1 = self.alpha - ridgeFit(X_ptr, + ridgeFit(handle_[0], + X_ptr, self.n_rows, self.n_cols, y_ptr, @@ -307,7 +316,8 @@ class Ridge(Base, RegressorMixin): self.intercept_ = c_intercept1 else: c_alpha2 = self.alpha - ridgeFit(X_ptr, + ridgeFit(handle_[0], + X_ptr, self.n_rows, self.n_cols, y_ptr, @@ -321,6 +331,8 @@ class Ridge(Base, RegressorMixin): self.intercept_ = c_intercept2 + self.handle.sync() + return self def predict(self, X): @@ -361,22 +373,27 @@ class Ridge(Base, RegressorMixin): cdef uintptr_t coef_ptr = self._get_column_ptr(self.coef_) preds = cudf.Series(np.zeros(n_rows, dtype=pred_datatype)) cdef uintptr_t preds_ptr = self._get_column_ptr(preds) + cdef cumlHandle* handle_ = self.handle.getHandle() if pred_datatype.type == np.float32: - ridgePredict(X_ptr, + ridgePredict(handle_[0], + X_ptr, n_rows, n_cols, coef_ptr, self.intercept_, preds_ptr) else: - ridgePredict(X_ptr, + ridgePredict(handle_[0], + X_ptr, n_rows, n_cols, coef_ptr, self.intercept_, preds_ptr) + self.handle.sync() + del(X_m) return preds From 9e5c62c5f53b35ddbd6baaa7f04e43b65466d5e1 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Thu, 9 May 2019 18:44:47 -0400 Subject: [PATCH 068/156] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 18d8e1ee80..b1b089227c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - PR #590: QN Recover from numeric errors - PR #482: Introduce cumlHandle for pca and tsvd - PR #573: Remove use of unnecessary cuDF column and series 
copies +- PR #596: Introduce cumlHandle for ols and ridge ## Bug Fixes - PR #584: Added missing virtual destructor to deviceAllocator and hostAllocator From d7445031f3d40f3987906b9578eda5591a41c720 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 17:51:34 -0500 Subject: [PATCH 069/156] DOC Updated cpp README file --- cpp/README.md | 57 +++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/cpp/README.md b/cpp/README.md index c008a6eca1..e26d22621f 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -1,44 +1,53 @@ -# cuML -This repo contains some of the ML algorithms. +# cuML C++ + +This folder contains the C++ and CUDA code of the algorithms and ML primitives of cuML. The build system uses CMake for build configuration, and an out-of-source build is recommended. # Setup ## Dependencies 1. zlib -2. cmake (>= 3.8 and <= 3.11.4, version 3.11.4 is recommended and there are some issues with version 3.12) -3. CUDA SDK (>= 9.2) -4. Cython (>= 0.28) -5. gcc (>=5.4.0) -6. nvcc (this comes with CUDA SDK) +2. cmake (>= 3.12.4) +3. CUDA (>= 9.2) +4. gcc (>=5.4.0) +5. BLAS - Any BLAS compatible with cmake's [FindBLAS](https://cmake.org/cmake/help/v3.12/module/FindBLAS.html). Note that the blas has to be installed to the same folder system as cmake, for example if using conda installed cmake, the blas implementation should also be installed in the conda environment. -### Building cuML: +## Building cuML: -cuML is implemented as header only C++/CUDA libraries for the developers who would like to call these APIs from their projects. You can build and run the Google tests if you are interested in helping us to improve these libraries. +The main artifact produced by the build system is the shared library libcuml++. Additionally, executables to run tests for the algorithms can be built. To see detailed steps see the [BUILD](../BUILD.md) document of the repository. 
-First, clone the cuML if you haven't cloned it yet. +Current cmake offers the following configuration options: -```bash -$ git clone --recursive git@github.com:rapidsai/cuml-alpha.git -``` +| Flag | Possible Values | Default Value | Behavior | +| --- | --- | --- | --- | +| BLAS_LIBRARIES | path/to/blas_lib | "" | Optional variable allowing to manually specify location of BLAS library. | +| BUILD_CUML_CPP_LIBRARY | [ON, OFF] | ON | Enable/disable building libcuml++ shared library. If either BUILD_CUML_TESTS or BUILD_CUML_MG_TESTS is set to ON, this variable is forced to be ON. | +| BUILD_CUML_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_test`. | +| BUILD_CUML_MG_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_mg_test`. | +| BUILD_PRIM_TESTS | [ON, OFF] | ON | Enable/disable building ML primitives test executable `prims_test`. | +| BUILD_CUML_EXAMPLES | [ON, OFF] | ON | Enable/disable building cuML C++ API usage examples. | +| CMAKE_CXX11_ABI | [ON, OFF] | ON | Enable/disable the GLIBCXX11 ABI | +| DISABLE_OPENMP | [ON, OFF] | OFF | Set to `ON` to disable OpenMP | +| GPU_ARCHS | List of GPU architectures, semicolon-separated | 60;70;75 | List of GPU architectures that all artifacts are compiled for. | +| KERNEL_INFO | [ON, OFF] | OFF | Enable/disable kernel resource usage info in nvcc. | +| LINE_INFO | [ON, OFF] | OFF | Enable/disable lineinfo in nvcc. | -To build ml-prims, in the main folder; +After running CMake in a `build` directory, if the `BUILD_*` options were not turned `OFF`, the following targets can be built: ```bash -$ cd cuML -$ mkdir build -$ cd build -$ cmake .. 
-$ make -j -$ ./ml_test +$ make -j # Build libcuml++ and all tests +$ make -j cuml++ # Build libcuml++ +$ make -j ml_test # Build ml_test algorithm tests binary +$ make -j ml_mg_test # Build ml_mg_test multi GPU algorithms tests binary +$ make -j prims_test # Build prims_test ML primitive unit tests binary ``` -## External +## Third Party Modules -The external folders inside cuML contain submodules that this project in-turn depends on. Appropriate location flags -will be automatically populated in the main CMakeLists.txt file for these. +The external folder contains submodules that cuML depends on. Current external submodules are: 1. [CUTLASS](https://github.com/NVIDIA/cutlass) 2. [CUB](https://github.com/NVlabs/cub) -3. [Google Test](https://github.com/google/googletest) +3. [Faiss] (https://github.com/facebookresearch/faiss) +4. [Google Test](https://github.com/google/googletest) From 9f9e3167ea3d2628d41596044554b35ce82d0966 Mon Sep 17 00:00:00 2001 From: Chirayu Date: Thu, 9 May 2019 15:52:17 -0700 Subject: [PATCH 070/156] Fix comments, add test case for skipped labels --- ml-prims/src/metrics/contingencyMatrix.h | 6 ++++-- ml-prims/test/contingencyMatrix.cu | 25 ++++++++++++++++++++---- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/ml-prims/src/metrics/contingencyMatrix.h b/ml-prims/src/metrics/contingencyMatrix.h index 5c08e62c78..b09277e67b 100644 --- a/ml-prims/src/metrics/contingencyMatrix.h +++ b/ml-prims/src/metrics/contingencyMatrix.h @@ -219,8 +219,8 @@ size_t getCMatrixWorkspaceSize(int nSamples, T* groundTruth, cudaStream_t stream } /** - * @brief contruct contingency matrix given input ground truth and prediction labels - * users should call function getInputClassCardinality to find and allocate memory for + * @brief contruct contingency matrix given input ground truth and prediction labels. + * Users should call function getInputClassCardinality to find and allocate memory for * output. 
Similarly workspace requirements should be checked using function getCMatrixWorkspaceSize * @param groundTruth: device 1-d array for ground truth (num of rows) * @param predictedLabel: device 1-d array for prediction (num of columns) @@ -243,6 +243,8 @@ void contingencyMatrix(T *groundTruth, T *predictedLabel, int nSamples, int *out // it is also assumed that true labels are monotically increasing with step count 1 // if for some reason groundTruth completely skips some labels // eg: {0,1,2,5} instead of {0,1,2,3} . Output matrix will still have empty rows for label value {3,4} + // Users can use "make_monotonic" ML_prim located at ml-prims/src/array/array.h + // to convert their discontinuous input label range to a monotonically increasing one // this also serves as way to measure co-occurence/joint counts for NLP tasks which // can be used to then compute pointwise mutual information and mutual information diff --git a/ml-prims/test/contingencyMatrix.cu b/ml-prims/test/contingencyMatrix.cu index 13fcfbf67b..28078655ed 100644 --- a/ml-prims/test/contingencyMatrix.cu +++ b/ml-prims/test/contingencyMatrix.cu @@ -29,6 +29,7 @@ struct contingencyMatrixParam { int minClass; int maxClass; bool calcCardinality; + bool skipLabels; float tolerance; }; @@ -51,6 +52,21 @@ protected: std::generate(y.begin(), y.end(), [&](){return intGenerator(dre); }); std::generate(y_hat.begin(), y_hat.end(), [&](){return intGenerator(dre); }); + if (params.skipLabels) { + // remove two label value from input arrays + int y1 = (upperLabelRange - lowerLabelRange) / 2; + int y2 = y1 + (upperLabelRange - lowerLabelRange) / 4; + + // replacement values + int y1_R = y1 + 1; + int y2_R = y2 + 1; + + std::replace(y.begin(), y.end(), y1, y1_R); + std::replace(y.begin(), y.end(), y2, y2_R); + std::replace(y_hat.begin(), y_hat.end(), y1, y1_R); + std::replace(y_hat.begin(), y_hat.end(), y2, y2_R); + } + numUniqueClasses = upperLabelRange - lowerLabelRange + 1; // generate golden output on CPU @@ 
-119,10 +135,11 @@ protected: }; const std::vector inputs = { - {10000, 1, 10, true, 0.000001}, - {100000, 1, 100, false, 0.000001}, - {1000000, 1, 1200, true, 0.000001}, - {1000000, 1, 10000, false, 0.000001} + {10000, 1, 10, true, false, 0.000001}, + {100000, 1, 100, false, false, 0.000001}, + {1000000, 1, 1200, true, false, 0.000001}, + {1000000, 1, 10000, false, false, 0.000001}, + {100000, 1, 100, false, true, 0.000001} }; typedef ContingencyMatrixTestImpl ContingencyMatrixTestImplS; From 1dd32ca29fa042ae009ae3d51b421e08ad5007b9 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 17:56:37 -0500 Subject: [PATCH 071/156] DOC Add source code explanation to README --- cpp/README.md | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/cpp/README.md b/cpp/README.md index e26d22621f..84ac1be6be 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -2,8 +2,17 @@ This folder contains the C++ and CUDA code of the algorithms and ML primitives of cuML. The build system uses CMake for build configuration, and an out-of-source build is recommended. -# Setup -## Dependencies +## Important Folders + +The source code of cuML is divided in two main files: `src` and `src_prims`. + +- `src` contains the source code of the Machine Learning algorithms, and the main cuML C++ API. The main consumable is the shared library `libcuml++`, that can be used stand alone by C++ consumers or is consumed by our Python package `cuml` to provide a Python API. +- `src_prims` contains most of the common components and computational primitives that form part of the machine learning algorithms in cuML, and can be used individually as well in the form of a header only library. + +The test folder has subfolders that reflect this distinction between the components of cuML. + +## Setup +### Dependencies 1. zlib 2. cmake (>= 3.12.4) @@ -11,7 +20,7 @@ This folder contains the C++ and CUDA code of the algorithms and ML primitives o 4. gcc (>=5.4.0) 5. 
BLAS - Any BLAS compatible with cmake's [FindBLAS](https://cmake.org/cmake/help/v3.12/module/FindBLAS.html). Note that the blas has to be installed to the same folder system as cmake, for example if using conda installed cmake, the blas implementation should also be installed in the conda environment. -## Building cuML: +### Building cuML: The main artifact produced by the build system is the shared library libcuml++. Additionally, executables to run tests for the algorithms can be built. To see detailed steps see the [BUILD](../BUILD.md) document of the repository. @@ -41,7 +50,7 @@ $ make -j ml_mg_test # Build ml_mg_test multi GPU algorithms tests binary $ make -j prims_test # Build prims_test ML primitive unit tests binary ``` -## Third Party Modules +### Third Party Modules The external folder contains submodules that cuML depends on. @@ -49,5 +58,5 @@ Current external submodules are: 1. [CUTLASS](https://github.com/NVIDIA/cutlass) 2. [CUB](https://github.com/NVlabs/cub) -3. [Faiss] (https://github.com/facebookresearch/faiss) +3. [Faiss](https://github.com/facebookresearch/faiss) 4. [Google Test](https://github.com/google/googletest) From 8460f40eb8626c93fbc23b80a2a193f775db1e7a Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 17:57:59 -0500 Subject: [PATCH 072/156] DOC Update subtitle --- cpp/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/README.md b/cpp/README.md index 84ac1be6be..617c448b8e 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -2,7 +2,7 @@ This folder contains the C++ and CUDA code of the algorithms and ML primitives of cuML. The build system uses CMake for build configuration, and an out-of-source build is recommended. -## Important Folders +## Source Code Folders The source code of cuML is divided in two main files: `src` and `src_prims`. 
From ae5d02642bed3ec05664dc8284d471b24c526218 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 18:19:23 -0500 Subject: [PATCH 073/156] FIX dependencies paths in cmake --- cpp/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cb3f6b7639..3a7c531bcb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -110,7 +110,7 @@ endif() ################################################################################################### # - External Dependencies-------------------------------------------------------------------------- -set(GTEST_DIR ${PROJECT_SOURCE_DIR}/external/ml-prims/external/googletest/googletest CACHE STRING +set(GTEST_DIR ${PROJECT_SOURCE_DIR}/external/googletest/googletest CACHE STRING "Path to the googletest repo") set(GTEST_LIBNAME "gtest_main" CACHE STRING @@ -122,10 +122,10 @@ set(FAISS_DIR ${PROJECT_SOURCE_DIR}/external/faiss CACHE STRING set(MLPRIMS_DIR ${PROJECT_SOURCE_DIR}/src_prims/ CACHE STRING "Path to the ml-prims repo") -set(CUB_DIR ${PROJECT_SOURCE_DIR}/external/ml-prims/external/cub CACHE STRING +set(CUB_DIR ${PROJECT_SOURCE_DIR}/external/cub CACHE STRING "Path to cub repo") -set(CUTLASS_DIR ${PROJECT_SOURCE_DIR}/external/ml-prims/external/cutlass CACHE STRING +set(CUTLASS_DIR ${PROJECT_SOURCE_DIR}/external/cutlass CACHE STRING "Path to the cutlass repo") set(CUDA_nvgraph_LIBRARY ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libnvgraph.so CACHE STRING From 8a8630cf63f46d6a6521e61421416280dc7becbf Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 18:32:13 -0500 Subject: [PATCH 074/156] FEA Updated GPU CI build.sh for new build --- ci/gpu/build.sh | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 162a606615..071e38d072 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018, NVIDIA 
CORPORATION. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. ######################################### # cuML GPU build and test script for CI # ######################################### @@ -64,21 +64,21 @@ fi ################################################################################ logger "Build libcuml..." -mkdir -p $WORKSPACE/cuML/build -cd $WORKSPACE/cuML/build +mkdir -p $WORKSPACE/cpp/build +cd $WORKSPACE/cpp/build logger "Run cmake libcuml..." cmake -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=ON -DBLAS_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a -DLAPACK_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a $GPU_ARCH .. logger "Clean up make..." make clean -logger "Make libcuml..." -make -j${PARALLEL_LEVEL} +logger "Make libcuml++ and algorithm tests..." +make -j${PARALLEL_LEVEL} cuml++ ml_test ml_mg_test -logger "Install libcuml..." +logger "Install libcuml++..." make -j${PARALLEL_LEVEL} install -logger "Build cuML..." +logger "Build cuml python package..." cd $WORKSPACE/python python setup.py build_ext --inplace @@ -91,12 +91,12 @@ logger "Check GPU usage..." nvidia-smi logger "GoogleTest for libcuml..." -cd $WORKSPACE/cuML/build +cd $WORKSPACE/cpp/build GTEST_OUTPUT="xml:${WORKSPACE}/test-results/libcuml_cpp/" ./ml_test -logger "Python py.test for cuML..." +logger "Python pytest for cuml..." cd $WORKSPACE/python -py.test --cache-clear --junitxml=${WORKSPACE}/junit-cuml.xml -v +pytest --cache-clear --junitxml=${WORKSPACE}/junit-cuml.xml -v ################################################################################ @@ -104,15 +104,13 @@ py.test --cache-clear --junitxml=${WORKSPACE}/junit-cuml.xml -v ################################################################################ logger "Build ml-prims tests..." -mkdir -p $WORKSPACE/ml-prims/build -cd $WORKSPACE/ml-prims/build -cmake $GPU_ARCH .. +cd $WORKSPACE/cpp/build logger "Clean up make..." make clean logger "Make ml-prims test..." 
-make -j${PARALLEL_LEVEL} +make -j${PARALLEL_LEVEL} prims_test logger "Run ml-prims test..." cd $WORKSPACE/ml-prims/build -GTEST_OUTPUT="xml:${WORKSPACE}/test-results/ml-prims/" ./test/mlcommon_test +GTEST_OUTPUT="xml:${WORKSPACE}/test-results/ml-prims/" ./test/prims_test From a6c3c427d434d9800346454033a940ac63b0bbc1 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 18:34:47 -0500 Subject: [PATCH 075/156] FEA Updated libcuml++ conda recipe --- conda/recipes/libcuml/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/recipes/libcuml/build.sh b/conda/recipes/libcuml/build.sh index 67f4d9adb0..be53837e8b 100644 --- a/conda/recipes/libcuml/build.sh +++ b/conda/recipes/libcuml/build.sh @@ -12,11 +12,11 @@ printenv # Cleanup local git git clean -xdf # Change directory for build process -cd cuML +cd cpp # Use CMake-based build procedure mkdir build cd build # configure cmake $CMAKE_COMMON_VARIABLES .. # build -make -j${PARALLEL_LEVEL} VERBOSE=1 install +make -j${PARALLEL_LEVEL} cuml++ VERBOSE=1 install From 7c1eae029163aeaddc059652469fd73ba1721701 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 18:37:29 -0500 Subject: [PATCH 076/156] FIX Remove commented code --- cpp/CMakeLists.txt | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3a7c531bcb..538e188dcf 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -90,23 +90,12 @@ else() message(FATAL_ERROR "ZLib not found, please check your settings.") endif(ZLIB_FOUND) -# set(CMAKE_THREAD_PREFER_PTHREAD TRUE) -# find_package (Threads REQUIRED) -# if(NOT CMAKE_USE_PTHREADS_INIT) -# message(FATAL_ERROR "pthreads not found, please check your settings") - if(NOT DEFINED BLAS_LIBRARIES) find_package( BLAS REQUIRED ) else() message(STATUS "Manually setting BLAS to ${BLAS_LIBRARIES}") endif() -# if(NOT DEFINED LAPACK_LIBRARIES) -# find_package( LAPACK REQUIRED ) -# else() -# message(STATUS 
"Manually setting LAPACK to ${LAPACK_LIBRARIES}") -# endif() - ################################################################################################### # - External Dependencies-------------------------------------------------------------------------- From b5d99e74fa54652a92e496976157dd69e96f66d4 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 18:40:15 -0500 Subject: [PATCH 077/156] FIX Remove remaining straggling files --- ml-prims/CMakeLists.txt | 116 ---------------------------------------- ml-prims/README.md | 56 ------------------- ml-prims/external | 1 - 3 files changed, 173 deletions(-) delete mode 100644 ml-prims/CMakeLists.txt delete mode 100644 ml-prims/README.md delete mode 120000 ml-prims/external diff --git a/ml-prims/CMakeLists.txt b/ml-prims/CMakeLists.txt deleted file mode 100644 index 504a9ff594..0000000000 --- a/ml-prims/CMakeLists.txt +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -cmake_minimum_required(VERSION 3.8 FATAL_ERROR) -project(mlcommon LANGUAGES CXX CUDA) -set(CMAKE_CXX_STANDARD 11) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(MLPRIMS_DIR ${PROJECT_SOURCE_DIR}) # Path to the root of ml-prims project -set(CMAKE_MODULE_PATH ${MLPRIMS_DIR}/cmake) - -find_package(CUDA 8.0 REQUIRED) -find_package(OpenMP REQUIRED) -# TODO: enable this when we are ready! 
-#find_package(ClangFormat REQUIRED) -#find_package(ClangTidy REQUIRED) - -find_package(ZLIB REQUIRED) -if(ZLIB_FOUND) - message(STATUS "ZLib found in ${ZLIB_INCLUDE_DIRS}") -else() - message(FATAL_ERROR "ZLib not found, please check your settings.") -endif(ZLIB_FOUND) - -# Submodules -set(GTEST_DIR ${PROJECT_SOURCE_DIR}/external/googletest CACHE STRING - "Path to the googletest repo") -set(GTEST_LIBNAME "gtest_main" CACHE STRING - "Name of the googletest library") -set(CUTLASS_DIR ${PROJECT_SOURCE_DIR}/external/cutlass CACHE STRING - "Path to the cutlass repo") -set(CUB_DIR ${PROJECT_SOURCE_DIR}/external/cub CACHE STRING - "Path to cub repo") - -# options exposed to users -set(GPU_ARCHS "" CACHE STRING - "List of GPU architectures (semicolon-separated) to be compiled for") -option(LINEINFO "Enable lineinfo in nvcc" OFF) -option(KERNELINFO "Enable kernel resource usage info" OFF) -option(DEBUG "Get a debug build" OFF) -option(TESTS "Enable running tests" ON) - -## start nvcc options -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fopenmp") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") -if(CMAKE_CXX_STANDARD STREQUAL "11") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --std=c++11") -endif() -if(LINEINFO) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") -endif() -if(KERNELINFO) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xptxas=-v") -endif() -if(DEBUG) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G -g") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g") -endif() -# Generate optimized binary for every known arch -if(NOT GPU_ARCHS) - set(GPU_ARCHS "60;61") - # NOTE: NOTE: Add more 'if's for every new arch release! 
- if((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9)) - set(GPU_ARCHS "${GPU_ARCHS};70") - endif() - if((CUDA_VERSION_MAJOR EQUAL 10) OR (CUDA_VERSION_MAJOR GREATER 10)) - set(GPU_ARCHS "${GPU_ARCHS};75") - endif() -endif() -foreach(arch ${GPU_ARCHS}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${arch},code=sm_${arch}") -endforeach() -# Generate PTX (to be JIT'd at runtime) for the latest architecture -# It is assumed that the last arch in the 'archs' is the latest! -list(GET GPU_ARCHS -1 ptx) -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${ptx},code=compute_${ptx}") -## end nvcc options - -set(MLPRIMS_LIBS - ${CUDA_cublas_LIBRARY} - ${CUDA_curand_LIBRARY} - ${CUDA_cusolver_LIBRARY} - ${CUDA_cusparse_LIBRARY} - pthread - ${ZLIB_LIBRARIES}) - -include_directories(src - ${CUTLASS_DIR} - ${CUB_DIR}) - -if(TESTS) - add_subdirectory(${GTEST_DIR}/googletest ${PROJECT_BINARY_DIR}/googletest) - add_subdirectory(test ${PROJECT_BINARY_DIR}/test) - - # formatting! - # add_clang_format( - # TARGETS mlcommon_test - # SRCS src test) -endif() - -include(cmake/doxygen.cmake) -add_doxygen_target(IN_DOXYFILE ${MLPRIMS_DIR}/Doxyfile.in - OUT_DOXYFILE ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile - CWD ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/ml-prims/README.md b/ml-prims/README.md deleted file mode 100644 index 4ce34a276b..0000000000 --- a/ml-prims/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# Introduction -This folder contains some of the common components and computational primitives -that form part of the machine learning algorithms in cuML, and can be used -individually as well in the form of a header only library. - -# Setup -## Dependencies (pre-requisites) -1. cmake (>= 3.12.4) -2. CUDA (>= 9.2) -3. doxygen (>= 1.8.11) (only required to build doxygen docs) -4. 
graphviz (>= 2.38.0) (only required to build doxygen docs) - -## Getting the ML primitives: -```bash -$ git clone --recursive https://github.com/rapidsai/cuml -``` -The primitives are contained in the `ml-prims` sub-folder. - -# Building tests -```bash -$ cd cuml/ml-prims -$ mkdir build -$ cd build -## build to specific GPU arch with -DGPU_ARCHS=70, to reduce compile time! -$ cmake .. -$ make -j -``` - -# Running tests -```bash -# build using above instructions -$ cd build -$ ./test/mlcommon_test -``` - -# Build doxygen docs -This needs doxygen and graphviz to be installed. -```bash -# build using above instructions -$ cd build -$ make doc -``` - -# External -The external folder inside ml-prims contains submodules that this project -depends on. Appropriate location flags for these dependencies will be -automatically populated in the main `CMakeLists.txt`. Current external -submodules are: -1. [CUTLASS](https://github.com/NVIDIA/cutlass) -2. [CUB](https://github.com/NVlabs/cub) -3. [Google Test](https://github.com/google/googletest) - -# Memory layout -Information about needed memory layout in current implementation: -1. Memory storage for matrix is dense, and in both column-major and row-major. Please see individual file/function documentation to see which format is needed for each case. -2. 
Matrix is densely packed without any LDA diff --git a/ml-prims/external b/ml-prims/external deleted file mode 120000 index 5ba3b03830..0000000000 --- a/ml-prims/external +++ /dev/null @@ -1 +0,0 @@ -../thirdparty/ml-prims/ \ No newline at end of file From 6ab9760ad6dfe3d4f18d449e8555dd9ddd942704 Mon Sep 17 00:00:00 2001 From: wxbn Date: Thu, 9 May 2019 23:23:15 +0000 Subject: [PATCH 078/156] Multiple fixes --- cuML/src/metrics/trustworthiness.cu | 150 +++++--- cuML/src/metrics/trustworthiness.h | 5 +- cuML/test/trustworthiness_test.cu | 434 ++++++++++++++++++++++++ python/cuml/metrics/trustworthiness.pyx | 61 +++- 4 files changed, 578 insertions(+), 72 deletions(-) create mode 100644 cuML/test/trustworthiness_test.cu diff --git a/cuML/src/metrics/trustworthiness.cu b/cuML/src/metrics/trustworthiness.cu index 89e1f8e5c4..1d43dedcb6 100644 --- a/cuML/src/metrics/trustworthiness.cu +++ b/cuML/src/metrics/trustworthiness.cu @@ -19,9 +19,10 @@ #include "distance/distance.h" #include #include -#include "../knn/knn.h" +#include using namespace MLCommon; +using namespace MLCommon::Distance; using namespace MLCommon::Selection; using namespace ML; @@ -39,24 +40,53 @@ namespace ML { int d, int n_neighbors) { cudaStream_t stream = h.getStream(); - auto alloc = h.getHostAllocator(); + auto d_alloc = h.getDeviceAllocator(); - long* d_pred_I; - math_t* d_pred_D; - allocate(d_pred_I, n * n_neighbors); - allocate(d_pred_D, n * n_neighbors); + long* d_pred_I = (long*)d_alloc->allocate(n * n_neighbors * sizeof(long), stream); + math_t* d_pred_D = (math_t*)d_alloc->allocate(n * n_neighbors * sizeof(math_t), stream); kNNParams params = {input, n}; kNN knn(d); knn.fit(¶ms, 1); knn.search(input, n, d_pred_I, d_pred_D, n_neighbors); - long* h_pred_I = (long*)alloc->allocate(n * n_neighbors * sizeof(long), stream); - updateHost(h_pred_I, d_pred_I, n * n_neighbors, stream); - - CUDA_CHECK(cudaFree(d_pred_I)); CUDA_CHECK(cudaFree(d_pred_D)); - return h_pred_I; + return d_pred_I; + } + + + 
/** + * @brief Compute a the rank of trustworthiness score + * @input param ind_X: indexes given by pairwise distance and sorting + * @input param ind_X_embedded: indexes given by KNN + * @input param n: Number of samples + * @input param n_neighbors: Number of neighbors considered by trustworthiness score + * @input param work: Batch to consider (to do it at once use n * n_neighbors) + * @output param rank: Resulting rank + */ + template + __global__ void compute_rank(math_t *ind_X, long *ind_X_embedded, + int n, int n_neighbors, int work, double * rank) + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= work) + return; + + int n_idx = i / n_neighbors; + int nn_idx = (i % n_neighbors) + 1; + + int idx = ind_X_embedded[n_idx * (n_neighbors+1) + nn_idx]; + math_t* sample_i = &ind_X[n_idx * n]; + for (int r = 1; r < n; r++) + { + if (sample_i[r] == idx) + { + int tmp = r - n_neighbors; + if (tmp > 0) + atomicAdd(rank, tmp); + break; + } + } } namespace Metrics { @@ -69,27 +99,32 @@ namespace ML { * @input param m: Number of features in high/original dimension * @input param d: Number of features in low/embedded dimension * @input param n_neighbors: Number of neighbors considered by trustworthiness score + * @input param distance_type: Distance type to consider * @return Trustworthiness score */ - template + template double trustworthiness_score(const cumlHandle& h, math_t* X, - math_t* X_embedded, int n, int m, int d, int n_neighbors) + math_t* X_embedded, int n, int m, int d, + int n_neighbors) { const int TMP_SIZE = MAX_BATCH_SIZE * n; cudaStream_t stream = h.getStream(); - auto alloc = h.getHostAllocator(); + auto d_alloc = h.getDeviceAllocator(); - constexpr auto distance_type = MLCommon::Distance::DistanceType::EucUnexpandedL2Sqrt; - size_t workspaceSize = 0; // EucUnexpandedL2Sqrt does not need any workspace + size_t workspaceSize = 0; // EucUnexpandedL2Sqrt does not reauire workspace (may need change for other distances) typedef cutlass::Shape<8, 
128, 128> OutputTile_t; bool bAllocWorkspace = false; - math_t* d_pdist_tmp; - allocate(d_pdist_tmp, TMP_SIZE); - int* d_ind_X_tmp; - allocate(d_ind_X_tmp, TMP_SIZE); - int* h_ind_X = (int*)alloc->allocate(n * n * sizeof(int), stream); + math_t* d_pdist_tmp = (math_t*)d_alloc->allocate(TMP_SIZE * sizeof(math_t), stream); + int* d_ind_X_tmp = (int*)d_alloc->allocate(TMP_SIZE * sizeof(int), stream); + + long* ind_X_embedded = get_knn_indexes(h, X_embedded, + n, d, n_neighbors + 1); + + double t_tmp = 0.0; + double t = 0.0; + double* d_t = (double*)d_alloc->allocate(sizeof(double), stream); int toDo = n; while (toDo > 0) @@ -97,8 +132,7 @@ namespace ML { int batchSize = min(toDo, MAX_BATCH_SIZE); // Takes at most MAX_BATCH_SIZE vectors at a time - MLCommon::Distance::distance + distance (&X[(n - toDo) * m], X, d_pdist_tmp, batchSize, n, m, @@ -113,50 +147,56 @@ namespace ML { stream); CUDA_CHECK(cudaPeekAtLastError()); - updateHost(&h_ind_X[(n - toDo) * n], d_ind_X_tmp, - batchSize * n, stream); + t_tmp = 0.0; + updateDevice(d_t, &t_tmp, 1, stream); + + int work = batchSize * n_neighbors; + int n_blocks = work / N_THREADS + 1; + compute_rank<<>>(d_ind_X_tmp, + &ind_X_embedded[(n - toDo) * (n_neighbors+1)], + n, + n_neighbors, + batchSize * n_neighbors, + d_t); + CUDA_CHECK(cudaPeekAtLastError()); - toDo -= batchSize; - } + updateHost(&t_tmp, d_t, 1, stream); + t += t_tmp; - long* ind_X_embedded = get_knn_indexes(h, X_embedded, - n, d, n_neighbors + 1); - - double t = 0.0; - for (size_t i = 0; i < n; i++) - { - int* sample_i = &h_ind_X[i * n]; - for (size_t j = 1; j <= n_neighbors; j++) - { - long idx = ind_X_embedded[i * (n_neighbors+1) + j]; - for (int r = 1; r < n; r++) - { - if (sample_i[r] == idx) - { - int tmp = r - n_neighbors; - if (tmp > 0) - t += tmp; - break; - } - } - } + toDo -= batchSize; } - alloc->deallocate(h_ind_X, n * n * sizeof(int), stream); - alloc->deallocate(ind_X_embedded, n * (n_neighbors + 1) * sizeof(long), stream); - t = 1.0 - ((2.0 / ((n * 
n_neighbors) * ((2.0 * n) - (3.0 * n_neighbors) - 1.0))) * t); + d_alloc->deallocate(ind_X_embedded, n * (n_neighbors + 1) * sizeof(long), stream); + d_alloc->deallocate(d_pdist_tmp, TMP_SIZE * sizeof(math_t), stream); + d_alloc->deallocate(d_ind_X_tmp, TMP_SIZE * sizeof(int), stream); + d_alloc->deallocate(d_t, sizeof(double), stream); + return t; } + template + double trustworthiness_score(const cumlHandle& h, math_t* X, + math_t* X_embedded, int n, int m, int d, + int n_neighbors, int metric) + { + DistanceType distance_type = DistanceType(metric); + if (distance_type == EucUnexpandedL2Sqrt) + { + return trustworthiness_score(h, + X, X_embedded, n, m, d, n_neighbors); + } + + std::ostringstream msg; + msg << "Unknown metric" << std::endl; + throw MLCommon::Exception(msg.str()); + } template double trustworthiness_score(const cumlHandle& h, - float* X, float* X_embedded, int n, int m, int d, int n_neighbors); - // template double trustworthiness_score(const cumlHandle& h, - // double* X, double* X_embedded, int n, int m, int d, int n_neighbors); - // Disabled for now as knn only takes floats + float* X, float* X_embedded, int n, int m, int d, + int n_neighbors, int metric); } } \ No newline at end of file diff --git a/cuML/src/metrics/trustworthiness.h b/cuML/src/metrics/trustworthiness.h index c909f5f1b9..51ce8c471d 100644 --- a/cuML/src/metrics/trustworthiness.h +++ b/cuML/src/metrics/trustworthiness.h @@ -19,12 +19,15 @@ #include #define MAX_BATCH_SIZE 512 +#define N_THREADS 512 namespace ML { namespace Metrics { template - double trustworthiness_score(const cumlHandle& h, math_t* X, math_t* X_embedded, int n, int m, int d, int n_neighbors); + double trustworthiness_score(const cumlHandle& h, math_t* X, + math_t* X_embedded, int n, int m, int d, + int n_neighbors, int metric); } } \ No newline at end of file diff --git a/cuML/test/trustworthiness_test.cu b/cuML/test/trustworthiness_test.cu new file mode 100644 index 0000000000..5434f07c55 --- /dev/null +++ 
b/cuML/test/trustworthiness_test.cu @@ -0,0 +1,434 @@ +/* + * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +using namespace MLCommon; +using namespace ML::Metrics; + +class TrustworthinessScoreTest: public ::testing::Test { +protected: + void basicTest() { + std::vector X = { + 5.6142087,8.59787,-4.382763,-3.6452143,-5.8816037, + -0.6330313,4.6920023,-0.79210913,0.6106314,2.1210914, + 5.919943,-8.43784,-6.4819884,0.41001374,-6.1052523, + -4.0825715,-5.314755,-2.834671,5.751696,-6.5012555, + -0.4719201,-7.53353,7.6789393,-1.4959852,-5.5977287, + -9.564147,1.2902534,3.559834,-6.7659483,8.265964, + 4.595404,9.133477,-6.1553917,-6.319754,-2.9039452, + 4.4150834,-3.094395,-4.426273,9.584571,-5.64133, + 6.6209483,7.4044604,3.9620576,5.639907,10.33007, + -0.8792053,5.143776,-7.464049,1.2448754,-5.6300974, + 5.4518576,4.119535,6.749645,7.627064,-7.2298336, + 1.9681473,-6.9083176,6.404673,0.07186685,9.0994835, + 8.51037,-8.986389,0.40534487,2.115397,4.086756, + 1.2284287,-2.6272132,0.06527536,-9.587425,-7.206078, + 7.864875,7.4397306,-6.9233336,-2.6643622,3.3466153, + 7.0408177,-3.6069896,-9.971769,4.4075623,7.9063697, + 2.559074,4.323717,1.6867131,-1.1576937,-9.893141, + -3.251416,-7.4889135,-4.0588717,-2.73338,-7.4852257, + 3.4460473,9.759119,-5.4680476,-4.722435,-8.032619, + -1.4598992,4.227361,3.135568,1.1950601,1.1982028, + 
6.998856,-6.131138,-6.6921015,0.5361224,-7.1213965, + -5.6104236,-7.2212887,-2.2710054,8.544764,-6.0254574, + 1.4582269,-5.5587835,8.031556,-0.26328218,-5.2591386, + -9.262641,2.8691363,5.299787,-9.209455,8.523085, + 5.180329,10.655528,-5.7171874,-6.7739563,-3.6306462, + 4.067106,-1.5912259,-3.2345476,8.042973,-3.6364832, + 4.1242137,9.886953,5.4743724,6.3058076,9.369645, + -0.5175337,4.9859877,-7.879498,1.358422,-4.147944, + 3.8984218,5.894656,6.4903927,8.702036,-8.023722, + 2.802145,-7.748032,5.8461113,-0.34215945,11.298865, + 1.4107164,-9.949621,-1.6257563,-10.655836,2.4528909, + 1.1570255,5.170669,2.8398793,7.1838694,9.088459, + 2.631155,3.964414,2.8769252,0.04198391,-0.16993195, + 3.6747139,-2.8377378,6.1782537,10.759618,-4.5642614, + -8.522967,0.8614642,6.623416,-1.029324,5.5488334, + -7.804511,2.128833,7.9042315,7.789576,-2.7944536, + 0.72271067,-10.511495,-0.78634536,-10.661714,2.9376361, + 1.9148129,6.22859,0.26264945,8.028384,6.8743043, + 0.9351067,7.0690722,4.2846055,1.4134506,-0.18144785, + 5.2778087,-1.7140163,9.217541,8.602799,-2.6537218, + -7.8377395,1.1244944,5.4540544,-0.38506773,3.9885726, + -10.76455,1.4440702,9.136163,6.664117,-5.7046547, + 8.038592,-9.229767,-0.2799413,3.6064725,4.187257, + 1.0516582,-2.0707326,-0.7615968,-8.561018,-3.7831352, + 10.300297,5.332594,-6.5880876,-4.2508664,1.7985519, + 5.7226253,-4.1223383,-9.6697855,1.4885283,7.524974, + 1.7206005,4.890457,3.7264557,0.4428284,-9.922455, + -4.250455,-6.4410596,-2.107994,-1.4109765,-6.1325397, + 0.32883006,6.0489736,7.7257385,-8.281174,1.0129383, + -10.792166,8.378851,10.802716,9.848448,-9.188757, + 1.3151443,1.9971865,-2.521849,4.3268294,-7.775683, + -2.2902298,3.0824065,-7.17559,9.6100855,7.3965735, + -10.476525,5.895973,-3.6974669,-7.6688933,1.7354839, + -7.4045196,-1.7992063,-4.0394845,5.2471714,-2.250571, + 2.528036,-8.343515,-2.2374575,-10.019771,0.73371273, + 3.1853926,2.7994921,2.6637669,7.620401,7.515571, + 0.68636256,5.834537,4.650282,-1.0362619,0.4461701, + 
3.7870514,-4.1340904,7.202998,9.736904,-3.005512, + -8.920467,1.1228397,6.2598724,1.2812365,4.5442104, + -8.791537,0.92113096,8.464749,8.359035,-4.3923397, + 1.2252625,-10.1986475,-1.4409319,-10.013967,3.9071581, + 1.683064,4.877419,1.6570637,9.559105,7.3546534, + 0.36635467,5.220211,4.6303267,0.6601065,0.16149978, + 3.8818731,-3.4438233,8.42085,8.659159,-3.0935583, + -8.039611,2.3060374,5.134666,1.0458113,6.0190983, + -9.143728,0.99048865,9.210842,6.670241,-5.9614363, + 0.8747396,7.078824,8.067469,-10.314754,0.45977542, + -9.28306,9.1838665,9.318644,7.189082,-11.092555, + 1.0320464,3.882163,0.10953151,7.9029684,-6.9068265, + -1.3526366,5.3996363,-8.430931,11.452577,6.39663, + -11.090514,4.6662245,-3.1268113,-8.357452,2.2276728, + -10.357126,-0.9291848,-3.4193344,3.1289792,-2.5030103, + 6.772719,11.457757,-4.2125936,-6.684548,-4.7611327, + 3.6960156,-2.3030636,-3.0591488,10.452471,-4.1267314, + 5.66614,7.501461,5.072407,6.636537,8.990381, + -0.2559256,4.737867,-6.2149944,2.535682,-5.5484023, + 5.7113924,3.4742818,7.9915137,7.0052586,-7.156467, + 1.4354781,-8.286235,5.7523417,-2.4175215,9.678009, + 0.05066403,-9.645226,-2.2658763,-9.518178,4.493372, + 2.3232365,2.1659086,0.42507997,8.360246,8.23535, + 2.6878164,5.236947,3.4924245,-0.6089895,0.8884741, + 4.359464,-4.6073823,7.83441,8.958755,-3.4690795, + -9.182282,1.2478025,5.6311107,-1.2408862,3.6316886, + -8.684654,2.1078515,7.2813864,7.9265943,-3.6135032, + 0.4571511,8.493568,10.496853,-7.432897,0.8625995, + -9.607528,7.2899456,8.83158,8.908199,-10.300263, + 1.1451302,3.7871468,-0.97040755,5.7664757,-8.9688, + -2.146672,5.9641485,-6.2908535,10.126465,6.1553903, + -12.066902,6.301596,-5.0419583,-8.228695,2.4879954, + -8.918582,-3.7434099,-4.1593685,3.7431836,-1.1704745, + 0.5524103,9.109399,9.571567,-11.209955,1.2462777, + -9.554555,9.091726,11.477966,7.630937,-10.450911, + 1.9205878,5.358983,-0.44546837,6.7611346,-9.74753, + -0.5939732,3.8892255,-6.437991,10.294727,5.6723895, + 
-10.7883,6.192348,-5.293862,-10.811491,1.0194173, + -7.074576,-3.192368,-2.5231771,4.2791643,-0.53309685, + 0.501366,9.636625,7.710316,-6.4219728,1.0975566, + -8.218886,6.9011984,9.873679,8.903804,-9.316832, + 1.2404599,4.9039655,1.2272617,4.541515,-5.2753224, + -3.2196746,3.1303136,-7.285681,9.041425,5.6417427, + -9.93667,5.7548947,-5.113397,-8.544622,4.182665, + -7.7709813,-3.2810235,-3.312072,3.8900535,-2.0604856, + 6.709082,-8.461194,1.2666026,4.8770437,2.6955879, + 3.0340345,-1.1614609,-3.536341,-7.090382,-5.36146, + 9.072544,6.4554095,-4.4728956,-1.88395,3.1095037, + 8.782348,-3.316743,-8.65248,1.6802986,8.186188, + 2.1783829,4.931278,4.158475,1.4033595,-11.320101, + -3.7084908,-6.740436,-2.5555193,-1.0451177,-6.5569925, + 0.82810307,8.505919,8.332857,-9.488569,-0.21588463, + -8.056692,8.493993,7.6401625,8.812983,-9.377281, + 2.4369764,3.1766508,0.6300803,5.6666765,-7.913654, + -0.42301777,4.506412,-7.8954244,10.904591,5.042256, + -9.626183,8.347351,-3.605006,-7.923387,1.1024277, + -8.705793,-2.5151258,-2.5066147,4.0515003,-2.060757, + 6.2635093,8.286584,-6.0509276,-6.76452,-3.1158175, + 1.6578803,-1.4608748,-1.24211,8.151246,-4.2970877, + 6.093071,7.4911637,4.51018,4.8425875,9.211085, + -2.4386222,4.5830803,-5.6079445,2.3713675,-4.0707507, + 3.1787417,5.462342,6.915912,6.3928423,-7.2970796, + 5.0112796,-9.140893,4.9990606,0.38391754,7.7088532, + 1.9340848,8.18833,8.16617,-9.42086,-0.3388326, + -9.659727,8.243045,8.099073,8.439428,-7.038694, + 2.1077902,3.3866816,-1.9975324,7.4972878,-7.2525196, + -1.553731,4.08758,-6.6922374,9.50525,4.026735, + -9.243538,7.2740564,-3.9319072,-6.3228955,1.6693478, + -7.923119,-3.7423058,-2.2813146,5.3469067,-1.8285407, + 3.3118162,8.826356,-4.4641976,-6.4751124,-9.200089, + -2.519147,4.225298,2.4105988,-0.4344186,0.53441775, + 5.2836394,-8.2816105,-4.996147,-1.6870759,-7.8543897, + -3.9788852,-7.0346904,-3.1289773,7.4567637,-5.6227813, + 1.0709786,-8.866012,8.427324,-1.1755563,-5.789216, + 
-8.197835,5.3342214,6.0646234,-6.8975716,7.717031, + 3.480355,8.312151,-3.6645212,-3.0976524,-8.090359, + -1.9176173,2.4257212,1.9700835,0.4098958,2.1341088, + 7.652741,-9.9595585,-5.989757,0.10119354,-7.935407, + -5.792786,-5.22783,-4.318978,5.414037,-6.4621663, + 1.670883,-6.9224787,8.696932,-2.0214002,-6.6681314, + -8.326418,4.9049683,5.4442496,-6.403739,7.5822453, + 7.0972915,-9.072851,-0.23897195,1.7662339,5.3096304, + 1.983179,-2.222645,-0.34700772,-9.094717,-6.107907, + 9.525174,8.1550665,-5.6940084,-4.1636486,1.7360662, + 8.528821,-3.7299833,-9.341266,2.608542,9.108706, + 0.7978509,4.2488184,2.454484,0.9446999,-10.106636, + -3.8973773,-6.6566644,-4.5647273,-0.99837756,-6.568582, + 9.324853,-7.9020953,2.0910501,2.2896829,1.6790711, + 1.3159255,-3.5258796,1.8898442,-8.105812,-4.924962, + 8.771129,7.1202874,-5.991957,-3.4106019,2.4450088, + 7.796387,-3.055946,-7.8971434,1.9856719,9.001636, + 1.8511922,3.019749,3.1227696,0.4822102,-10.021213, + -3.530504,-6.225959,-3.0029628,-1.7881511,-7.3879776, + 1.3925704,9.499782,-3.7318087,-3.7074296,-7.7466836, + -1.5284524,4.0535855,3.112011,0.10340207,-0.5429599, + 6.67026,-9.155924,-4.924038,0.64248866,-10.0103655, + -3.2742946,-4.850029,-3.6707063,8.586258,-5.855605, + 4.906918,-6.7813993,7.9938135,-2.5473144,-5.688948, + -7.822478,2.1421318,4.66659,-9.701272,9.549149, + 0.8998125,-8.651497,-0.56899565,-8.639817,2.3088377, + 2.1264515,3.2764478,2.341989,8.594338,8.630639, + 2.8440373,6.2043204,4.433932,0.6320018,-1.8179281, + 5.09452,-1.5741565,8.153934,8.744339,-3.6945698, + -8.883078,1.5329908,5.2745943,0.44716078,4.8809066, + -7.9594903,1.134374,9.233994,6.5528665,-4.520542, + 9.477355,-8.622195,-0.23191702,2.0485356,3.9379985, + 1.5916302,-1.4516805,-0.0843819,-7.8554378,-5.88308, + 7.999766,6.2572145,-5.585321,-4.0097756,0.42382592, + 6.160884,-3.631315,-8.333449,2.770595,7.8495173, + 3.3331623,4.940415,3.6207345,-0.037517,-11.034698, + -3.185103,-6.614664,-3.2177854,-2.0792234,-6.8879867, + 
7.821685,-8.455084,1.0784642,4.0033927,2.7343264, + 2.6052725,-4.1224284,-0.89305353,-6.8267674,-4.9715133, + 8.880253,5.6994023,-5.9695024,-4.9181266,1.3017995, + 7.972617,-3.9452884,-10.424556,2.4504194,6.21529, + 0.93840516,4.2070026,6.159839,0.91979957,-8.706724, + -4.317946,-6.6823545,-3.0388,-2.464262,-7.3716645, + 1.3926703,6.544412,-5.6251183,-5.122411,-8.622049, + -2.3905911,3.9138813,1.9779967,-0.05011125,0.13310997, + 7.229751,-9.742043,-8.08724,1.2426697,-7.9230795, + -3.3162494,-7.129571,-3.5488048,7.4701195,-5.2357526, + 0.5917681,-6.272206,6.342328,-2.909731,-4.991607, + -8.845513,3.3228495,7.033246,-7.8180246,8.214469, + 6.3910093,9.185153,-6.20472,-7.713809,-3.8481297, + 3.5579286,0.7078448,-3.2893546,7.384514,-4.448121, + 3.0104196,9.492943,8.024847,4.9114385,9.965594, + -3.014036,5.182494,-5.8806014,2.5312455,-5.9926524, + 4.474469,6.3717875,6.993105,6.493093,-8.935534, + 3.004074,-8.055647,8.315765,-1.3026813,8.250377, + 0.02606229,6.8508425,9.655665,-7.0116496,-0.41060972, + -10.049198,7.897801,6.7791023,8.3362,-9.821014, + 2.491157,3.5160472,-1.6228812,7.398063,-8.769123, + -3.1743705,3.2827861,-6.497855,10.831924,5.2761307, + -9.704417,4.3817043,-3.9841619,-8.111647,1.1883026, + -8.115312,-2.9240117,-5.8879666,4.20928,-0.3587938, + 6.935672,-10.177582,0.48819053,3.1250648,2.9306343, + 3.082544,-3.477687,-1.3768549,-7.4922366,-3.756631, + 10.039836,3.6670392,-5.9761434,-4.4728765,3.244255, + 7.027899,-2.3806512,-10.4100685,1.605716,7.7953773, + 0.5408159,1.7156523,3.824097,-1.0604783,-10.142124, + -5.246805,-6.5283823,-4.579547,-2.42714,-6.709197, + 2.7782338,7.33353,-6.454507,-2.9929368,-7.8362985, + -2.695445,2.4900775,1.6682367,0.4641757,-1.0495365, + 6.9631333,-9.291356,-8.23837,-0.34263706,-8.275113, + -2.8454232,-5.0864096,-2.681942,7.5450225,-6.2517986, + 0.06810654,-6.470652,4.9042645,-1.8369255,-6.6937943, + -7.9625087,2.8510258,6.180508,-8.282598,7.919079, + 1.4897474,6.7217417,-4.2459426,-4.114431,-8.375707, + 
-2.143264,5.6972933,1.5574739,0.39375135,1.7930849, + 5.1737595,-7.826241,-5.160268,-0.80433255,-7.839536, + -5.2620406,-5.4643164,-3.185536,6.620315,-7.065227, + 1.0524757,-6.125088,5.7126627,-1.6161644,-3.852159, + -9.164279,2.7005782,5.946544,-8.468236,8.2145405, + 1.1035942,6.590157,-4.0461283,-4.8090615,-7.6702685, + -2.1121511,5.1147075,1.6128504,2.0064135,1.0544407, + 6.0038295,-7.8282537,-4.801278,0.32349443,-8.0649805, + -4.372714,-5.61336,-5.21394,8.176595,-5.4753284, + 1.7800134,-8.267283,7.2133374,-0.16594432,-6.317046, + -9.490406,4.1261597,5.473317,-7.7551675,7.007468, + 7.478628,-8.801905,0.10975724,3.5478222,4.797803, + 1.3825226,-3.357369,0.99262005,-6.94877,-5.4781394, + 9.632604,5.7492557,-5.9014316,-3.1632116,2.340859, + 8.708098,-3.1255999,-8.848661,4.5612836,8.455157, + 0.73460823,4.112301,4.392744,-0.30759293,-6.8036823, + -3.0331545,-8.269506,-2.82415,-0.9411246,-5.993506, + 2.1618164,-8.716055,-0.7432543,-10.255819,3.095418, + 2.5131428,4.752442,0.9907621,7.8279433,7.85814, + 0.50430876,5.2840405,4.457291,0.03330028,-0.40692952, + 3.9244103,-2.117118,7.6977615,8.759009,-4.2157164, + -9.136053,3.247858,4.668686,0.76162136,5.3833632, + -9.231471,0.44309422,8.380872,6.7211227,-3.091507, + 2.173508,-9.038242,-1.3666698,-9.819077,0.37825826, + 2.3898845,4.2440815,1.9161536,7.24787,6.9124637, + 1.6238527,5.1140285,3.1935842,1.02845,-1.1273454, + 5.638998,-2.497932,8.342559,8.586319,-2.9069402, + -7.6387944,3.5975037,4.4115705,0.41506064,4.9078383, + -9.68327,1.8159529,9.744613,8.40622,-4.495336, + 9.244892,-8.789869,1.3158468,4.018167,3.3922846, + 2.652022,-2.7495477,0.2528986,-8.268324,-6.004913, + 10.428784,6.6580734,-5.537176,-1.7177434,2.7504628, + 6.7735,-2.4454272,-9.998361,2.9483433,6.8266654, + 2.3787718,4.472637,2.5871701,0.7355365,-7.7027745, + -4.1879907,-7.172832,-4.1843605,-0.03646783,-5.419406, + 6.958486,11.011111,-7.1821184,-7.956423,-3.408451, + 4.6850276,-2.348787,-4.398289,6.9787564,-3.8324208, + 
5.967827,8.433518,4.660108,5.5657144,9.964243, + -1.3515275,6.404833,-6.4805903,2.4379845,-6.0816774, + 1.752272,5.3771873,6.9613523,6.9788294,-6.3894596, + 3.7521114,-6.8034263,6.4458385,-0.7233525,10.512529, + 4.362273,9.231461,-6.3382263,-7.659,-3.461823, + 4.71463,0.17817476,-3.685746,7.2962036,-4.6489477, + 5.218017,11.546999,4.7218375,6.8498397,9.281103, + -3.900459,6.844054,-7.0886965,-0.05019227,-8.233724, + 5.5808983,6.374517,8.321048,7.969449,-7.3478637, + 1.4917561,-8.003144,4.780668,-1.1981848,7.753739, + 2.0260844,-8.880096,-3.4258451,-7.141975,1.9637157, + 1.814725,5.311151,1.4831505,7.8483663,7.257948, + 1.395786,6.417756,5.376912,0.59505713,0.00062552, + 3.6634305,-4.159713,7.3571978,10.966816,-2.5419605, + -8.466229,1.904205,5.6338267,-0.52567476,5.59736, + -8.361799,0.5009981,8.460681,7.3891273,-3.5272243, + 5.0552278,9.921456,-7.69693,-7.286378,-1.9198836, + 3.1666567,-2.5832257,-2.2445817,9.888111,-5.076563, + 5.677401,7.497946,5.662994,5.414262,8.566503, + -2.5530663,7.1032815,-6.0612082,1.3419591,-4.9595256, + 4.3377542,4.3790717,6.793512,8.383502,-7.1278043, + 3.3240774,-9.379446,6.838661,-0.81241214,8.694813, + 0.79141915,7.632467,8.575382,-8.533798,0.28954387, + -7.5675836,5.8653326,8.97235,7.1649346,-10.575289, + 0.9359381,5.02381,-0.5609511,5.543464,-7.69131, + -2.1792977,2.4729247,-6.1917787,10.373678,7.6549597, + -8.809486,5.5657206,-3.3169382,-8.042887,2.0874746, + -7.079005,-3.33398,-3.6843317,4.0172358,-2.0754814, + 1.1726758,7.4618697,6.9483604,-8.469206,0.7401797, + -10.318176,8.384557,10.5476265,9.146971,-9.250223, + 0.6290606,4.4941425,-0.7514017,7.2271705,-8.309598, + -1.4761636,4.0140634,-6.021102,9.132852,5.6610966, + -11.249811,8.359293,-1.9445792,-7.7393436,-0.3931331, + -8.824441,-2.5995944,-2.5714035,4.140213,-3.6863053, + 5.517265,9.020411,-4.9286127,-7.871219,-3.7446704, + 2.5179656,-1.4543481,-2.2703636,7.010597,-3.6436229, + 6.753862,7.4129915,7.1406755,5.653706,9.5445175, + 
0.15698843,4.761813,-7.698002,1.6870106,-4.5410123, + 4.171763,5.3747005,6.341021,7.456738,-8.231657, + 2.763487,-9.208167,6.676799,-1.1957736,10.062605, + 4.0975976,7.312957,-2.4981596,-2.9658387,-8.150425, + -2.1075552,2.64375,1.6636052,1.1483809,0.09276015, + 5.8556347,-7.8481026,-5.9913163,-0.02840613,-9.937289, + -1.0486673,-5.2340155,-3.83912,7.7165728,-8.409944, + 0.80863273,-6.9119215,7.5712357,0.36031485,-6.056131, + -8.470033,1.8678337,3.0121377,-7.3096333,8.205484, + 5.262654,8.774514,-4.7603083,-7.2096143,-4.437014, + 3.6080024,-1.624254,-4.2787876,8.880863,-4.8984556, + 5.1782074,9.944454,3.911282,3.5396595,8.867042, + -1.2006199,5.393288,-5.6455317,0.7829499,-4.0338907, + 2.479272,6.5080743,8.582535,7.0097537,-6.9823785, + 3.984318,-7.225381,5.3135114,-1.0391048,8.951443, + -0.70119005,-8.510742,-0.42949116,-10.9224825,2.8176029, + 1.6800792,5.778404,1.7269998,7.1975236,7.7258267, + 2.7632928,5.3399253,3.4650044,0.01971426,-1.6468811, + 4.114996,-1.5110453,6.8689218,8.269899,-3.1568048, + -7.0344677,1.2911975,5.950357,0.19028673,4.657226, + -8.199647,2.246055,8.989509,5.3101015,-4.2400866 + }; + + std::vector X_embedded = { + -0.41849962,-0.53906363,0.46958843,-0.35832694,-0.23779503,-0.29751351, + -0.01072748,-0.21353109,-0.54769957,-0.55086273,0.37093949,-0.12714292, + -0.06639574,-0.36098689,-0.13060696,-0.07362658,-1.01205945,-0.39285606, + 0.2864089,-0.32031146,-0.19595343,0.08900568,-0.04813879,-0.06563424, + -0.42655188,-0.69014251,0.51459783,-0.1942696,-0.07767916,-0.6119386, + 0.04813685,-0.22557008,-0.56890118,-0.60293794,0.43429622,-0.09240723, + -0.00624062,-0.25800395,-0.1886092,0.01655941,-0.01961523,-0.14147359, + 0.41414487,-0.8512944,-0.61199242,-0.18586016,0.14024924,-0.41635606, + -0.02890144,0.1065347,0.39700791,-1.14060664,-0.95313865,0.14416681, + 0.17306046,-0.53189689,-0.98987544,-0.67918193,0.41787854,-0.20878236, + -0.06612862,0.03502904,-0.03765266,-0.0980606,-0.00971657,0.29432917, + 
0.36575687,-1.1645509,-0.89094597,0.03718805,0.2310573,-0.38345811, + -0.10401925,-0.10653082,0.38469055,-0.88302094,-0.80197543,0.03548668, + 0.02775662,-0.54374295,0.03379983,0.00923623,0.29320273,-1.05263519, + -0.93360096,0.03778313,0.12360487,-0.56437284,0.0644429,0.33432651, + 0.36450726,-1.22978747,-0.83822101,-0.18796451,0.34888434,-0.3801491, + -0.45327303,-0.59747899,0.39697698,-0.15616602,-0.06159166,-0.40301991, + -0.11725303,-0.11913263,-0.12406619,-0.11227967,0.43083835,-0.90535849, + -0.81646025,0.10012121,-0.0141237,-0.63747931,0.04805023,0.34190539, + 0.50725192,-1.17861414,-0.74641538,-0.09333111,0.27992678,-0.56214809, + 0.04970971,0.36249384,0.57705611,-1.16913795,-0.69849908,0.10957897, + 0.27983218,-0.62088525,0.0410459,0.23973398,0.40960434,-1.14183664, + -0.83321381,0.02149482,0.21720445,-0.49869928,-0.95655465,-0.51680422, + 0.45761383,-0.08351214,-0.12151554,0.00819737,-0.20813803,-0.01055793, + 0.25319234,0.36154974,0.1822421,-1.15837133,-0.92209691,-0.0501582, + 0.08535917,-0.54003763,-1.08675635,-1.04009593,0.09408128,0.07009826, + -0.01762833,-0.19180447,-0.18029785,-0.20342001,0.04034991,0.1814747, + 0.36906669,-1.13532007,-0.8852452,0.0782818,0.16825101,-0.50301319, + -0.29128098,-0.65341312,0.51484352,-0.38758236,-0.22531103,-0.55021971, + 0.10804344,-0.3521522,-0.38849035,-0.74110794,0.53761131,-0.25142813, + -0.1118066,-0.47453368,0.06347904,-0.23796193,-1.02682328,-0.47594091, + 0.39515916,-0.2782529,-0.16566519,0.08063579,0.00810116,-0.06213913, + -1.059654,-0.62496334,0.53698546,-0.11806234,0.00356161,0.11513405, + -0.14213292,0.04102662,-0.36622161,-0.73686272,0.48323864,-0.27338892, + -0.14203401,-0.41736352,0.03332564,-0.21907479,-0.06396769,0.01831361, + 0.46263444,-1.01878166,-0.86486858,0.17622118,-0.01249686,-0.74530888, + -0.9354887,-0.5027945,0.38170099,-0.15547098,0.00677824,-0.04677663, + -0.13541745,0.07253501,-0.97933143,-0.58001202,0.48235369,-0.18836913, + 
-0.02430783,0.07572441,-0.08101331,0.00630076,-0.16881248,-0.67989182, + 0.46083611,-0.43910736,-0.29321918,-0.38735861,0.07669903,-0.29749861, + -0.40047669,-0.56722462,0.33168188,-0.13118173,-0.06672747,-0.56856316, + -0.26269144,-0.14236671,0.10651901,0.4962585,0.38848072,-1.06653547, + -0.64079332,-0.47378591,0.43195483,-0.04856951,-0.9840439,-0.70610428, + 0.34028092,-0.2089237,-0.05382041,0.01625874,-0.02080803,-0.12535211, + -0.04146428,-1.24533033,0.48944879,0.0578458,0.26708388,-0.90321028, + 0.35377088,-0.36791429,-0.35382384,-0.52748734,0.42854419,-0.31744713, + -0.19174226,-0.39073724,-0.03258846,-0.19978228,-0.36185205,-0.57412046, + 0.43681973,-0.25414538,-0.12904905,-0.46334973,-0.03123853,-0.11303604, + -0.87073672,-0.45441297,0.41825858,-0.25303507,-0.21845073,0.10248682, + -0.11045569,-0.10002795,-0.00572806,0.16519061,0.42651513,-1.11417019, + -0.83789682,0.02995787,0.16843079,-0.53874511,0.03056994,0.17877036, + 0.49632853,-1.03276777,-0.74778616,-0.03971953,0.10907949,-0.67385727, + -0.9523471,-0.56550741,0.40409449,-0.2703723,-0.10175014,0.13605487, + -0.06306008,-0.01768126,-0.4749442,-0.56964815,0.39389887,-0.19248079, + -0.04161081,-0.38728487,-0.20341556,-0.12656988,-0.35949609,-0.46137866, + 0.28798422,-0.06603147,-0.04363992,-0.60343552,-0.23565227,-0.10242701, + -0.06792886,0.09689897,0.33259571,-0.98854214,-0.84444433,0.00673901, + 0.13457057,-0.43145794,-0.51500046,-0.50821936,0.38000089,0.0132636, + 0.0580942,-0.40157595,-0.11967677,0.02549113,-0.10350953,0.22918226, + 0.40411913,-1.05619383,-0.71218503,-0.02197581,0.26422262,-0.34765676, + 0.06601537,0.21712676,0.34723559,-1.20982027,-0.95646334,0.00793948, + 0.27620381,-0.43475035,-0.67326003,-0.6137197,0.43724492,-0.17666136, + -0.06591748,-0.18937394,-0.07400128,-0.06881691,-0.5201112,-0.61088628, + 0.4225319,-0.18969463,-0.06921366,-0.33993208,-0.06990873,-0.10288513, + -0.70659858,-0.56003648,0.46628812,-0.16090363,-0.0185108,-0.1431348, + 
-0.1128775,-0.0078648,-0.02323332,0.04292452,0.39291084,-0.94897962, + -0.63863206,-0.16546988,0.23698957,-0.30633628 + }; + + ML::cumlHandle h; + cudaStream_t stream = h.getStream(); + auto d_alloc = h.getDeviceAllocator(); + + float* d_X = (float*)d_alloc->allocate(X.size() * sizeof(float), stream); + float* d_X_embedded = (float*)d_alloc->allocate(X_embedded.size() * sizeof(float), stream); + + updateDevice(d_X, X.data(), X.size(), stream); + updateDevice(d_X_embedded, X_embedded.data(), X_embedded.size(), stream); + + // euclidean test + score = trustworthiness_score(h, d_X, d_X_embedded, 50, 30, 8, 5, 5); + + d_alloc->deallocate(d_X, X.size() * sizeof(float), stream); + d_alloc->deallocate(d_X_embedded, X_embedded.size() * sizeof(float), stream); + } + + void SetUp() override { + basicTest(); + } + + void TearDown() override { + } + +protected: + double score; + +}; + + +typedef TrustworthinessScoreTest TrustworthinessScoreTestF; +TEST_F(TrustworthinessScoreTestF, Result) { + ASSERT_TRUE(0.9374 < score && score < 0.9376); +} \ No newline at end of file diff --git a/python/cuml/metrics/trustworthiness.pyx b/python/cuml/metrics/trustworthiness.pyx index 9d3f72a58e..1efbf019ca 100644 --- a/python/cuml/metrics/trustworthiness.pyx +++ b/python/cuml/metrics/trustworthiness.pyx @@ -28,10 +28,18 @@ from libc.stdint cimport uintptr_t from cuml.common.handle cimport cumlHandle cdef extern from "metrics/trustworthiness.h" namespace "ML::Metrics": + cdef double trustworthiness_score[T](const cumlHandle& h, T* X, - T* X_embedded, int n, int m, int d, int n_neighbors) + T* X_embedded, int n, int m, int d, + int n_neighbors, int metric) + + +metric_codes = { + 'euclidean': 5 +} + -def trustworthiness(X, X_embedded, handle=None, n_neighbors=5): +def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, metric='euclidean', should_downcast=True): """ Expresses to what extent the local structure is retained in embedding. The score is defined in the range [0, 1]. 
@@ -53,6 +61,11 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5): Trustworthiness of the low-dimensional embedding """ + if metric in metric_codes: + metric_code = metric_codes[metric] + else: + raise Exception("Unknown metric") + if isinstance(X, cudf.DataFrame) and isinstance(X_embedded, cudf.DataFrame): datatype1 = np.dtype(X[X.columns[0]]._column.dtype) datatype2 = np.dtype(X_embedded[X_embedded.columns[0]]._column.dtype) @@ -67,11 +80,13 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5): else: raise TypeError("X and X_embedded parameters must both be cuDF Dataframes or Numpy ndarray") - if datatype1 != datatype2: - raise TypeError("X and X_embedded parameters must be of same type") - - if datatype1 != np.float32 or datatype2 != np.float32: # currently only float32 is available - return TypeError("X and X_embedded parameters must be of type float32") + if datatype1 != np.float32 or datatype2 != np.float32: + if should_downcast: + X = to_single_precision(X) + X_embedded = to_single_precision(X_embedded) + else: + raise Exception("Input is double precision. 
Use 'should_downcast=True' " + "if you'd like it to be automatically casted to single precision.") if isinstance(X, cudf.DataFrame): d_X = X.as_gpu_matrix(order='C') @@ -89,14 +104,9 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5): else: handle_ = handle.getHandle() - if datatype1 == np.float32: - res = trustworthiness_score[float](handle_[0], d_X_ptr, - d_X_embedded_ptr, n_samples, n_features, - n_components, n_neighbors) - #else: - # res = trustworthiness_score[double](handle_[0], d_X_ptr, - # d_X_embedded_ptr, n_samples, n_features, - # n_components, n_neighbors) + res = trustworthiness_score[float](handle_[0], d_X_ptr, + d_X_embedded_ptr, n_samples, n_features, + n_components, n_neighbors, metric_code) if handle is None: del handle_ @@ -107,4 +117,23 @@ def get_ctype_ptr(obj): # The manner to access the pointers in the gdf's might change, so # encapsulating access in the following 3 methods. They might also be # part of future gdf versions. - return obj.device_ctypes_pointer.value \ No newline at end of file + return obj.device_ctypes_pointer.value + + +def to_single_precision(X): + if isinstance(X, cudf.DataFrame): + new_cols = [(col,X._cols[col].astype(np.float32)) for col in X._cols] + overflowed = sum([len(colval[colval >= np.inf]) for colname, colval in new_cols]) + + if overflowed > 0: + raise Exception("Downcast to single-precision resulted in data loss.") + + X = cudf.DataFrame(new_cols) + else: + X = X.astype(np.float32) + overflowed = len(X[X >= np.inf]) + + if overflowed > 0: + raise Exception("Downcast to single-precision resulted in data loss.") + + return X \ No newline at end of file From b0363948966dc6145060bc31b7a0e3c59e9bf8c0 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 19:58:58 -0500 Subject: [PATCH 079/156] FIX Update setup.py --- python/setup.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/setup.py b/python/setup.py index 6720882015..218481cf1b 100644 --- 
a/python/setup.py +++ b/python/setup.py @@ -58,12 +58,11 @@ extensions = [ Extension("*", sources=['cuml/*/*.pyx'], - include_dirs=['../cuML/src', - '../cuML/external', - '../cuML/external/ml-prims/src', - '../cuML/external/ml-prims/external/cutlass', - '../cuML/external/cutlass', - '../cuML/external/ml-prims/external/cub', + include_dirs=['../cpp/src', + '../cpp/external', + '../cpp/src_prims', + '../thirdparty/cutlass', + '../thirdparty/cub', cuda_include_dir, rmm_include_dir], library_dirs=[get_python_lib()], From cc5b4c070dae6bd610be851823b28cfd26788e3d Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 22:09:07 -0500 Subject: [PATCH 080/156] FIX include path in rmmallocator --- cpp/src/common/rmmAllocatorAdapter.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/common/rmmAllocatorAdapter.hpp b/cpp/src/common/rmmAllocatorAdapter.hpp index 3508b56f21..48272b2f2b 100644 --- a/cpp/src/common/rmmAllocatorAdapter.hpp +++ b/cpp/src/common/rmmAllocatorAdapter.hpp @@ -18,7 +18,7 @@ #include -#include "../../../ml-prims/src/utils.h" +#include "../../src_prims/utils.h" #include "../cuML.hpp" From 8f277bd29d77160d7fba237890e531e6caee96c7 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 22:10:24 -0500 Subject: [PATCH 081/156] DOC Add entry to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 18d8e1ee80..2fa72f3ae4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ ## Improvements +- PR #597: C++ cuML and ml-prims folder refactor - PR #590: QN Recover from numeric errors - PR #482: Introduce cumlHandle for pca and tsvd - PR #573: Remove use of unnecessary cuDF column and series copies From e137c0ca9666fc8b255dc5614bf8b3eecf76e4d6 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 22:14:26 -0500 Subject: [PATCH 082/156] FIX Add missing condition for prims tests --- cpp/CMakeLists.txt | 68 
++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 538e188dcf..4ed48aee9f 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -328,49 +328,53 @@ endif(BUILD_CUML_TESTS) if(BUILD_CUML_MG_TESTS) -file(GLOB_RECURSE cuml_mg_test_cuda_sources "test/mg/*.cu") + file(GLOB_RECURSE cuml_mg_test_cuda_sources "test/mg/*.cu") -add_executable(ml_mg_test - ${cuml_mg_test_cuda_sources} - ${ml_prims_header}) + add_executable(ml_mg_test + ${cuml_mg_test_cuda_sources} + ${ml_prims_header}) -target_link_libraries(ml_mg_test - ${GTEST_LIBNAME} - ${CUDA_cublas_LIBRARY} - ${CUDA_curand_LIBRARY} - ${CUDA_cusolver_LIBRARY} - ${CUDA_cusparse_LIBRARY} - ${CUDA_CUDART_LIBRARY} - ${CUDA_cusparse_LIBRARY} - ${CUDA_nvgraph_LIBRARY} - gpufaisslib - faisslib - ${CUML_CPP_TARGET} - pthread - ${ZLIB_LIBRARIES}) + target_link_libraries(ml_mg_test + ${GTEST_LIBNAME} + ${CUDA_cublas_LIBRARY} + ${CUDA_curand_LIBRARY} + ${CUDA_cusolver_LIBRARY} + ${CUDA_cusparse_LIBRARY} + ${CUDA_CUDART_LIBRARY} + ${CUDA_cusparse_LIBRARY} + ${CUDA_nvgraph_LIBRARY} + gpufaisslib + faisslib + ${CUML_CPP_TARGET} + pthread + ${ZLIB_LIBRARIES}) endif(BUILD_CUML_MG_TESTS) ################################################################################################### # - build prims_test executable ---------------------------------------------------------------- -file(GLOB_RECURSE mlprims_test_cuda_sources "test/prims/*.cu") +if(BUILD_PRIM_TESTS) + + file(GLOB_RECURSE mlprims_test_cuda_sources "test/prims/*.cu") + + set(MLPRIMS_LINK_LIBRARIES + ${CUDA_cublas_LIBRARY} + ${CUDA_curand_LIBRARY} + ${CUDA_cusolver_LIBRARY} + ${CUDA_cusparse_LIBRARY} + pthread + ${ZLIB_LIBRARIES}) -set(MLPRIMS_LINK_LIBRARIES - ${CUDA_cublas_LIBRARY} - ${CUDA_curand_LIBRARY} - ${CUDA_cusolver_LIBRARY} - ${CUDA_cusparse_LIBRARY} - pthread - ${ZLIB_LIBRARIES}) + add_executable(prims_test + ${mlprims_test_cuda_sources} + 
${ml_prims_header}) -add_executable(prims_test - ${mlprims_test_cuda_sources} - ${ml_prims_header}) + target_link_libraries(prims_test + ${GTEST_LIBNAME} + ${MLPRIMS_LINK_LIBRARIES}) -target_link_libraries(prims_test - ${GTEST_LIBNAME} - ${MLPRIMS_LINK_LIBRARIES}) +endif(BUILD_PRIM_TESTS) ################################################################################################### # - build examples ------------------------------------------------------------------------- From e65df9f53d1c8469b277f8b14417015ae925cf51 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 22:22:48 -0500 Subject: [PATCH 083/156] ENH change behavior of disabling libcuml++ --- BUILD.md | 2 +- cpp/CMakeLists.txt | 10 ++++++---- cpp/README.md | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/BUILD.md b/BUILD.md index 04723e5555..d9ed320377 100644 --- a/BUILD.md +++ b/BUILD.md @@ -113,7 +113,7 @@ cuML's cmake has the following configurable flags available: | Flag | Possible Values | Default Value | Behavior | | --- | --- | --- | --- | | BLAS_LIBRARIES | path/to/blas_lib | "" | Optional variable allowing to manually specify location of BLAS library. | -| BUILD_CUML_CPP_LIBRARY | [ON, OFF] | ON | Enable/disable building libcuml++ shared library. If either BUILD_CUML_TESTS or BUILD_CUML_MG_TESTS are set to ON, this variable is forced to be ON | +| BUILD_CUML_CPP_LIBRARY | [ON, OFF] | ON | Enable/disable building libcuml++ shared library. Setting this variable to `OFF` sets the variables BUILD_CUML_TESTS, BUILD_CUML_MG_TESTS and BUILD_CUML_EXAMPLES to `OFF` | | BUILD_CUML_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_test`. | | BUILD_CUML_MG_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_mg_test`. | | BUILD_PRIM_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `prims_test`. 
| diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4ed48aee9f..475c648789 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -65,10 +65,12 @@ set(GPU_ARCHS "" CACHE STRING set(CMAKE_IGNORE_PATH "${CMAKE_INSTALL_DIR}/lib" CACHE STRING "Ignore any libs added implicitly from the CMAKE_INSTALL_DIR") -# Bulding cuml_test or cuml_mg_test executables forces building libcuml++ -if(BUILD_CUML_TESTS OR BUILD_CUML_MG_TESTS) - set(BUILD_CUML_CPP_LIBRARY ON) -endif(BUILD_CUML_TESTS OR BUILD_CUML_MG_TESTS) +# Disabling libcuml++ disables buidling algorithm tests and examples +if(NOT BUILD_CUML_CPP_LIBRARY) + set(BUILD_CUML_TESTS OFF) + set(BUILD_CUML_MG_TESTS OFF) + set(BUILD_CUML_EXAMPLES OFF) +endif(NOT BUILD_CUML_CPP_LIBRARY) ################################################################################################### diff --git a/cpp/README.md b/cpp/README.md index 617c448b8e..361dddaf32 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -29,7 +29,7 @@ Current cmake offers the following configuration options: | Flag | Possible Values | Default Value | Behavior | | --- | --- | --- | --- | | BLAS_LIBRARIES | path/to/blas_lib | "" | Optional variable allowing to manually specify location of BLAS library. | -| BUILD_CUML_CPP_LIBRARY | [ON, OFF] | ON | Enable/disable building libcuml++ shared library. If either BUILD_CUML_TESTS or BUILD_CUML_MG_TESTS are set to ON, this variable is forced to be ON | +| BUILD_CUML_CPP_LIBRARY | [ON, OFF] | ON | Enable/disable building libcuml++ shared library. Setting this variable to `OFF` sets the variables BUILD_CUML_TESTS, BUILD_CUML_MG_TESTS and BUILD_CUML_EXAMPLES to `OFF` | | BUILD_CUML_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_test`. | | BUILD_CUML_MG_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_mg_test`. | | BUILD_PRIM_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `prims_test`. 
| From d206bee4a7eb335b676d3a79a19ebe3ca386acf4 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 22:27:07 -0500 Subject: [PATCH 084/156] FIX Update GPU CI script for cmake changes --- ci/gpu/build.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 071e38d072..ded07aae19 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -67,7 +67,7 @@ logger "Build libcuml..." mkdir -p $WORKSPACE/cpp/build cd $WORKSPACE/cpp/build logger "Run cmake libcuml..." -cmake -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=ON -DBLAS_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a -DLAPACK_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a $GPU_ARCH .. +cmake -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=ON -DBLAS_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a -DLAPACK_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a $GPU_ARCH -DBUILD_PRIM_TESTS=OFF .. logger "Clean up make..." make clean @@ -104,7 +104,9 @@ pytest --cache-clear --junitxml=${WORKSPACE}/junit-cuml.xml -v ################################################################################ logger "Build ml-prims tests..." -cd $WORKSPACE/cpp/build +mkdir -p $WORKSPACE/cpp/build_prims +cd $WORKSPACE/cpp/build_prims +cmake -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=ON -DBLAS_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a -DLAPACK_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a $GPU_ARCH -DBUILD_CUML_CPP_LIBRARY=OFF .. logger "Clean up make..." make clean @@ -113,4 +115,4 @@ make -j${PARALLEL_LEVEL} prims_test logger "Run ml-prims test..." 
cd $WORKSPACE/ml-prims/build -GTEST_OUTPUT="xml:${WORKSPACE}/test-results/ml-prims/" ./test/prims_test +GTEST_OUTPUT="xml:${WORKSPACE}/test-results/ml-prims/" ./prims_test From 4dcf8c2fbe7a508d852135d9cae0df2ad72e687c Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 22:28:25 -0500 Subject: [PATCH 085/156] FIX Remove unused flag in GPU CI build script --- ci/gpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index ded07aae19..02688667f5 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -67,7 +67,7 @@ logger "Build libcuml..." mkdir -p $WORKSPACE/cpp/build cd $WORKSPACE/cpp/build logger "Run cmake libcuml..." -cmake -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=ON -DBLAS_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a -DLAPACK_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a $GPU_ARCH -DBUILD_PRIM_TESTS=OFF .. +cmake -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=ON -DBLAS_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a $GPU_ARCH -DBUILD_PRIM_TESTS=OFF .. logger "Clean up make..." make clean @@ -106,7 +106,7 @@ pytest --cache-clear --junitxml=${WORKSPACE}/junit-cuml.xml -v logger "Build ml-prims tests..." mkdir -p $WORKSPACE/cpp/build_prims cd $WORKSPACE/cpp/build_prims -cmake -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=ON -DBLAS_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a -DLAPACK_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a $GPU_ARCH -DBUILD_CUML_CPP_LIBRARY=OFF .. +cmake -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=ON -DBLAS_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a $GPU_ARCH -DBUILD_CUML_CPP_LIBRARY=OFF .. logger "Clean up make..." 
make clean From a9880d577a92e0c22c496b051f5538bdd040d02e Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 22:31:20 -0500 Subject: [PATCH 086/156] FIX cpp/scripts folder updates --- cpp/scripts/plotter_kf.py | 73 --------------------------------- cpp/scripts/run-clang-format.py | 11 ++++- 2 files changed, 9 insertions(+), 75 deletions(-) delete mode 100644 cpp/scripts/plotter_kf.py diff --git a/cpp/scripts/plotter_kf.py b/cpp/scripts/plotter_kf.py deleted file mode 100644 index cfa5853b4a..0000000000 --- a/cpp/scripts/plotter_kf.py +++ /dev/null @@ -1,73 +0,0 @@ -# Dependencies - matlab python3 numpy -# -# plotter python script to look at 2 dimentional data from kalman filters -# and analyse the result visually. -# -# running instuctions - $ python plotter.py - - -import matplotlib.pyplot as plt -import numpy as np -from decimal import Decimal - -file_location = "C:\\Users\\apoorva\\Desktop\\gitlab\\KalmanFilter\\testing\\measure4.txt" -f = open(file_location,'r') -read_data = f.read() - -print ("data read!") - -arr_data = read_data.split() - -num_Lines = len(arr_data) - -linestx = [] -linestv = [] -enestx = [] -enestv = [] -z = [] -linupx = [] -linupv = [] -enupx = [] -enupv = [] -x = [] - -print ("sumber of lines " + str(num_Lines)) - -for i in range(num_Lines): - j = i % 9 - - if (j == 0): - linestx.append(float(arr_data[i])) - if (j == 1): - linestv.append(float(arr_data[i])) - if (j == 2): - enestx.append(float(arr_data[i])) - if (j == 3): - enestv.append(float(arr_data[i])) - if (j == 4): - z.append(float(arr_data[i])) - if (j == 5): - linupx.append(float(arr_data[i])) - if (j == 6): - linupv.append(float(arr_data[i])) - if (j == 7): - enupx.append(float(arr_data[i])) - if (j == 8): - enupv.append(float(arr_data[i])) - x.append(i/8 - 1) - -# LKF_est = np.array([linestx]) -# LKF_up = np.array([linupx]) -# EnKF_est = np.array([enestx]) -# EnKF_up = np.array([enupx]) -# Measurements = np.array([z]) - - - -plt.plot (x, 
linestx,label='LKF_est') -plt.plot (x, linupx, label='LKF_up') -plt.plot (x, enestx, label='EnKF_est') -plt.plot (x, enupx, label='EnKF_up') -plt.plot (x, z, 'g^', label='Measurements') -plt.legend(loc='best') -plt.show() diff --git a/cpp/scripts/run-clang-format.py b/cpp/scripts/run-clang-format.py index 9b3102a15d..4c219f9c58 100755 --- a/cpp/scripts/run-clang-format.py +++ b/cpp/scripts/run-clang-format.py @@ -33,6 +33,7 @@ def listAllSources(fileRegexStr, srcdir, bindir, inplace): allFiles.append((src, dst)) return allFiles + def parseArgs(): argparser = argparse.ArgumentParser("Run clang-format on a project") argparser.add_argument("-bindir", type=str, default=".", @@ -49,6 +50,7 @@ def parseArgs(): help="List of dirs where to find sources") return argparser.parse_args() + def isNewer(src, dst): if not os.path.exists(dst): return True @@ -56,6 +58,7 @@ def isNewer(src, dst): b = os.path.getmtime(dst) return a >= b + def runClangFormat(src, dst, exe): # run the clang format command itself if isNewer(src, dst): @@ -66,18 +69,21 @@ def runClangFormat(src, dst, exe): try: subprocess.check_call(cmd, shell=True) except subprocess.CalledProcessError: - print("Unable to run clang-format! Please configure your environment.") + print("Unable to run clang-format!" + " Please configure your environment.") raise # run the diff to check if there are any formatting issues cmd = "diff -q %s %s >/dev/null" % (src, dst) try: subprocess.check_call(cmd, shell=True) except subprocess.CalledProcessError: - print("clang-format failed! Run 'diff %s %s' to see the formatting issues!" % + print("clang-format failed!" + " Run 'diff %s %s' to see the formatting issues!" 
% (src, dst)) return False return True + def main(): args = parseArgs() allFiles = [] @@ -98,5 +104,6 @@ def main(): sys.exit(-1) return + if __name__ == "__main__": main() From 1ecc892f0112b3a0031093ee952b32ef694b4223 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 22:47:41 -0500 Subject: [PATCH 087/156] FIX PEP8 style fixes --- cpp/examples/dbscan/gen_dataset.py | 28 ++++++++++++++-------------- cpp/examples/kmeans/prepare_input.py | 15 +++++++++------ 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/cpp/examples/dbscan/gen_dataset.py b/cpp/examples/dbscan/gen_dataset.py index f4ff1a9190..123c3161e0 100755 --- a/cpp/examples/dbscan/gen_dataset.py +++ b/cpp/examples/dbscan/gen_dataset.py @@ -15,41 +15,41 @@ # limitations under the License. # import argparse -import numpy as np from sklearn.datasets.samples_generator import make_blobs parser = argparse.ArgumentParser('gen_dataset.py ') -parser.add_argument('-ns', '--num_samples', type=int, default=10000, \ +parser.add_argument('-ns', '--num_samples', type=int, default=10000, help='Number of samples (default 10000)') -parser.add_argument('-nf', '--num_features', type=int, default=25, \ +parser.add_argument('-nf', '--num_features', type=int, default=25, help='Number of features (default 25)') -parser.add_argument('-nc', '--num_clusters', type=int, default=15, \ +parser.add_argument('-nc', '--num_clusters', type=int, default=15, help='Number of clusters (default 15)') -parser.add_argument('--filename_prefix', type=str, default='synthetic', \ - help='Prefix used for output dataset file (default synthetic)') -parser.add_argument('-sd', '--standard_dev' type=str, default=0.1, \ +parser.add_argument('--filename_prefix', type=str, default='synthetic', + help='Prefix used for output file (default synthetic)') +parser.add_argument('-sd', '--standard_dev', type=str, default=0.1, help='Standard deviation of samples generated') -parser.add_argument('-st', '--random_state' type=str, 
default=123456, \ +parser.add_argument('-st', '--random_state', type=str, default=123456, help='Standard deviation of samples generated') args = parser.parse_args() datasetFile = '%s-%dx%d-clusters-%d.txt' \ - % (args.filename_prefix, args.num_samples, args.num_features, \ + % (args.filename_prefix, args.num_samples, args.num_features, args.num_clusters) -X, _ = make_blobs(n_samples=args.num_samples, n_features=args.num_features, \ - centers=args.num_clusters, cluster_std=arg.standard_dev, \ - random_state=arg.random_state) +X, _ = make_blobs(n_samples=args.num_samples, n_features=args.num_features, + centers=args.num_clusters, cluster_std=args.standard_dev, + random_state=args.random_state) fp = open(datasetFile, 'w') for row in range(args.num_samples): for col in range(args.num_features): - fp.write('%f\n' %X[row, col]) + fp.write('%f\n' % X[row, col]) fp.close() print('Dataset file: %s' % datasetFile) -print('Generated total %d samples with %d features each' % (args.num_samples, args.num_features)) +print('Generated total %d samples with %d features each' % (args.num_samples, + args.num_features)) print('Number of clusters = %d' % args.num_clusters) diff --git a/cpp/examples/kmeans/prepare_input.py b/cpp/examples/kmeans/prepare_input.py index 3bcd1266f2..1a2b65972d 100755 --- a/cpp/examples/kmeans/prepare_input.py +++ b/cpp/examples/kmeans/prepare_input.py @@ -33,10 +33,12 @@ output_file = "output.txt" if len(sys.argv) > 3: output_file = sys.argv[3] -print("Reading Input from train_file = %s and test_file = %s" % (train_file, test_file) ) +print("Reading Input from train_file = %s and test_file = %s" % (train_file, + test_file)) if not os.path.exists(train_file) or not os.path.exists(test_file): - raise Exception("Download the dataset from here: https://www.kaggle.com/c/homesite-quote-conversion/data") + raise Exception("Download the dataset from here:" + " https://www.kaggle.com/c/homesite-quote-conversion/data") train = pd.read_csv(train_file) print("Training 
dataset dimension: ", train.shape) @@ -45,7 +47,7 @@ # Data munging step - KMeans takes only numerical values train.drop(['QuoteConversion_Flag'], axis=1, inplace=True) dataset = pd.concat([train, test], ignore_index=True) -tmp = dataset.dtypes.reset_index().rename(columns={0:"type"}) +tmp = dataset.dtypes.reset_index().rename(columns={0: "type"}) indx = tmp["type"] == "object" categoricals = tmp[indx]["index"].tolist() # Replace nans as new category @@ -67,14 +69,15 @@ val_dict = val_dict / float(dataset.shape[0]) val_dict = val_dict.to_dict() dataset[col] = dataset[col].apply(lambda x: val_dict[x]) -trainenc = dataset.iloc[:train.shape[0],:].reset_index(drop = True) +trainenc = dataset.iloc[:train.shape[0], :].reset_index(drop=True) trainencflt = trainenc.values.astype(np.float32) print("Output dataset dimension: ", trainencflt.shape) -output = open(output_file,"w+") +output = open(output_file, "w+") num_items = 0 for row in trainencflt: for val in row: output.write("%f\n" % val) num_items += 1 output.close() -print("Wrote %d values in row major order to output %s" % (num_items, output_file)) +print("Wrote %d values in row major order to output %s" % (num_items, + output_file)) From d7dcd9d7a9846dfdc4686a54810e85a5e3213f65 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 9 May 2019 23:39:08 -0500 Subject: [PATCH 088/156] FIX Path of GPU CI prims test --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 02688667f5..15662761ff 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -114,5 +114,5 @@ logger "Make ml-prims test..." make -j${PARALLEL_LEVEL} prims_test logger "Run ml-prims test..." -cd $WORKSPACE/ml-prims/build +cd $WORKSPACE/cpp/build_prims GTEST_OUTPUT="xml:${WORKSPACE}/test-results/ml-prims/" ./prims_test From 49ae13235a12f596358018fc15f42d9a736a02f3 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 10 May 2019 11:21:18 -0400 Subject: [PATCH 089/156] Adding C++ unit test for epsilon_neighborhood prim --- cuML/src/dbscan/vertexdeg/algo.h | 4 +- ml-prims/src/distance/distance.h | 48 ++++++++++++-- ml-prims/test/CMakeLists.txt | 1 + ml-prims/test/dist_eps.cu | 105 +++++++++++++++++++++++++++++++ ml-prims/test/dist_eps.h | 15 ----- 5 files changed, 149 insertions(+), 24 deletions(-) create mode 100644 ml-prims/test/dist_eps.cu delete mode 100644 ml-prims/test/dist_eps.h diff --git a/cuML/src/dbscan/vertexdeg/algo.h b/cuML/src/dbscan/vertexdeg/algo.h index b5e6980aaa..0a3e06683e 100644 --- a/cuML/src/dbscan/vertexdeg/algo.h +++ b/cuML/src/dbscan/vertexdeg/algo.h @@ -45,8 +45,6 @@ template void launcher(const ML::cumlHandle_impl& handle, Pack data, int startVertexId, int batchSize, cudaStream_t stream) { data.resetArray(stream, batchSize+1); - typedef cutlass::Shape<8, 128, 128> OutputTile_t; - int m = data.N; int n = min(data.N - startVertexId, batchSize); int k = data.D; @@ -66,7 +64,7 @@ void launcher(const ML::cumlHandle_impl& handle, Pack data, int startVe if (workspaceSize != 0) workspace.resize(workspaceSize, stream); - MLCommon::Distance::epsilon_neighborhood + MLCommon::Distance::epsilon_neighborhood (data.x, data.x+startVertexId*k, data.adj, m, n, k, eps2, (void*)workspace.data(), workspaceSize, stream, [vd, n] __device__ (int global_c_idx, bool in_neigh) { diff --git a/ml-prims/src/distance/distance.h b/ml-prims/src/distance/distance.h index 507c613577..beb8d7616b 100644 --- a/ml-prims/src/distance/distance.h +++ b/ml-prims/src/distance/distance.h @@ -25,6 +25,8 @@ namespace MLCommon { namespace Distance { +typedef cutlass::Shape<8, 128, 128> OutputTile_t; + /** enum to tell how to compute euclidean distance */ enum DistanceType { /** evaluate as dist_ij = sum(x_ik^2) + sum(y_ij)^2 - 2*sum(x_ik * y_jk) */ @@ -178,6 +180,7 @@ void distance(InType *x, InType *y, OutType *dist, int m, int n, int k, FinalLambda fin_op, cudaStream_t stream) { 
DistanceImpl distImpl; distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream); + CUDA_CHECK(cudaPeekAtLastError()); } /** @@ -208,6 +211,8 @@ void distance(InType *x, InType *y, OutType *dist, int m, int n, int k, [] __device__(AccType d_val, int g_d_idx) { return d_val; }; distance( x, y, dist, m, n, k, workspace, worksize, default_fin_op, stream); + + CUDA_CHECK(cudaPeekAtLastError()); } @@ -216,6 +221,7 @@ void distance(InType *x, InType *y, OutType *dist, int m, int n, int k, * filtering the final distance by some epsilon. * @tparam distanceType: distance metric to compute between a and b matrices * @tparam T: the type of input matrices a and b + * @tparam Lambda: * @param a: row-major input matrix a * @param b: row-major input matrix b * @param adj: a boolean output adjacency matrix @@ -227,16 +233,16 @@ void distance(InType *x, InType *y, OutType *dist, int m, int n, int k, * variant for efficiency, the epsilon will need to be squared as well. * @param workspace: temporary workspace needed for computations * @param worksize: number of bytes of the workspace - * @param fused_op: a 2-argument lambda function taking the output index into c + * @param stream cuda stream + * @param fused_op: optional functor taking the output index into c * and a boolean denoting whether or not the inputs are part of * the epsilon neighborhood. 
- * - * @param stream cuda stream */ -templatevoid> +templatevoid > size_t epsilon_neighborhood(T *a, T *b, bool *adj, int m, int n, int k, T eps, - void *workspace, size_t worksize, cudaStream_t stream, - Lambda fused_op = [] __device__(int o, bool t){}) { + void *workspace, size_t worksize, cudaStream_t stream, Lambda fused_op) { auto epsilon_op = [n, eps, fused_op] __device__ (T val, int global_c_idx) { bool acc = val <= eps; fused_op(global_c_idx, acc); @@ -249,5 +255,35 @@ size_t epsilon_neighborhood(T *a, T *b, bool *adj, int m, int n, int k, T eps, return worksize; } +/** + * @brief Constructs an epsilon neighborhood adjacency matrix by + * filtering the final distance by some epsilon. + * @tparam distanceType: distance metric to compute between a and b matrices + * @tparam T: the type of input matrices a and b + * @tparam Lambda: + * @param a: row-major input matrix a + * @param b: row-major input matrix b + * @param adj: a boolean output adjacency matrix + * @param m: number of points in a + * @param n: number of points in b + * @param k: dimensionality + * @param eps: the epsilon value to use as a filter for neighborhood construction. + * it is important to note that if the distance type returns a squared + * variant for efficiency, the epsilon will need to be squared as well. 
+ * @param workspace: temporary workspace needed for computations + * @param worksize: number of bytes of the workspace + * @param stream cuda stream + */ +template +size_t epsilon_neighborhood(T *a, T *b, bool *adj, int m, int n, int k, T eps, + void *workspace, size_t worksize, cudaStream_t stream) { + return epsilon_neighborhood( + a, b, adj, m, n, k, eps, workspace, worksize, stream, + [] __device__ (int c_idx, bool acc) {} + ); +} + + }; // end namespace Distance }; // end namespace MLCommon diff --git a/ml-prims/test/CMakeLists.txt b/ml-prims/test/CMakeLists.txt index 9bf642c15b..2036c7cb78 100644 --- a/ml-prims/test/CMakeLists.txt +++ b/ml-prims/test/CMakeLists.txt @@ -33,6 +33,7 @@ add_executable(mlcommon_test decoupled_lookback.cu dist_adj.cu dist_cos.cu + dist_eps.cu dist_euc_exp.cu dist_euc_unexp.cu dist_l1.cu diff --git a/ml-prims/test/dist_eps.cu b/ml-prims/test/dist_eps.cu new file mode 100644 index 0000000000..56044c41f3 --- /dev/null +++ b/ml-prims/test/dist_eps.cu @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "distance/distance.h" + +#include +#include "test_utils.h" + +#include +#include + + +namespace MLCommon { +namespace Distance { + + +/** + * For now, this is mostly to test the c++ algorithm is able to be built. + * Comprehensive comparisons of resulting embeddings are being done in the + * Python test suite. 
Next to come will be a CUDA implementation of t-SNE's + * trustworthiness score, which will allow us to gtest embedding algorithms. + */ +class EpsilonNeighborhoodTest: public ::testing::Test { + +protected: + void SetUp() override {} + void TearDown() override {} +}; + + +typedef EpsilonNeighborhoodTest TestNeighborhoodsNoFunctor; +TEST_F(TestNeighborhoodsNoFunctor, Result) { + + cudaStream_t stream; + CUDA_CHECK( cudaStreamCreate(&stream) ); + + int m = 6; + int k = 2; + + float *data; + bool *adj, *expected; + + allocate(data, m*k, true); + allocate(adj, m*m, true); + allocate(expected, m*m, true); + + std::vector data_h = { 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0 }; + data_h.resize(m*k); + + bool *expected_h = new bool[m*m]{ + 1, 1, 1, 0, 0, 0, + 1, 1, 1, 0, 0, 0, + 1, 1, 1, 0, 0, 0, + 0, 0, 0, 1, 1, 0, + 0, 0, 0, 1, 1, 0, + 0, 0, 0, 0, 0, 1 + }; + + updateDevice(data, data_h.data(), m*k, stream); + updateDevice(expected, expected_h, m*m, stream); + + float eps = 3.0; + + char* workspace; + size_t workspaceSize = 0; + + constexpr auto distance_type = MLCommon::Distance::DistanceType::EucExpandedL2Sqrt; + + workspaceSize = MLCommon::Distance::getWorkspaceSize + (data, data, m, m, k); + + if (workspaceSize != 0) + allocate(workspace, workspaceSize, true); + + epsilon_neighborhood + (data, data, adj, m, m, k, eps, (void*)workspace, workspaceSize, stream); + + CUDA_CHECK( cudaStreamSynchronize(stream) ); + + + ASSERT_TRUE(devArrMatch(adj, expected, m*m, Compare(), stream)); + + CUDA_CHECK( cudaStreamDestroy(stream) ); + CUDA_CHECK( cudaFree(data) ); + CUDA_CHECK( cudaFree(adj) ); + + delete expected_h; +} +}; +}; diff --git a/ml-prims/test/dist_eps.h b/ml-prims/test/dist_eps.h deleted file mode 100644 index 9ef3f8fe4e..0000000000 --- a/ml-prims/test/dist_eps.h +++ /dev/null @@ -1,15 +0,0 @@ -/* - * dist_eps.h - * - * Created on: May 9, 2019 - * Author: cjnolet - */ - -#ifndef DIST_EPS_H_ -#define DIST_EPS_H_ - - - - - -#endif /* DIST_EPS_H_ */ 
From ed9630fa6448843a790ab915c65b4befc8b771cf Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Fri, 10 May 2019 11:30:24 -0400 Subject: [PATCH 090/156] Update v0.7 release date --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 768efe16de..b1d40d2635 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -# cuML 0.7.0 (Date TBD) +# cuML 0.7.0 (10 May 2019) ## New Features From 0072e26e19bc269458d06fd750910eb0d160281a Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 10 May 2019 11:48:37 -0400 Subject: [PATCH 091/156] Adding c++ unit tests for make_monotonic --- ml-prims/src/array/array.h | 34 ++++++++++++---- ml-prims/test/CMakeLists.txt | 1 + ml-prims/test/array.cu | 75 ++++++++++++++++++++++++++++++++++++ ml-prims/test/dist_eps.cu | 7 ---- 4 files changed, 102 insertions(+), 15 deletions(-) create mode 100644 ml-prims/test/array.cu diff --git a/ml-prims/src/array/array.h b/ml-prims/src/array/array.h index d05a9d77cb..7b0f448e7b 100644 --- a/ml-prims/src/array/array.h +++ b/ml-prims/src/array/array.h @@ -21,12 +21,12 @@ namespace Array { template __global__ void map_label_kernel(Type *map_ids, Type *in, Type *out, - Type N, Lambda filter_op) { + size_t N, Lambda filter_op) { int tid = threadIdx.x + blockIdx.x*TPB_X; if(tid < N) { if(!filter_op(in[tid])) { - for(int i=0; i < N; i++) { + for(size_t i=0; i < N; i++) { if(in[tid] == map_ids[i]) { out[tid] = i + 1; break; @@ -54,11 +54,12 @@ __global__ void map_label_kernel(Type *map_ids, Type *in, Type *out, * @param filter_op an optional function for specifying which values * should have monotonically increasing labels applied to them. 
*/ -template -void make_monotonic(Type *out, Type *in, Type N, cudaStream_t stream, - Lambda filter_op = [] __device__ (int val) {return false;}) { +template +void make_monotonic(Type *out, Type *in, size_t N, + cudaStream_t stream, + Lambda filter_op) { - static const int TPB_X = 256; + static const size_t TPB_X = 256; dim3 blocks(ceildiv(N, TPB_X)); dim3 threads(TPB_X); @@ -87,7 +88,24 @@ void make_monotonic(Type *out, Type *in, Type N, cudaStream_t stream, map_label_kernel<<>>(map_ids, in, out, N, filter_op); } - - +/** + * Maps an input array containing a series of numbers into a new array + * where numbers have been mapped to a monotonically increasing set + * of labels. This can be useful in machine learning algorithms, for instance, + * where a given set of labels is not taken from a monotonically increasing + * set. This can happen if they are filtered or if only a subset of the + * total labels are used in a dataset. This is also useful in graph algorithms + * where a set of vertices need to be labeled in a monotonically increasing + * order. + * @tparam Type the numeric type of the input and output arrays + * @tparam Lambda the type of an optional filter function, which determines + * which items in the array to map. 
+ * @param N number of elements in the input array + * @param stream cuda stream to use + */ +template +void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream) { + make_monotonic(out, in, N, stream, [] __device__ (Type val) {return false;}); +} }; }; diff --git a/ml-prims/test/CMakeLists.txt b/ml-prims/test/CMakeLists.txt index 2036c7cb78..6f62db43fa 100644 --- a/ml-prims/test/CMakeLists.txt +++ b/ml-prims/test/CMakeLists.txt @@ -22,6 +22,7 @@ include_directories(${GTEST_DIR}/googletest/include) # (please keep the filenames in alphabetical order) add_executable(mlcommon_test add.cu + array.cu binary_op.cu ternary_op.cu coalesced_reduction.cu diff --git a/ml-prims/test/array.cu b/ml-prims/test/array.cu new file mode 100644 index 0000000000..38e6182c4b --- /dev/null +++ b/ml-prims/test/array.cu @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "array/array.h" + +#include +#include "test_utils.h" + +#include +#include + + +namespace MLCommon { +namespace Array { + + +class ArrayTest: public ::testing::Test { + +protected: + void SetUp() override {} + void TearDown() override {} +}; + + +typedef ArrayTest MakeMonotonicTest; +TEST_F(MakeMonotonicTest, Result) { + + cudaStream_t stream; + CUDA_CHECK( cudaStreamCreate(&stream) ); + + int m = 12; + + float *data, *actual, *expected; + + allocate(data, m, true); + allocate(actual, m, true); + allocate(expected, m, true); + + float* data_h = new float[m]{ 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0 }; + + float *expected_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0 }; + + updateDevice(data, data_h, m, stream); + updateDevice(expected, expected_h, m, stream); + + make_monotonic(actual, data, m, stream); + + CUDA_CHECK( cudaStreamSynchronize(stream) ); + + ASSERT_TRUE(devArrMatch(actual, expected, m, Compare(), stream)); + + CUDA_CHECK( cudaStreamDestroy(stream) ); + CUDA_CHECK( cudaFree(data) ); + CUDA_CHECK( cudaFree(actual) ); + + delete data_h; + delete expected_h; +} +}; +}; diff --git a/ml-prims/test/dist_eps.cu b/ml-prims/test/dist_eps.cu index 56044c41f3..b9035558dc 100644 --- a/ml-prims/test/dist_eps.cu +++ b/ml-prims/test/dist_eps.cu @@ -29,12 +29,6 @@ namespace MLCommon { namespace Distance { -/** - * For now, this is mostly to test the c++ algorithm is able to be built. - * Comprehensive comparisons of resulting embeddings are being done in the - * Python test suite. Next to come will be a CUDA implementation of t-SNE's - * trustworthiness score, which will allow us to gtest embedding algorithms. 
- */ class EpsilonNeighborhoodTest: public ::testing::Test { protected: @@ -92,7 +86,6 @@ TEST_F(TestNeighborhoodsNoFunctor, Result) { CUDA_CHECK( cudaStreamSynchronize(stream) ); - ASSERT_TRUE(devArrMatch(adj, expected, m*m, Compare(), stream)); CUDA_CHECK( cudaStreamDestroy(stream) ); From a25aa2b761092e02a39aa1cf734f8a06fa810924 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 10 May 2019 11:52:24 -0400 Subject: [PATCH 092/156] Removing couts from coo tests --- ml-prims/test/coo.cu | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ml-prims/test/coo.cu b/ml-prims/test/coo.cu index b21dfb7dc2..a5757b6aa4 100644 --- a/ml-prims/test/coo.cu +++ b/ml-prims/test/coo.cu @@ -105,8 +105,6 @@ TEST_P(COOSymmetrize, Result) { [] __device__ (int row, int col, float val, float trans) { return val+trans; }, stream); - std::cout << out << std::endl; - ASSERT_TRUE(out.nnz == expected.nnz); ASSERT_TRUE(devArrMatch(out.rows, expected.rows, out.nnz, Compare())); ASSERT_TRUE(devArrMatch(out.cols, expected.cols, out.nnz, Compare())); @@ -201,8 +199,6 @@ TEST_P(COORemoveZeros, Result) { updateDevice(in.cols, in_h.cols, params.nnz, stream); updateDevice(in.vals, in_h.vals, params.nnz, stream); - std::cout << in << std::endl; - coo_sort(&in); int out_rows_ref_h[2] = { 0, 3 }; @@ -215,8 +211,6 @@ TEST_P(COORemoveZeros, Result) { COO out_ref(2, 5, 5); COO out; - std::cout << in << std::endl; - updateDevice(out_ref.rows, *&out_rows_ref_h, 2, stream); updateDevice(out_ref.cols, *&out_cols_ref_h, 2, stream); updateDevice(out_ref.vals, out_vals_ref_h, 2, stream); From e4b0b8d8026242078cb8ec70368b978de253d3cd Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 10 May 2019 12:00:29 -0400 Subject: [PATCH 093/156] Removing the labeling algorithm since it's now a one-liner prim call. Will probably do this with the other algos eventually. Adding some useful comments to the other algos. 
--- cuML/src/dbscan/adjgraph/algo.h | 8 +- cuML/src/dbscan/labelling/algo1.h | 124 ----------------------------- cuML/src/dbscan/labelling/algo2.h | 61 -------------- cuML/src/dbscan/labelling/naive.h | 98 ----------------------- cuML/src/dbscan/labelling/pack.h | 58 -------------- cuML/src/dbscan/labelling/runner.h | 58 -------------- cuML/src/dbscan/runner.h | 1 - cuML/src/dbscan/vertexdeg/algo.h | 10 +-- 8 files changed, 2 insertions(+), 416 deletions(-) delete mode 100644 cuML/src/dbscan/labelling/algo1.h delete mode 100644 cuML/src/dbscan/labelling/algo2.h delete mode 100644 cuML/src/dbscan/labelling/naive.h delete mode 100644 cuML/src/dbscan/labelling/pack.h delete mode 100644 cuML/src/dbscan/labelling/runner.h diff --git a/cuML/src/dbscan/adjgraph/algo.h b/cuML/src/dbscan/adjgraph/algo.h index 78ffef9948..fc999af8d0 100644 --- a/cuML/src/dbscan/adjgraph/algo.h +++ b/cuML/src/dbscan/adjgraph/algo.h @@ -38,8 +38,7 @@ static const int TPB_X = 256; /** * Takes vertex degree array (vd) and CSR row_ind array (ex_scan) to produce the - * CSR row_ind_ptr array (adj_graph) and values array(core_pts). This could be - * made into a reusable prim by providing a lambda for a fused op, given the + * CSR row_ind_ptr array (adj_graph) and filters into a core_pts array based on min_pts. 
*/ template void launcher(const ML::cumlHandle_impl& handle, Pack data, Type batchSize, cudaStream_t stream) { @@ -55,9 +54,6 @@ void launcher(const ML::cumlHandle_impl& handle, Pack data, Type batchSize int minPts = data.minPts; int *vd = data.vd; - std::cout << MLCommon::arr2Str(data.ex_scan, batchSize, "ex_scan", stream) << std::endl; - std::cout << MLCommon::arr2Str(data.adj, batchSize*data.N, "adj", stream) << std::endl; - MLCommon::Sparse::csr_adj_graph_batched( data.ex_scan, data.N, @@ -71,8 +67,6 @@ void launcher(const ML::cumlHandle_impl& handle, Pack data, Type batchSize core_pts[row] = (vd[row] >= minPts); }); - std::cout << MLCommon::arr2Str(data.adj_graph, data.adjnnz, "adj_graph", stream) << std::endl; - CUDA_CHECK(cudaPeekAtLastError()); } diff --git a/cuML/src/dbscan/labelling/algo1.h b/cuML/src/dbscan/labelling/algo1.h deleted file mode 100644 index a858590a99..0000000000 --- a/cuML/src/dbscan/labelling/algo1.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include "pack.h" -#include "dbscan/common.h" -#include -#include - -namespace Dbscan { -namespace Label { -namespace Algo1 { - -using namespace thrust; -using namespace MLCommon; - -template -__global__ void bfs_device(Pack data, int startVertexId, int batchSize) { - int tid = threadIdx.x + blockIdx.x*TPB_X; - if(tid < batchSize) { - if(data.fa[tid + startVertexId]) { - data.fa[tid + startVertexId] = false; - data.xa[tid + startVertexId] = true; - int start = int(data.ex_scan[tid]); - for(int i=0; i< int(data.vd[tid]); i++) - data.fa[startVertexId + data.adj_graph[start + i]] = 1 - data.xa[startVertexId + data.adj_graph[start + i]]; - } - } -} - -static const int TPB_X = 256; - -template -void bfs(const ML::cumlHandle_impl& handle, int id, Pack data, Type *host_adj_graph, Type *host_ex_scan, int *host_vd, - bool *host_visited, Type *host_db_cluster, Type cluster, size_t N, - int startVertexId, int batchSize, cudaStream_t stream) { - MLCommon::host_buffer host_xa(handle.getHostAllocator(), stream, N); - MLCommon::host_buffer host_fa(handle.getHostAllocator(), stream, N); - memset(host_xa.data(), false, sizeof(bool)*N); - memset(host_fa.data(), false, sizeof(bool)*N); - host_fa[id] = true; - MLCommon::updateDevice(data.xa, host_xa.data(), N, stream); - MLCommon::updateDevice(data.fa, host_fa.data(), N, stream); - int countFa = 1; - dim3 blocks(ceildiv(batchSize, TPB_X), 1, 1); - dim3 threads(TPB_X, 1, 1); - while(countFa > 0) { - bfs_device<<>>(data, startVertexId, batchSize); - ML::thrustAllocatorAdapter alloc( handle.getDeviceAllocator(), stream ); - auto execution_policy = thrust::cuda::par(alloc).on(stream); - countFa = count(execution_policy, data.fa, data.fa + N, true); - } - MLCommon::updateHost(host_xa.data(), data.xa, N, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - for(int i=0; i -void identifyCluster(const ML::cumlHandle_impl& handle, Pack data, int startVertexId, int batchSize, cudaStream_t stream) { 
- Type cluster = Type(1) + startVertexId; - size_t N = (size_t)data.N; - MLCommon::host_buffer host_vd(handle.getHostAllocator(), stream, batchSize+1); - MLCommon::host_buffer host_core_pts(handle.getHostAllocator(), stream, batchSize); - MLCommon::host_buffer host_visited(handle.getHostAllocator(), stream, N); - MLCommon::host_buffer host_ex_scan(handle.getHostAllocator(), stream, batchSize); - MLCommon::host_buffer host_db_cluster(handle.getHostAllocator(), stream, N); - - MLCommon::updateHost(host_core_pts.data(), data.core_pts, batchSize, stream); - MLCommon::updateHost(host_vd.data(), data.vd, batchSize+1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - size_t adjgraph_size = size_t(host_vd[batchSize]); - MLCommon::host_buffer host_adj_graph(handle.getHostAllocator(), stream, adjgraph_size); - MLCommon::updateHost(host_ex_scan.data(), data.ex_scan, batchSize, stream); - MLCommon::updateHost(host_adj_graph.data(), data.adj_graph, adjgraph_size, stream); - MLCommon::updateHost(host_visited.data(), data.visited, N, stream); - MLCommon::updateHost(host_db_cluster.data(), data.db_cluster, N, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - for(int i=0; i -void launcher(const ML::cumlHandle_impl& handle, Pack data, int startVertexId, int batchSize, cudaStream_t stream) { - if(startVertexId == 0) - data.resetArray(stream); - CUDA_CHECK(cudaMemsetAsync(data.db_cluster, 0, sizeof(Type)*data.N, stream)); - identifyCluster(handle, data, startVertexId, batchSize, stream); -} - -} //End Algo1 -} //End Label -} //End Dbscan diff --git a/cuML/src/dbscan/labelling/algo2.h b/cuML/src/dbscan/labelling/algo2.h deleted file mode 100644 index 05dc870c8a..0000000000 --- a/cuML/src/dbscan/labelling/algo2.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include "pack.h" -#include "dbscan/common.h" -#include -#include -#include -#include - -#include "sparse/csr.h" - -namespace Dbscan { -namespace Label { - -/** - * This implementation comes from [1] and solves component labeling problem in - * parallel. - * - * todo: This might also be reusable as a more generalized connected component - * labeling algorithm. - * - * [1] Hawick, K.A et al, 2010. "Parallel graph component labelling with GPUs and CUDA" - */ -namespace Algo2 { - -using namespace thrust; -using namespace MLCommon; - -static const int TPB_X = 256; - -template -void launcher(const ML::cumlHandle_impl& handle, Pack data, Type N, - int startVertexId, int batchSize, cudaStream_t stream) { - - -} - -} // End Algo2 -} // End Label -} // End Dbscan diff --git a/cuML/src/dbscan/labelling/naive.h b/cuML/src/dbscan/labelling/naive.h deleted file mode 100644 index 242e855794..0000000000 --- a/cuML/src/dbscan/labelling/naive.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include "pack.h" -#include "../common.h" -#include -#include -#include - -namespace Dbscan { -namespace Label { -namespace Naive { - -using namespace std; -template -void bfs(int id, Type *host_adj_graph, - Type *host_ex_scan, Type *host_vd, bool *host_visited, - Type *host_db_cluster, Type cluster, bool *host_xa, size_t N) { - queue q; - q.push(id); - host_xa[id] = true; - while(!q.empty()) { - int f = q.front(); - q.pop(); - Type start = host_ex_scan[f]; - for(int i = 0; i< host_vd[f]; i++) { - if(!host_xa[host_adj_graph[start + i]]) { - q.push(host_adj_graph[start + i]); - host_xa[host_adj_graph[start + i]] = true; - } - } - } - - for(int i=0; i -void launcher(const ML::cumlHandle_impl& handle, Pack data, int startVertexId, int batchSize, cudaStream_t stream) { - size_t N = (size_t)data.N; - MLCommon::host_buffer host_vd(handle.getHostAllocator(), stream, N+1); - MLCommon::host_buffer host_core_pts(handle.getHostAllocator(), stream, N); - MLCommon::host_buffer host_visited(handle.getHostAllocator(), stream, N); - MLCommon::host_buffer host_ex_scan(handle.getHostAllocator(), stream, N); - MLCommon::host_buffer host_db_cluster(handle.getHostAllocator(), stream, N); - MLCommon::host_buffer host_xa(handle.getHostAllocator(), stream, N); - data.resetArray(stream); - /** this line not in resetArray function because it interferes with algo2 */ - //CUDA_CHECK(cudaMemsetAsync(data.db_cluster, 0, sizeof(Type)*N, stream)); - MLCommon::updateHost(host_core_pts.data(), data.core_pts, N, stream); - MLCommon::updateHost(host_vd.data(), data.vd, N+1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - size_t adjgraph_size = size_t(host_vd[N]); - MLCommon::host_buffer host_adj_graph(handle.getHostAllocator(), stream, adjgraph_size); - MLCommon::updateHost(host_ex_scan.data(), data.ex_scan, N, stream); - 
MLCommon::updateHost(host_adj_graph.data(), data.adj_graph, adjgraph_size, stream); - MLCommon::updateHost(host_xa.data(), data.xa, N, stream); - MLCommon::updateHost(host_visited.data(), data.visited, N, stream); - MLCommon::updateHost(host_db_cluster.data(), data.db_cluster, N, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - Type cluster = Type(1); - for(int i=0; i -struct Pack { - /** - * vertex degree array - * Last position is the sum of all elements in this array (excluding it) - * Hence, its length is one more than the number of poTypes - */ - int *vd; - /** the adjacency matrix */ - bool *adj; - /** the adjacency graph */ - Type *adj_graph; - /** exculusive scan generated from vd */ - Type *ex_scan; - /** array to store whether a vertex is core poType or not */ - bool *core_pts; - /** number of poTypes in the dataset */ - Type N; - /** Minpts for classifying core pts */ - Type minPts; - /** arra to store visited points */ - bool *visited; - /** array to store the final cluster */ - Type *db_cluster; - - MLCommon::Sparse::WeakCCState *state; - - void resetArray(cudaStream_t stream) { - CUDA_CHECK(cudaMemsetAsync(visited, false, sizeof(bool)*N, stream)); - CUDA_CHECK(cudaMemsetAsync(db_cluster, 0, sizeof(Type)*N, stream)); - } -}; - -} // namespace Label -} // namespace Dbscan diff --git a/cuML/src/dbscan/labelling/runner.h b/cuML/src/dbscan/labelling/runner.h deleted file mode 100644 index 016919b723..0000000000 --- a/cuML/src/dbscan/labelling/runner.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2018-2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include "naive.h" -#include "algo1.h" -#include "pack.h" -#include "algo2.h" -#include -#include "sparse/csr.h" - -namespace Dbscan { -namespace Label { - - -template -void run(const ML::cumlHandle_impl& handle, bool* adj, int* vd, Type* adj_graph, Type* ex_scan, Type N, - Type minpts, bool* core_pts, bool* visited, Type *db_cluster, - MLCommon::Sparse::WeakCCState *state, - int algo, int startVertexId, int batchSize, cudaStream_t stream) { - Pack data = {vd, adj, adj_graph, ex_scan, core_pts, N, minpts, - visited, db_cluster, state}; - switch(algo) { - case 0: - Naive::launcher(handle, data, startVertexId, batchSize, stream); - break; - case 1: - ASSERT(N == batchSize, "Label::Algo1 doesn't support batching!"); - Algo1::launcher(handle, data, startVertexId, batchSize, stream); - break; - case 2: - Algo2::launcher(handle, data, N, startVertexId, batchSize, state, stream); - break; - default: - ASSERT(false, "Incorrect algo passed! 
'%d'", algo); - } -} - -} // namespace Label -} // namespace Dbscan - diff --git a/cuML/src/dbscan/runner.h b/cuML/src/dbscan/runner.h index d38feaf745..645e06b16c 100644 --- a/cuML/src/dbscan/runner.h +++ b/cuML/src/dbscan/runner.h @@ -19,7 +19,6 @@ #include #include "vertexdeg/runner.h" #include "adjgraph/runner.h" -#include "labelling/runner.h" #include #include diff --git a/cuML/src/dbscan/vertexdeg/algo.h b/cuML/src/dbscan/vertexdeg/algo.h index 0a3e06683e..69ecd4fcc5 100644 --- a/cuML/src/dbscan/vertexdeg/algo.h +++ b/cuML/src/dbscan/vertexdeg/algo.h @@ -31,15 +31,7 @@ namespace Algo { /** - * Calculates both the vertex degree array and the epsilon neighborhood in a single kernel. - * - * Proposed API for this should be an epsilon neighborhood primitive that accepts a lambda and - * executes the lambda with [n, acc, vertex]. - * - * template - * void epsilon_neighborhood(T *a, T *b, bool *adj, m, n, k, T eps, - * workspaceData, workspaceSize, fused_op, stream) - * + * Calculates the vertex degree array and the epsilon neighborhood adjacency matrix for the batch. */ template void launcher(const ML::cumlHandle_impl& handle, Pack data, int startVertexId, int batchSize, cudaStream_t stream) { From f346b06c97875c7dd9164b114406e8321a95a999 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 10 May 2019 12:09:42 -0400 Subject: [PATCH 094/156] Adjusting comments --- cuML/src/dbscan/runner.h | 10 +++++++++- ml-prims/src/sparse/csr.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/cuML/src/dbscan/runner.h b/cuML/src/dbscan/runner.h index 645e06b16c..75918df54c 100644 --- a/cuML/src/dbscan/runner.h +++ b/cuML/src/dbscan/runner.h @@ -32,6 +32,11 @@ using namespace MLCommon; static const int TPB = 256; +/** + * Adjust labels from weak_cc primitive to match sklearn: + * 1. Turn any labels matching MAX_LABEL into -1 + * 2. Subtract 1 from all other labels. 
+ */ template __global__ void relabelForSkl(Type* labels, Type N, Type MAX_LABEL) { int tid = threadIdx.x + blockDim.x * blockIdx.x; @@ -39,6 +44,10 @@ __global__ void relabelForSkl(Type* labels, Type N, Type MAX_LABEL) { else if(tid < N) --labels[tid]; } +/** + * Turn the non-monotonic labels from weak_cc primitive into + * an array of labels drawn from a monotonically increasing set. + */ template void final_relabel(Type *db_cluster, Type N, cudaStream_t stream) { @@ -138,7 +147,6 @@ size_t run(const ML::cumlHandle_impl& handle, Type_f* x, Type N, Type D, Type_f CUDA_CHECK(cudaPeekAtLastError()); - return (size_t) 0; } } // namespace Dbscan diff --git a/ml-prims/src/sparse/csr.h b/ml-prims/src/sparse/csr.h index 169de278ab..6f1779fb3b 100644 --- a/ml-prims/src/sparse/csr.h +++ b/ml-prims/src/sparse/csr.h @@ -792,6 +792,11 @@ void weak_cc_label_batched(Type *labels, * skipped). The MLCommon::Array package contains a primitive `make_monotonic`, * which will make a monotonically increasing set of labels. * + * This implementation comes from [1] and solves component labeling problem in + * parallel on CSR-indexes based upon the vertex degree and adjacency graph. + * + * [1] Hawick, K.A et al, 2010. "Parallel graph component labelling with GPUs and CUDA" + * * @tparam Type the numeric type of non-floating point elements * @tparam TPB_X the threads to use per block when configuring the kernel * @tparam Lambda the type of an optional filter function (int)->bool @@ -840,6 +845,11 @@ void weak_cc_batched(Type *labels, Type* const row_ind, Type* const row_ind_ptr * skipped). The MLCommon::Array package contains a primitive `make_monotonic`, * which will make a monotonically increasing set of labels. * + * This implementation comes from [1] and solves component labeling problem in + * parallel on CSR-indexes based upon the vertex degree and adjacency graph. + * + * [1] Hawick, K.A et al, 2010. 
"Parallel graph component labelling with GPUs and CUDA" + * * @tparam Type the numeric type of non-floating point elements * @tparam TPB_X the threads to use per block when configuring the kernel * @tparam Lambda the type of an optional filter function (int)->bool @@ -863,6 +873,28 @@ void weak_cc(Type *labels, Type* const row_ind, Type* const row_ind_ptr, filter_op); } +/** + * @brief Compute weakly connected components. Note that the resulting labels + * may not be taken from a monotonically increasing set (eg. numbers may be + * skipped). The MLCommon::Array package contains a primitive `make_monotonic`, + * which will make a monotonically increasing set of labels. + * + * This implementation comes from [1] and solves component labeling problem in + * parallel on CSR-indexes based upon the vertex degree and adjacency graph. + * + * [1] Hawick, K.A et al, 2010. "Parallel graph component labelling with GPUs and CUDA" + * + * @tparam Type the numeric type of non-floating point elements + * @tparam TPB_X the threads to use per block when configuring the kernel + * @tparam Lambda the type of an optional filter function (int)->bool + * @param labels an array for the output labels + * @param row_ind the compressed row index of the CSR array + * @param row_ind_ptr the row index pointer of the CSR array + * @param nnz the size of row_ind_ptr array + * @param N number of vertices + * @param stream the cuda stream to use + * should get considered for labeling. + */ template void weak_cc(Type *labels, Type* const row_ind, Type* const row_ind_ptr, Type nnz, Type N, cudaStream_t stream) { From 2d7415335ebc851b6d2b6768bd17e81176a13866 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Fri, 10 May 2019 13:02:26 -0400 Subject: [PATCH 095/156] Adding license header to array.h --- ml-prims/src/array/array.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/ml-prims/src/array/array.h b/ml-prims/src/array/array.h index 7b0f448e7b..03c9a71fbb 100644 --- a/ml-prims/src/array/array.h +++ b/ml-prims/src/array/array.h @@ -1,10 +1,18 @@ /* - * array.h + * Copyright (c) 2019, NVIDIA CORPORATION. * - * Created on: May 8, 2019 - * Author: cjnolet + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ - #pragma once #include From dcd2946a99b61c704257359ec832d807108c2b04 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Fri, 10 May 2019 13:03:47 -0400 Subject: [PATCH 096/156] Adding comment to non-lambda batched cc function --- ml-prims/src/sparse/csr.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ml-prims/src/sparse/csr.h b/ml-prims/src/sparse/csr.h index 6f1779fb3b..2ab9a134bd 100644 --- a/ml-prims/src/sparse/csr.h +++ b/ml-prims/src/sparse/csr.h @@ -830,6 +830,30 @@ void weak_cc_batched(Type *labels, Type* const row_ind, Type* const row_ind_ptr startVertexId, batchSize, stream, filter_op); } +/** + * @brief Compute weakly connected components. Note that the resulting labels + * may not be taken from a monotonically increasing set (eg. numbers may be + * skipped). 
The MLCommon::Array package contains a primitive `make_monotonic`, + * which will make a monotonically increasing set of labels. + * + * This implementation comes from [1] and solves component labeling problem in + * parallel on CSR-indexes based upon the vertex degree and adjacency graph. + * + * [1] Hawick, K.A et al, 2010. "Parallel graph component labelling with GPUs and CUDA" + * + * @tparam Type the numeric type of non-floating point elements + * @tparam TPB_X the threads to use per block when configuring the kernel + * @tparam Lambda the type of an optional filter function (int)->bool + * @param labels an array for the output labels + * @param row_ind the compressed row index of the CSR array + * @param row_ind_ptr the row index pointer of the CSR array + * @param nnz the size of row_ind_ptr array + * @param N number of vertices + * @param startVertexId the starting vertex index for the current batch + * @param batchSize number of vertices for current batch + * @param state instance of inter-batch state management + * @param stream the cuda stream to use + */ template void weak_cc_batched(Type *labels, Type* const row_ind, Type* const row_ind_ptr, Type nnz, Type N, Type startVertexId, Type batchSize, From 1e236bcbd515ed4e15d66faedd6060c172600e42 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 13:06:13 -0500 Subject: [PATCH 097/156] FEA Added cython flake8 config file --- python/.flake8.cython | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 python/.flake8.cython diff --git a/python/.flake8.cython b/python/.flake8.cython new file mode 100644 index 0000000000..4eb437c8ea --- /dev/null +++ b/python/.flake8.cython @@ -0,0 +1,20 @@ +# +# Copyright (c) 2018-2019, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +[flake8] +filename = *.pyx, *.pxd +exclude = *.egg, build, docs, .git +ignore = E999, E225, E226, E227, W503, W504 From 9e53497f6d77c19fe046206f12a510da59f4ead2 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 13:06:38 -0500 Subject: [PATCH 098/156] FIX kmeans pep8 changes --- python/cuml/cluster/kmeans.pyx | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cuml/cluster/kmeans.pyx b/python/cuml/cluster/kmeans.pyx index d86048f24a..b1a0749393 100644 --- a/python/cuml/cluster/kmeans.pyx +++ b/python/cuml/cluster/kmeans.pyx @@ -131,6 +131,7 @@ cdef extern from "kmeans/kmeans.hpp" namespace "ML::kmeans": double *X_new, int verbose) + class KMeans(Base): """ @@ -370,7 +371,7 @@ class KMeans(Base): fit_predict( handle_[0], self.n_clusters, # n_clusters - 0, # distance metric as squared L2: @todo - support other metrics # noqa + 0, # distance metric as squared L2: @todo - support other metrics # noqa: E501 init_value, # init method self.max_iter, # max_iterations self.tol, # threshold @@ -385,7 +386,7 @@ class KMeans(Base): fit_predict( handle_[0], self.n_clusters, # n_clusters - 0, # distance metric as squared L2: @todo - support other metrics # noqa + 0, # distance metric as squared L2: @todo - support other metrics # noqa: E501 init_value, # init method self.max_iter, # max_iterations self.tol, # threshold @@ -404,8 +405,10 @@ class KMeans(Base): self.handle.sync() cluster_centers_gdf = cudf.DataFrame() for i in range(0, self.n_cols): - cluster_centers_gdf[str(i)] = 
self.cluster_centers_[i:self.n_clusters*self.n_cols:self.n_cols] # noqa - self.cluster_centers_ = cluster_centers_gdf + n_c = self.n_cluster + n_cols = self.n_cols + cc_df[str(i)] = self.cluster_centers_[i:n_c*n_cols:n_cols] + self.cluster_centers_ = cc_df del(X_m) From 559fff6d68a046dbb9101ca1298f711f50d38c4c Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 13:06:57 -0500 Subject: [PATCH 099/156] FIX base pep8 changes --- python/cuml/common/base.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cuml/common/base.pyx b/python/cuml/common/base.pyx index 1027fa4f5c..03a1f34db5 100644 --- a/python/cuml/common/base.pyx +++ b/python/cuml/common/base.pyx @@ -95,10 +95,10 @@ class Base: def get_params(self, deep=True): """ - Returns a dict of all params owned by this class. If the child class has - appropriately overridden the `get_param_names` method and does not need - anything other than what is there in this method, then it doesn't have - to override this method + Returns a dict of all params owned by this class. If the child class + has appropriately overridden the `get_param_names` method and does not + need anything other than what is there in this method, then it doesn't + have to override this method """ params = dict() variables = self.get_param_names() From c86c68a1af787a1e3c535650e32e4211bcdf716b Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 13:07:45 -0500 Subject: [PATCH 100/156] FIX cuda pep8 changes --- python/cuml/common/cuda.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/cuml/common/cuda.pyx b/python/cuml/common/cuda.pyx index 9af5b68d69..ac3845807f 100644 --- a/python/cuml/common/cuda.pyx +++ b/python/cuml/common/cuda.pyx @@ -72,8 +72,9 @@ cdef class Stream: def sync(self): """ - Synchronize on the cudastream owned by this object. Note that this could - raise exception due to issues with previous asynchronous launches! 
+ Synchronize on the cudastream owned by this object. Note that this + could raise exception due to issues with previous asynchronous + launches """ cdef _Stream stream = <_Stream>self.s cdef _Error e = cudaStreamSynchronize(stream) From 4cce924ec1413acdf469bdf76671502fe645ed6b Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 13:11:13 -0500 Subject: [PATCH 101/156] FIX handle pep8 changes --- python/cuml/common/handle.pyx | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/cuml/common/handle.pyx b/python/cuml/common/handle.pyx index 840776b5ac..f46ddbbe8c 100644 --- a/python/cuml/common/handle.pyx +++ b/python/cuml/common/handle.pyx @@ -57,7 +57,8 @@ cdef class Handle: """ # ML::cumlHandle doesn't have copy operator. So, use pointer for the object - # python world cannot access to this raw object directly, hence use 'size_t'! + # python world cannot access to this raw object directly, hence use + # 'size_t'! cdef size_t h def __cinit__(self): @@ -81,7 +82,8 @@ cdef class Handle: Second, the allocator based on RMM. So, this function, basically makes the cumlHandle use a more efficient allocator, instead of the default. """ - cdef shared_ptr[deviceAllocator] rmmAlloc = shared_ptr[deviceAllocator](new rmmAllocatorAdapter()) + cdef shared_ptr[deviceAllocator] rmmAlloc = ( + shared_ptr[deviceAllocator](new rmmAllocatorAdapter())) cdef cumlHandle* h_ = self.h h_.setDeviceAllocator(rmmAlloc) @@ -89,8 +91,8 @@ cdef class Handle: """ Issues a sync on the stream set for this handle. - Once we make `cuml.cuda.Stream` as a mandatory option for creating `cuml.Handle`, - this should go away! 
+ Once we make `cuml.cuda.Stream` as a mandatory option for creating + `cuml.Handle`, this should go away """ cdef cumlHandle* h_ = self.h cdef _Stream stream = h_.getStream() From aa3e09be39be75bc01f7db4a30f7ed43a3f201c7 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 13:42:56 -0500 Subject: [PATCH 102/156] FIX decomposition methods pep8 changes --- python/cuml/decomposition/pca.pyx | 115 ++++++++++++---------- python/cuml/decomposition/tsvd.pyx | 134 ++++++++++++++------------ python/cuml/decomposition/tsvd_mg.pyx | 109 ++++++++++++--------- 3 files changed, 205 insertions(+), 153 deletions(-) diff --git a/python/cuml/decomposition/pca.pyx b/python/cuml/decomposition/pca.pyx index b190e76aa2..7ceba58baf 100644 --- a/python/cuml/decomposition/pca.pyx +++ b/python/cuml/decomposition/pca.pyx @@ -112,15 +112,16 @@ cdef extern from "pca/pca.hpp" namespace "ML": class PCA(Base): """ - PCA (Principal Component Analysis) is a fundamental dimensionality reduction technique used to - combine features in X in linear combinations such that each new component captures the most - information or variance of the data. N_components is usually small, say at 3, where it can be - used for data visualization, data compression and exploratory analysis. + PCA (Principal Component Analysis) is a fundamental dimensionality + reduction technique used to combine features in X in linear combinations + such that each new component captures the most information or variance of + the data. N_components is usually small, say at 3, where it can be used for + data visualization, data compression and exploratory analysis. - cuML's PCA expects a cuDF DataFrame, and provides 2 algorithms Full and Jacobi. - Full (default) uses a full eigendecomposition then selects the top K eigenvectors. - The Jacobi algorithm is much faster as it iteratively tries to correct the top K eigenvectors, - but might be less accurate. 
+ cuML's PCA expects a cuDF DataFrame, and provides 2 algorithms Full and + Jacobi. Full (default) uses a full eigendecomposition then selects the top + K eigenvectors. The Jacobi algorithm is much faster as it iteratively tries + to correct the top K eigenvectors, but might be less accurate. Examples --------- @@ -144,7 +145,8 @@ class PCA(Base): print(f'components: {pca_float.components_}') print(f'explained variance: {pca_float.explained_variance_}') - print(f'explained variance ratio: {pca_float.explained_variance_ratio_}') + exp_var = pca_float.explained_variance_ratio_ + print(f'explained variance ratio: {exp_var}') print(f'singular values: {pca_float.singular_values_}') print(f'mean: {pca_float.mean_}') @@ -205,28 +207,33 @@ class PCA(Base): Parameters ---------- copy : boolean (default = True) - If True, then copies data then removes mean from data. False might cause data to be - overwritten with its mean centered version. + If True, then copies data then removes mean from data. False might + cause data to be overwritten with its mean centered version. handle : cuml.Handle If it is None, a new one is created just for this class iterated_power : int (default = 15) - Used in Jacobi solver. The more iterations, the more accurate, but the slower. + Used in Jacobi solver. The more iterations, the more accurate, but + slower. n_components : int (default = 1) - The number of top K singular vectors / values you want. Must be <= number(columns). + The number of top K singular vectors / values you want. + Must be <= number(columns). random_state : int / None (default = None) - If you want results to be the same when you restart Python, select a state. + If you want results to be the same when you restart Python, select a + state. svd_solver : 'full' or 'jacobi' or 'auto' (default = 'full') - Full uses a eigendecomposition of the covariance matrix then discards components. + Full uses a eigendecomposition of the covariance matrix then discards + components. 
Jacobi is much faster as it iteratively corrects, but is less accurate. tol : float (default = 1e-7) - Used if algorithm = "jacobi". The smaller the tolerance, the more accurate, - but the more slower the algorithm will get to converge. + Used if algorithm = "jacobi". Smaller tolerance can increase accuracy, + but but will slow down the algorithm's convergence. verbose : bool Whether to print debug spews whiten : boolean (default = False) - If True, de-correlates the components. This is done by dividing them by the corresponding - singular values then multiplying by sqrt(n_samples). Whitening allows each component - to have unit variance and removes multi-collinearity. It might be beneficial for downstream + If True, de-correlates the components. This is done by dividing them by + the corresponding singular values then multiplying by sqrt(n_samples). + Whitening allows each component to have unit variance and removes + multi-collinearity. It might be beneficial for downstream tasks like LinearRegression where correlated features cause problems. @@ -248,25 +255,29 @@ class PCA(Base): Notes ------ - PCA considers linear combinations of features, specifically those that maximise global - variance structure. This means PCA is fantastic for global structure analyses, but weak - for local relationships. Consider UMAP or T-SNE for a locally important embedding. + PCA considers linear combinations of features, specifically those that + maximise global variance structure. This means PCA is fantastic for global + structure analyses, but weak for local relationships. Consider UMAP or + T-SNE for a locally important embedding. **Applications of PCA** - PCA is used extensively in practice for data visualization and data compression. It has been used - to visualize extremely large word embeddings like Word2Vec and GloVe in 2 or 3 dimensions, large - datasets of everyday objects and images, and used to distinguish between cancerous cells from - healthy cells. 
+ PCA is used extensively in practice for data visualization and data + compression. It has been used to visualize extremely large word + embeddings like Word2Vec and GloVe in 2 or 3 dimensions, large + datasets of everyday objects and images, and used to distinguish + between cancerous cells from healthy cells. - For an additional example see `the PCA notebook `_. - For additional docs, see `scikitlearn's PCA `_. + For an additional example see `the PCA notebook + `_. + For additional docs, see `scikitlearn's PCA + `_. """ def __init__(self, copy=True, handle=None, iterated_power=15, - n_components=1, random_state=None, svd_solver='auto', tol=1e-7, - verbose=False, whiten=False): + n_components=1, random_state=None, svd_solver='auto', + tol=1e-7, verbose=False, whiten=False): # parameters super(PCA, self).__init__(handle=handle, verbose=verbose) self.copy = copy @@ -366,12 +377,13 @@ class PCA(Base): params.algorithm = self.c_algorithm if self.n_components > self.n_cols: - raise ValueError('Number of components should not be greater than the number of columns in the data') + raise ValueError('Number of components should not be greater than' + 'the number of columns in the data') self._initialize_arrays(params.n_components, params.n_rows, params.n_cols) - cdef uintptr_t components_ptr = self._get_dev_array_ptr(self.components_) + cdef uintptr_t comp_ptr = self._get_dev_array_ptr(self.components_) cdef uintptr_t explained_var_ptr = self._get_cudf_column_ptr( self.explained_variance_) @@ -382,14 +394,14 @@ class PCA(Base): cdef uintptr_t mean_ptr = self._get_cudf_column_ptr(self.mean_) cdef uintptr_t noise_vars_ptr = self._get_cudf_column_ptr( self.noise_variance_) - cdef uintptr_t trans_input_ptr = self._get_dev_array_ptr(self.trans_input_) + cdef uintptr_t t_input_ptr = self._get_dev_array_ptr(self.trans_input_) cdef cumlHandle* handle_ = self.handle.getHandle() if self.gdf_datatype.type == np.float32: pcaFitTransform(handle_[0], input_ptr, - trans_input_ptr, - 
components_ptr, + t_input_ptr, + comp_ptr, explained_var_ptr, explained_var_ratio_ptr, singular_vals_ptr, @@ -399,8 +411,8 @@ class PCA(Base): else: pcaFitTransform(handle_[0], input_ptr, - trans_input_ptr, - components_ptr, + t_input_ptr, + comp_ptr, explained_var_ptr, explained_var_ratio_ptr, singular_vals_ptr, @@ -414,10 +426,11 @@ class PCA(Base): components_gdf = cudf.DataFrame() for i in range(0, params.n_cols): - components_gdf[str(i)] = self.components_[i*params.n_components:(i+1)*params.n_components] + n_c = params.n_components + components_gdf[str(i)] = self.components_[i*n_c:(i+1)*n_c] self.components_ = components_gdf - self.components_ptr = components_ptr + self.components_ptr = comp_ptr self.explained_variance_ptr = explained_var_ptr self.explained_variance_ratio_ptr = explained_var_ratio_ptr self.singular_values_ptr = singular_vals_ptr @@ -439,7 +452,8 @@ class PCA(Base): Parameters ---------- X : cuDF DataFrame, shape (n_samples, n_features) - training data (floats or doubles), where n_samples is the number of samples, and n_features is the number of features. + training data (floats or doubles), where n_samples is the number of + samples, and n_features is the number of features. y : ignored @@ -465,7 +479,8 @@ class PCA(Base): Parameters ---------- X : cuDF DataFrame, shape (n_samples, n_components) - New data (floats or doubles), where n_samples is the number of samples and n_components is the number of components. + New data (floats or doubles), where n_samples is the number of + samples and n_components is the number of components. Returns ------- @@ -524,7 +539,8 @@ class PCA(Base): X_original = cudf.DataFrame() for i in range(0, params.n_cols): - X_original[str(i)] = input_data[i*params.n_rows:(i+1)*params.n_rows] + n_r = params.n_rows + X_original[str(i)] = input_data[i*n_r:(i+1)*n_r] del(X_m) @@ -534,12 +550,14 @@ class PCA(Base): """ Apply dimensionality reduction to X. 
- X is projected on the first principal components previously extracted from a training set. + X is projected on the first principal components previously extracted + from a training set. Parameters ---------- X : cuDF DataFrame, shape (n_samples, n_features) - New data (floats or doubles), where n_samples is the number of samples and n_features is the number of features. + New data (floats or doubles), where n_samples is the number of + samples and n_features is the number of features. Returns ------- @@ -573,11 +591,11 @@ class PCA(Base): params.n_cols = n_cols params.whiten = self.whiten - trans_input_data = cuda.to_device( + t_input_data = cuda.to_device( np.zeros(params.n_rows*params.n_components, dtype=gdf_datatype.type)) - cdef uintptr_t trans_input_ptr = self._get_dev_array_ptr(trans_input_data) + cdef uintptr_t trans_input_ptr = self._get_dev_array_ptr(t_input_data) cdef uintptr_t components_ptr = self.components_ptr cdef uintptr_t singular_vals_ptr = self.singular_values_ptr cdef uintptr_t mean_ptr = self.mean_ptr @@ -606,12 +624,11 @@ class PCA(Base): X_new = cudf.DataFrame() for i in range(0, params.n_components): - X_new[str(i)] = trans_input_data[i*params.n_rows:(i+1)*params.n_rows] + X_new[str(i)] = t_input_data[i*params.n_rows:(i+1)*params.n_rows] del(X_m) return X_new - def get_param_names(self): return ["copy", "iterated_power", "n_components", "svd_solver", "tol", "whiten"] diff --git a/python/cuml/decomposition/tsvd.pyx b/python/cuml/decomposition/tsvd.pyx index 51ba74552c..a1e7a88cd2 100644 --- a/python/cuml/decomposition/tsvd.pyx +++ b/python/cuml/decomposition/tsvd.pyx @@ -92,14 +92,15 @@ cdef extern from "tsvd/tsvd.hpp" namespace "ML": class TruncatedSVD(Base): """ - TruncatedSVD is used to compute the top K singular values and vectors of a large matrix X. - It is much faster when n_components is small, such as in the use of PCA when 3 components is - used for 3D visualization. 
+ TruncatedSVD is used to compute the top K singular values and vectors of a + large matrix X. It is much faster when n_components is small, such as in + the use of PCA when 3 components is used for 3D visualization. - cuML's TruncatedSVD expects a cuDF DataFrame, and provides 2 algorithms Full and Jacobi. - Full (default) uses a full eigendecomposition then selects the top K singular vectors. - The Jacobi algorithm is much faster as it iteratively tries to correct the top K singular - vectors, but might be less accurate. + cuML's TruncatedSVD expects a cuDF DataFrame, and provides 2 algorithms + Full and Jacobi. Full (default) uses a full eigendecomposition then selects + the top K singular vectors. The Jacobi algorithm is much faster as it + iteratively tries to correct the top K singular vectors, but might be + less accurate. Examples --------- @@ -118,12 +119,14 @@ class TruncatedSVD(Base): gdf_float['1'] = np.asarray([4.0,2.0,1.0], dtype = np.float32) gdf_float['2'] = np.asarray([4.0,2.0,1.0], dtype = np.float32) - tsvd_float = TruncatedSVD(n_components = 2, algorithm = "jacobi", n_iter = 20, tol = 1e-9) + tsvd_float = TruncatedSVD(n_components = 2, algorithm = "jacobi", + n_iter = 20, tol = 1e-9) tsvd_float.fit(gdf_float) print(f'components: {tsvd_float.components_}') print(f'explained variance: {tsvd_float.explained_variance_}') - print(f'explained variance ratio: {tsvd_float.explained_variance_ratio_}') + exp_var = tsvd_float.explained_variance_ratio_ + print(f'explained variance ratio: {exp_var}') print(f'singular values: {tsvd_float.singular_values_}') trans_gdf_float = tsvd_float.transform(gdf_float) @@ -151,9 +154,11 @@ class TruncatedSVD(Base): 0 7.439024 1 4.0817795 - Transformed matrix: 0 1 - 0 5.1659107 -2.512643 - 1 3.4638448 -0.042223275 2 4.0809603 3.2164836 + Transformed Matrix: + 0 1 2 + 0 5.1659107 -2.512643 + 1 3.4638448 -0.042223275 + 2 4.0809603 3.2164836 Input matrix: 0 1 2 0 1.0 4.000001 4.000001 @@ -163,19 +168,23 @@ class 
TruncatedSVD(Base): Parameters ----------- algorithm : 'full' or 'jacobi' or 'auto' (default = 'full') - Full uses a eigendecomposition of the covariance matrix then discards components. + Full uses a eigendecomposition of the covariance matrix then discards + components. Jacobi is much faster as it iteratively corrects, but is less accurate. handle : cuml.Handle If it is None, a new one is created just for this class n_components : int (default = 1) - The number of top K singular vectors / values you want. Must be <= number(columns). + The number of top K singular vectors / values you want. + Must be <= number(columns). n_iter : int (default = 15) - Used in Jacobi solver. The more iterations, the more accurate, but the slower. + Used in Jacobi solver. The more iterations, the more accurate, but + slower. random_state : int / None (default = None) - If you want results to be the same when you restart Python, select a state. + If you want results to be the same when you restart Python, select a + state. tol : float (default = 1e-7) - Used if algorithm = "jacobi". The smaller the tolerance, the more accurate, - but the more slower the algorithm will get to converge. + Used if algorithm = "jacobi". Smaller tolerance can increase accuracy, + but but will slow down the algorithm's convergence. verbose : bool Whether to print debug spews @@ -192,24 +201,28 @@ class TruncatedSVD(Base): Notes ------ - TruncatedSVD (the randomized version [Jacobi]) is fantastic when the number of components - you want is much smaller than the number of features. The approximation to the largest - singular values and vectors is very robust, however, this method loses a lot of accuracy - when you want many many components. + TruncatedSVD (the randomized version [Jacobi]) is fantastic when the number + of components you want is much smaller than the number of features. 
The + approximation to the largest singular values and vectors is very robust, + however, this method loses a lot of accuracy when you want many many + components. **Applications of TruncatedSVD** - TruncatedSVD is also known as Latent Semantic Indexing (LSI) which tries to find topics of a - word count matrix. If X previously was centered with mean removal, TruncatedSVD is the - same as TruncatedPCA. TruncatedSVD is also used in information retrieval tasks, recommendation - systems and data compression. + TruncatedSVD is also known as Latent Semantic Indexing (LSI) which + tries to find topics of a word count matrix. If X previously was + centered with mean removal, TruncatedSVD is the same as TruncatedPCA. + TruncatedSVD is also used in information retrieval tasks, + recommendation systems and data compression. - For additional examples, see `the Truncated SVD notebook `_. - For additional documentation, see `scikitlearn's TruncatedSVD docs `_. + For additional examples, see `the Truncated SVD notebook + `_. + For additional documentation, see `scikitlearn's TruncatedSVD docs + `_. 
""" - def __init__(self, algorithm='full', handle=None, n_components=1, n_iter=15, - random_state=None, tol=1e-7, verbose=False): + def __init__(self, algorithm='full', handle=None, n_components=1, + n_iter=15, random_state=None, tol=1e-7, verbose=False): # params super(TruncatedSVD, self).__init__(handle, verbose) self.algorithm = algorithm @@ -299,7 +312,7 @@ class TruncatedSVD(Base): params.algorithm = self.c_algorithm self._initialize_arrays(self.n_components, self.n_rows, self.n_cols) - cdef uintptr_t components_ptr = self._get_dev_array_ptr(self.components_) + cdef uintptr_t comp_ptr = self._get_dev_array_ptr(self.components_) cdef uintptr_t explained_var_ptr = self._get_cudf_column_ptr( self.explained_variance_) @@ -307,7 +320,7 @@ class TruncatedSVD(Base): self.explained_variance_ratio_) cdef uintptr_t singular_vals_ptr = self._get_cudf_column_ptr( self.singular_values_) - cdef uintptr_t trans_input_ptr = self._get_dev_array_ptr(self.trans_input_) + cdef uintptr_t t_input_ptr = self._get_dev_array_ptr(self.trans_input_) if self.n_components> self.n_cols: raise ValueError(' n_components must be < n_features') @@ -316,8 +329,8 @@ class TruncatedSVD(Base): if self.gdf_datatype.type == np.float32: tsvdFitTransform(handle_[0], input_ptr, - trans_input_ptr, - components_ptr, + t_input_ptr, + comp_ptr, explained_var_ptr, explained_var_ratio_ptr, singular_vals_ptr, @@ -325,8 +338,8 @@ class TruncatedSVD(Base): else: tsvdFitTransform(handle_[0], input_ptr, - trans_input_ptr, - components_ptr, + t_input_ptr, + comp_ptr, explained_var_ptr, explained_var_ratio_ptr, singular_vals_ptr, @@ -338,10 +351,11 @@ class TruncatedSVD(Base): components_gdf = cudf.DataFrame() for i in range(0, params.n_cols): - components_gdf[str(i)] = self.components_[i*params.n_components:(i+1)*params.n_components] + n_c = params.n_components + components_gdf[str(i)] = self.components_[i*n_c:(i+1)*n_c] self.components_ = components_gdf - self.components_ptr = components_ptr + self.components_ptr 
= comp_ptr self.explained_variance_ptr = explained_var_ptr self.explained_variance_ratio_ptr = explained_var_ratio_ptr self.singular_values_ptr = singular_vals_ptr @@ -364,7 +378,7 @@ class TruncatedSVD(Base): Returns ---------- X_new : cuDF DataFrame, shape (n_samples, n_components) - Reduced version of X. This will always be a dense cuDF DataFrame + Reduced version of X as a dense cuDF DataFrame """ self.fit(X, _transform=True) @@ -422,16 +436,16 @@ class TruncatedSVD(Base): if gdf_datatype.type == np.float32: tsvdInverseTransform(handle_[0], - trans_input_ptr, - components_ptr, - input_ptr, - params) + trans_input_ptr, + components_ptr, + input_ptr, + params) else: tsvdInverseTransform(handle_[0], - trans_input_ptr, - components_ptr, - input_ptr, - params) + trans_input_ptr, + components_ptr, + input_ptr, + params) # make sure the previously scheduled gpu tasks are complete before the # following transfers start @@ -439,7 +453,8 @@ class TruncatedSVD(Base): X_original = cudf.DataFrame() for i in range(0, params.n_cols): - X_original[str(i)] = input_data[i*params.n_rows:(i+1)*params.n_rows] + n_r = params.n_rows + X_original[str(i)] = input_data[i*n_r:(i+1)*n_r] return X_original @@ -483,27 +498,27 @@ class TruncatedSVD(Base): params.n_rows = len(X) params.n_cols = self.n_cols - trans_input_data = cuda.to_device( + t_input_data = cuda.to_device( np.zeros(params.n_rows*params.n_components, dtype=gdf_datatype.type)) - cdef uintptr_t trans_input_ptr = self._get_dev_array_ptr(trans_input_data) + cdef uintptr_t trans_input_ptr = self._get_dev_array_ptr(t_input_data) cdef uintptr_t components_ptr = self.components_ptr cdef cumlHandle* handle_ = self.handle.getHandle() if gdf_datatype.type == np.float32: tsvdTransform(handle_[0], - input_ptr, - components_ptr, - trans_input_ptr, - params) + input_ptr, + components_ptr, + trans_input_ptr, + params) else: tsvdTransform(handle_[0], - input_ptr, - components_ptr, - trans_input_ptr, - params) + input_ptr, + components_ptr, + 
trans_input_ptr, + params) # make sure the previously scheduled gpu tasks are complete before the # following transfers start @@ -511,11 +526,10 @@ class TruncatedSVD(Base): X_new = cudf.DataFrame() for i in range(0, params.n_components): - X_new[str(i)] = trans_input_data[i*params.n_rows:(i+1)*params.n_rows] + X_new[str(i)] = t_input_data[i*params.n_rows:(i+1)*params.n_rows] del(X_m) return X_new - def get_param_names(self): return ["algorithm", "n_components", "n_iter", "random_state", "tol"] diff --git a/python/cuml/decomposition/tsvd_mg.pyx b/python/cuml/decomposition/tsvd_mg.pyx index bc98e1d988..23b433a882 100644 --- a/python/cuml/decomposition/tsvd_mg.pyx +++ b/python/cuml/decomposition/tsvd_mg.pyx @@ -110,7 +110,8 @@ class TSVDparams: class TruncatedSVDSPMG: """ - Create a DataFrame, fill it with data, and compute Truncated Singular Value Decomposition: + Create a DataFrame, fill it with data, and compute Truncated Singular Value + Decomposition: .. code-block:: python @@ -123,12 +124,14 @@ class TruncatedSVDSPMG: gdf_float['1']=np.asarray([4.0,2.0,1.0],dtype=np.float32) gdf_float['2']=np.asarray([4.0,2.0,1.0],dtype=np.float32) - tsvd_float = TruncatedSVD(n_components = 2, algorithm="jacobi", n_iter=20, tol=1e-9) + tsvd_float = TruncatedSVD(n_components = 2, algorithm="jacobi", + n_iter=20, tol=1e-9) tsvd_float.fit(gdf_float) print(f'components: {tsvd_float.components_}') print(f'explained variance: {tsvd_float.explained_variance_}') - print(f'explained variance ratio: {tsvd_float.explained_variance_ratio_}') + exp_var = tsvd_float.explained_variance_ratio_ + print(f'explained variance ratio: {exp_var}') print(f'singular values: {tsvd_float.singular_values_}') trans_gdf_float = tsvd_float.transform(gdf_float) @@ -158,14 +161,18 @@ class TruncatedSVDSPMG: Transformed matrix: 0 1 0 5.1659107 -2.512643 - 1 3.4638448 -0.042223275 2 4.0809603 3.2164836 + 1 3.4638448 -0.042223275 + 2 4.0809603 3.2164836 Input matrix: 0 1 2 0 1.0 4.000001 4.000001 1 2.0000005 
2.0000005 2.0000007 2 5.000001 0.9999999 1.0000004 - For additional examples, see `the Truncated SVD notebook `_. For additional documentation, see `scikitlearn's TruncatedSVD docs `_. + For additional examples, see the Truncated SVD notebook + `_. + For additional documentation, see `scikitlearn's TruncatedSVD docs + `_. """ @@ -202,22 +209,25 @@ class TruncatedSVDSPMG: self.gdf_datatype) self.components_ = cudf.utils.cudautils.zeros(n_cols*n_components, - self.gdf_datatype) + self.gdf_datatype) - self.explained_variance_ = cudf.Series(cudf.utils.cudautils.zeros(n_components, - self.gdf_datatype)) + self.explained_variance_ = cudf.Series(cudf.utils.cudautils.zeros( + n_components, + self.gdf_datatype)) - self.explained_variance_ratio_ = cudf.Series(cudf.utils.cudautils.zeros(n_components, + self.explained_variance_ratio_ = cudf.Series(np.zeros( + n_components, self.gdf_datatype)) self.mean_ = cudf.Series(cudf.utils.cudautils.zeros(n_cols, - self.gdf_datatype)) + self.gdf_datatype)) - self.singular_values_ = cudf.Series(cudf.utils.cudautils.zeros(n_components, - self.gdf_datatype)) - - self.noise_variance_ = cudf.Series(np.zeros(1, dtype=self.gdf_datatype)) + self.singular_values_ = cudf.Series(cudf.utils.cudautils.zeros( + n_components, + self.gdf_datatype)) + self.noise_variance_ = cudf.Series(np.zeros(1, + dtype=self.gdf_datatype)) def _get_ctype_ptr(self, obj): # The manner to access the pointers in the gdf's might change, so @@ -234,7 +244,8 @@ class TruncatedSVDSPMG: def _fit_spmg(self, X, _transform=True, gpu_ids=[]): if (not isinstance(X, np.ndarray)): - msg = "X matrix must be a Numpy ndarray. Dask will be supported in the next version." + msg = "X matrix must be a Numpy ndarray. Dask will be supported" \ + + " in the next version." 
raise TypeError(msg) n_gpus = len(gpu_ids) @@ -263,7 +274,9 @@ class TruncatedSVDSPMG: params.tol = self.params.tol params.algorithm = self.params.svd_solver - cdef uintptr_t X_ptr, components_ptr, explained_variance_ptr, explained_variance_ratio_ptr, singular_values_ptr, trans_input_ptr, gpu_ids_ptr + cdef uintptr_t X_ptr, components_ptr, explained_variance_ptr + cdef uintptr_t explained_variance_ratio_ptr, singular_values_ptr, + cdef uintptr_t trans_input_ptr, gpu_ids_ptr self.gdf_datatype = X.dtype @@ -273,13 +286,16 @@ class TruncatedSVDSPMG: dtype=X.dtype, order='F') self.explained_variance_ratio_ = np.zeros(self.params.n_components, dtype=X.dtype, order='F') - self.singular_values_ = np.zeros(self.params.n_components, dtype=X.dtype, order='F') - self.trans_input_ = np.zeros((n_rows, self.params.n_components), dtype=X.dtype, order='F') + self.singular_values_ = np.zeros(self.params.n_components, + dtype=X.dtype, order='F') + self.trans_input_ = np.zeros((n_rows, self.params.n_components), + dtype=X.dtype, order='F') X_ptr = X.ctypes.data components_ptr = self.components_.ctypes.data explained_variance_ptr = self.explained_variance_ratio_.ctypes.data - explained_variance_ratio_ptr = self.explained_variance_ratio_.ctypes.data + exp_vr = self.explained_variance_ratio_ + explained_variance_ratio_ptr = exp_vr.ctypes.data singular_values_ptr = self.singular_values_.ctypes.data trans_input_ptr = self.trans_input_.ctypes.data gpu_ids_32 = np.array(gpu_ids, dtype=np.int32) @@ -290,45 +306,44 @@ class TruncatedSVDSPMG: tsvdFitSPMG(X_ptr, components_ptr, singular_values_ptr, - params, - gpu_ids_ptr, - n_gpus) + params, + gpu_ids_ptr, + n_gpus) else: tsvdFitSPMG(X_ptr, components_ptr, singular_values_ptr, - params, - gpu_ids_ptr, - n_gpus) + params, + gpu_ids_ptr, + n_gpus) else: if self.gdf_datatype.type == np.float32: tsvdFitTransformSPMG(X_ptr, trans_input_ptr, - components_ptr, + components_ptr, explained_variance_ptr, - explained_variance_ratio_ptr, + 
explained_variance_ratio_ptr, singular_values_ptr, params, - gpu_ids_ptr, + gpu_ids_ptr, n_gpus) else: tsvdFitTransformSPMG(X_ptr, trans_input_ptr, - components_ptr, + components_ptr, explained_variance_ptr, - explained_variance_ratio_ptr, + explained_variance_ratio_ptr, singular_values_ptr, params, - gpu_ids_ptr, + gpu_ids_ptr, n_gpus) self.components_ = np.transpose(self.components_) return self - def _fit_transform_spmg(self, X, gpu_ids): self._fit_spmg(X, True, gpu_ids) return self.trans_input_ @@ -340,7 +355,8 @@ class TruncatedSVDSPMG: X = np.array(X, order='F') if (not np.isfortran(self.components_)): - self.components_ = np.array(self.components_, order='F', dtype=X.dtype) + self.components_ = np.array(self.components_, order='F', + dtype=X.dtype) n_rows = X.shape[0] n_cols = X.shape[1] @@ -358,7 +374,8 @@ class TruncatedSVDSPMG: params.n_rows = n_rows params.n_cols = self.params.n_cols - original_X = np.zeros((n_rows, self.params.n_cols), dtype=X.dtype, order='F') + original_X = np.zeros((n_rows, self.params.n_cols), dtype=X.dtype, + order='F') cdef uintptr_t X_ptr, original_X_ptr, gpu_ids_ptr, components_ptr @@ -388,7 +405,6 @@ class TruncatedSVDSPMG: gpu_ids_ptr, n_gpus) - return original_X def _transform_spmg(self, X, gpu_ids=[]): @@ -398,7 +414,8 @@ class TruncatedSVDSPMG: X = np.array(X, order='F') if (not np.isfortran(self.components_)): - self.components_ = np.array(self.components_, order='F', dtype=X.dtype) + self.components_ = np.array(self.components_, order='F', + dtype=X.dtype) n_rows = X.shape[0] n_cols = X.shape[1] @@ -416,7 +433,8 @@ class TruncatedSVDSPMG: params.n_rows = n_rows params.n_cols = self.params.n_cols - trans_X = np.zeros((n_rows, self.params.n_components), dtype=X.dtype, order='F') + trans_X = np.zeros((n_rows, self.params.n_components), dtype=X.dtype, + order='F') cdef uintptr_t X_ptr, trans_X_ptr, gpu_ids_ptr, components_ptr @@ -458,7 +476,8 @@ class TruncatedSVDSPMG: Training data (floats or doubles) n_gpus : int - Number 
of gpus to be used for prediction. If gpu_ids parameter has more than element, this parameter is ignored. + Number of gpus to be used for prediction. If gpu_ids parameter + has more than element, this parameter is ignored. gpu_ids: int array GPU ids to be used for prediction. @@ -475,7 +494,6 @@ class TruncatedSVDSPMG: raise ValueError('Number of GPUS should be 2 or more' 'For single GPU, use the normal TruncatedSVD') - def fit_transform(self, X, n_gpus=1, gpu_ids=[]): """ Fit LSI model to X and perform dimensionality reduction on X. @@ -486,7 +504,8 @@ class TruncatedSVDSPMG: Training data (floats or doubles) n_gpus : int - Number of gpus to be used for prediction. If gpu_ids parameter has more than element, this parameter is ignored. + Number of gpus to be used for prediction. If gpu_ids parameter + has more than element, this parameter is ignored. gpu_ids: int array GPU ids to be used for prediction. @@ -494,7 +513,8 @@ class TruncatedSVDSPMG: Returns ---------- X_new : cuDF DataFrame, shape (n_samples, n_components) - Reduced version of X. This will always be a dense cuDF DataFrame + Reduced version of X. This will always be a dense cuDF + DataFrame """ @@ -520,7 +540,8 @@ class TruncatedSVDSPMG: New data. n_gpus : int - Number of gpus to be used for prediction. If gpu_ids parameter has more than element, this parameter is ignored. + Number of gpus to be used for prediction. If gpu_ids parameter + has more than element, this parameter is ignored. gpu_ids: int array GPU ids to be used for prediction. @@ -542,7 +563,6 @@ class TruncatedSVDSPMG: raise ValueError('Number of GPUS should be 2 or more' 'For single GPU, use the normal TruncatedSVD') - def transform(self, X, n_gpus=1, gpu_ids=[]): """ Perform dimensionality reduction on X. @@ -553,7 +573,8 @@ class TruncatedSVDSPMG: New data. n_gpus : int - Number of gpus to be used for prediction. If gpu_ids parameter has more than element, this parameter is ignored. + Number of gpus to be used for prediction. 
If gpu_ids parameter + has more than element, this parameter is ignored. gpu_ids: int array GPU ids to be used for prediction. From 93f7699447b15759cc493aff4bdf564e0f70f089 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 13:43:53 -0500 Subject: [PATCH 103/156] FEA First attempt at enabling cython flake8 in CI --- ci/checks/style.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 2105091ba1..f91cd3bff1 100644 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -12,7 +12,7 @@ PATH=/conda/bin:$PATH source activate gdf # Run flake8 and get results/return code -FLAKE=`flake8 --exclude=cuML,ml-prims,__init__.py,versioneer.py` +FLAKE=`flake8 --exclude=cuML,ml-prims,__init__.py,versioneer.py && flake8 --config=python/.flake8.cython` RETVAL=$? # Output results if failure otherwise show pass From db5bd31e471fb99a0236c3557a78e49fa95062f9 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 14:23:49 -0500 Subject: [PATCH 104/156] FIX filter methods pep8 changes --- python/cuml/filter/kalman_filter.pyx | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/python/cuml/filter/kalman_filter.pyx b/python/cuml/filter/kalman_filter.pyx index 0f57aa6bf4..2246d04473 100644 --- a/python/cuml/filter/kalman_filter.pyx +++ b/python/cuml/filter/kalman_filter.pyx @@ -192,7 +192,8 @@ class KalmanFilter(Base): 'double': np.float64, }[precision] - def __init__(self, dim_x, dim_z, solver='long', precision='single', seed=False): + def __init__(self, dim_x, dim_z, solver='long', precision='single', + seed=False): if solver in ['long', 'short_implicit', 'short_explicit']: self._algorithm = self._get_algorithm_c_name(solver) @@ -244,8 +245,6 @@ class KalmanFilter(Base): cdef int c_dim_x = dim_x cdef int c_dim_z = dim_z - - with nogil: workspace_size = get_workspace_size_f32(var_ptr, @@ -261,10 +260,10 @@ class KalmanFilter(Base): _R_ptr, _H_ptr) - 
self.workspace = cuda.to_device(np.zeros(workspace_size, dtype=self.dtype)) + self.workspace = cuda.to_device(np.zeros(workspace_size, + dtype=self.dtype)) self._workspace_size = workspace_size - def _get_algorithm_c_name(self, algorithm): return { 'long': LongForm, @@ -272,7 +271,6 @@ class KalmanFilter(Base): 'short_explicit': ShortFormImplicit, }[algorithm] - def predict(self, B=None, F=None, Q=None): """ Predict next state (prior) using the Kalman filter state propagation @@ -386,13 +384,8 @@ class KalmanFilter(Base): _ws_ptr, workspace_size) - predict_f64(var64) - # if workspace_size != current_size: - # self.workspace = cuda.to_device(np.zeros(workspace_size, dtype=self.dtype)) - - def update(self, z, R=None, H=None): """ Add a new measurement (z) to the Kalman filter. @@ -468,8 +461,6 @@ class KalmanFilter(Base): _R_ptr, _H_ptr) - - init_f32(var32, dim_x, dim_z, @@ -505,8 +496,6 @@ class KalmanFilter(Base): _R_ptr, _H_ptr) - - init_f64(var64, dim_x, dim_z, @@ -525,7 +514,6 @@ class KalmanFilter(Base): update_f64(var64, z_ptr) - def __setattr__(self, name, value): if name in ["F", "x_up", "x", "P_up", "P", "Q", "H", "R", "z"]: if (isinstance(value, cudf.DataFrame)): @@ -534,7 +522,8 @@ class KalmanFilter(Base): elif (isinstance(value, cudf.Series)): val = value._column._data.mem - elif (isinstance(value, np.ndarray) or cuda.devicearray.is_cuda_ndarray(value)): + elif (isinstance(value, np.ndarray) or + cuda.devicearray.is_cuda_ndarray(value)): val = cuda.to_device(value) super(KalmanFilter, self).__setattr__(name, val) From a7ae688fe45c87e709faeb462fa05077388dbdf4 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 15:02:15 -0500 Subject: [PATCH 105/156] FIX linear_model methods pep8 changes --- python/cuml/linear_model/elastic_net.pyx | 67 +++++--- python/cuml/linear_model/lasso.pyx | 74 +++++---- .../cuml/linear_model/linear_regression.pyx | 125 +++++++------- .../linear_model/linear_regression_mg.pyx | 155 +++++++++--------- 
python/cuml/linear_model/ridge.pyx | 134 ++++++++------- 5 files changed, 301 insertions(+), 254 deletions(-) diff --git a/python/cuml/linear_model/elastic_net.pyx b/python/cuml/linear_model/elastic_net.pyx index 33e6b75781..4f84068cbd 100644 --- a/python/cuml/linear_model/elastic_net.pyx +++ b/python/cuml/linear_model/elastic_net.pyx @@ -19,19 +19,20 @@ # cython: embedsignature = True # cython: language_level = 3 - -import cudf -import numpy as np from cuml.solvers import CD + class ElasticNet: """ - ElasticNet extends LinearRegression with combined L1 and L2 regularizations on the coefficients when - predicting response y with a linear combination of the predictors in X. It can reduce - the variance of the predictors, force some coefficients to be smaell, and improves the conditioning of the problem. + ElasticNet extends LinearRegression with combined L1 and L2 regularizations + on the coefficients when predicting response y with a linear combination of + the predictors in X. It can reduce the variance of the predictors, force + some coefficients to be smaell, and improves the conditioning of the + problem. - cuML's ElasticNet expects a cuDF DataFrame, uses coordinate descent to fit a linear model. + cuML's ElasticNet expects a cuDF DataFrame, uses coordinate descent to fit + a linear model. Examples --------- @@ -84,26 +85,33 @@ class ElasticNet: ----------- alpha : float or double Constant that multiplies the L1 term. Defaults to 1.0. - alpha = 0 is equivalent to an ordinary least square, solved by the LinearRegression object. - For numerical reasons, using alpha = 0 with the Lasso object is not advised. + alpha = 0 is equivalent to an ordinary least square, solved by the + LinearRegression object. + For numerical reasons, using alpha = 0 with the Lasso object is not + advised. Given this, you should use the LinearRegression object. l1_ratio: The ElasticNet mixing parameter, with 0 <= l1_ratio <= 1. - For l1_ratio = 0 the penalty is an L2 penalty. 
For l1_ratio = 1 it is an L1 penalty. + For l1_ratio = 0 the penalty is an L2 penalty. For l1_ratio = 1 it is + an L1 penalty. For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. fit_intercept : boolean (default = True) If True, Lasso tries to correct for the global mean of y. If False, the model expects that you have centered the data. normalize : boolean (default = False) - If True, the predictors in X will be normalized by dividing by it's L2 norm. + If True, the predictors in X will be normalized by dividing by it's L2 + norm. If False, no scaling will be done. max_iter : int The maximum number of iterations tol : float, optional - The tolerance for the optimization: if the updates are smaller than tol, - the optimization code checks the dual gap for optimality and continues until it is smaller than tol. + The tolerance for the optimization: if the updates are smaller than + tol, the optimization code checks the dual gap for optimality and + continues until it is smaller than tol. selection : str, default ‘cyclic’ - If set to ‘random’, a random coefficient is updated every iteration rather than looping over features sequentially by default. - This (setting to ‘random’) often leads to significantly faster convergence especially when tol is higher than 1e-4. + If set to ‘random’, a random coefficient is updated every iteration + rather than looping over features sequentially by default. + This (setting to ‘random’) often leads to significantly faster + convergence especially when tol is higher than 1e-4. Attributes ----------- @@ -113,10 +121,12 @@ class ElasticNet: The independent term. If fit_intercept_ is False, will be 0. - For additional docs, see `scikitlearn's ElasticNet `_. + For additional docs, see `scikitlearn's ElasticNet + `_. 
""" - def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True, normalize=False, max_iter=1000, tol=1e-3, selection='cyclic'): + def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True, + normalize=False, max_iter=1000, tol=1e-3, selection='cyclic'): """ Initializes the elastic-net regression class. @@ -131,7 +141,8 @@ class ElasticNet: tol: float or double. selection : str, ‘cyclic’, or 'random' - For additional docs, see `scikitlearn's ElasticNet `_. + For additional docs, see `scikitlearn's ElasticNet + `_. """ self._check_alpha(alpha) self._check_l1_ratio(l1_ratio) @@ -146,7 +157,7 @@ class ElasticNet: self.tol = tol self.cuElasticNet = None if selection in ['cyclic', 'random']: - self.selection = selection + self.selection = selection else: msg = "selection {!r} is not supported" raise TypeError(msg.format(selection)) @@ -154,7 +165,7 @@ class ElasticNet: self.intercept_value = 0.0 def _check_alpha(self, alpha): - if alpha<= 0.0: + if alpha <= 0.0: msg = "alpha value has to be positive" raise ValueError(msg.format(alpha)) @@ -181,8 +192,10 @@ class ElasticNet: if self.selection == 'random': shuffle = True - self.cuElasticNet = CD(fit_intercept=self.fit_intercept, normalize=self.normalize, alpha=self.alpha, - l1_ratio=self.l1_ratio, shuffle=shuffle, max_iter=self.max_iter) + self.cuElasticNet = CD(fit_intercept=self.fit_intercept, + normalize=self.normalize, alpha=self.alpha, + l1_ratio=self.l1_ratio, shuffle=shuffle, + max_iter=self.max_iter) self.cuElasticNet.fit(X, y) self.coef_ = self.cuElasticNet.coef_ @@ -208,7 +221,6 @@ class ElasticNet: return self.cuElasticNet.predict(X) - def get_params(self, deep=True): """ Sklearn style return parameter state @@ -218,13 +230,13 @@ class ElasticNet: deep : boolean (default = True) """ params = dict() - variables = ['alpha', 'fit_intercept', 'normalize', 'max_iter', 'tol', 'selection'] + variables = ['alpha', 'fit_intercept', 'normalize', 'max_iter', 'tol', + 'selection'] for key in variables: - var_value 
= getattr(self,key,None) + var_value = getattr(self, key, None) params[key] = var_value return params - def set_params(self, **params): """ Sklearn style set parameter state to dictionary of params. @@ -235,7 +247,8 @@ class ElasticNet: """ if not params: return self - variables = ['alpha', 'fit_intercept', 'normalize', 'max_iter', 'tol', 'selection'] + variables = ['alpha', 'fit_intercept', 'normalize', 'max_iter', 'tol', + 'selection'] for key, value in params.items(): if key not in variables: raise ValueError('Invalid parameter for estimator') diff --git a/python/cuml/linear_model/lasso.pyx b/python/cuml/linear_model/lasso.pyx index cb6a3a1a86..62b1dff800 100644 --- a/python/cuml/linear_model/lasso.pyx +++ b/python/cuml/linear_model/lasso.pyx @@ -19,20 +19,20 @@ # cython: embedsignature = True # cython: language_level = 3 - -import cudf -import numpy as np from cuml.solvers import CD + class Lasso: """ - Lasso extends LinearRegression by providing L1 regularization on the coefficients when - predicting response y with a linear combination of the predictors in X. It can zero some of - the coefficients for feature selection, and improves the conditioning of the problem. + Lasso extends LinearRegression by providing L1 regularization on the + coefficients when predicting response y with a linear combination of the + predictors in X. It can zero some of the coefficients for feature + selection, and improves the conditioning of the problem. + + cuML's Lasso expects a cuDF DataFrame or NumPy matrix, and uses coordinate + descent to fit a linear model. - cuML's Lasso expects a cuDF DataFrame, uses coordinate descent to fit a linear model. - Examples --------- @@ -84,23 +84,29 @@ class Lasso: ----------- alpha : float or double Constant that multiplies the L1 term. Defaults to 1.0. - alpha = 0 is equivalent to an ordinary least square, solved by the LinearRegression object. - For numerical reasons, using alpha = 0 with the Lasso object is not advised. 
- Given this, you should use the LinearRegression object. + alpha = 0 is equivalent to an ordinary least square, solved by the + LinearRegression class. + For numerical reasons, using alpha = 0 with the Lasso class is not + advised. + Given this, you should use the LinearRegression class. fit_intercept : boolean (default = True) If True, Lasso tries to correct for the global mean of y. If False, the model expects that you have centered the data. normalize : boolean (default = False) - If True, the predictors in X will be normalized by dividing by it's L2 norm. + If True, the predictors in X will be normalized by dividing by it's L2 + norm. If False, no scaling will be done. max_iter : int The maximum number of iterations tol : float, optional - The tolerance for the optimization: if the updates are smaller than tol, - the optimization code checks the dual gap for optimality and continues until it is smaller than tol. + The tolerance for the optimization: if the updates are smaller than + tol, the optimization code checks the dual gap for optimality and + continues until it is smaller than tol. selection : str, default ‘cyclic’ - If set to ‘random’, a random coefficient is updated every iteration rather than looping over features sequentially by default. - This (setting to ‘random’) often leads to significantly faster convergence especially when tol is higher than 1e-4. + If set to ‘random’, a random coefficient is updated every iteration + rather than looping over features sequentially by default. + This (setting to ‘random’) often leads to significantly faster + convergence especially when tol is higher than 1e-4. Attributes ----------- @@ -108,11 +114,13 @@ class Lasso: The estimated coefficients for the linear regression model. intercept_ : array The independent term. If fit_intercept_ is False, will be 0. - - For additional docs, see `scikitlearn's Lasso `_. + + For additional docs, see `scikitlearn's Lasso + `_. 
""" - - def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, max_iter=1000, tol=1e-3, selection='cyclic'): + + def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, + max_iter=1000, tol=1e-3, selection='cyclic'): """ Initializes the lasso regression class. @@ -120,8 +128,8 @@ class Lasso: Parameters ---------- alpha : float or double. - fit_intercept: boolean. - normalize: boolean. + fit_intercept: boolean. + normalize: boolean. max_iter: int tol: float or double. selection : str, ‘cyclic’, or 'random' @@ -138,11 +146,11 @@ class Lasso: self.tol = tol self.culasso = None if selection in ['cyclic', 'random']: - self.selection = selection + self.selection = selection else: msg = "selection {!r} is not supported" raise TypeError(msg.format(selection)) - + self.intercept_value = 0.0 def _check_alpha(self, alpha): @@ -168,8 +176,10 @@ class Lasso: if self.selection == 'random': shuffle = True - self.culasso = CD(fit_intercept=self.fit_intercept, normalize=self.normalize, alpha=self.alpha, - l1_ratio=1.0, shuffle=shuffle, max_iter=self.max_iter) + self.culasso = CD(fit_intercept=self.fit_intercept, + normalize=self.normalize, alpha=self.alpha, + l1_ratio=1.0, shuffle=shuffle, + max_iter=self.max_iter) self.culasso.fit(X, y) self.coef_ = self.culasso.coef_ @@ -195,7 +205,6 @@ class Lasso: return self.culasso.predict(X) - def get_params(self, deep=True): """ Sklearn style return parameter state @@ -205,13 +214,13 @@ class Lasso: deep : boolean (default = True) """ params = dict() - variables = ['alpha', 'fit_intercept', 'normalize', 'max_iter', 'tol', 'selection'] + variables = ['alpha', 'fit_intercept', 'normalize', 'max_iter', 'tol', + 'selection'] for key in variables: - var_value = getattr(self,key,None) + var_value = getattr(self, key, None) params[key] = var_value return params - def set_params(self, **params): """ Sklearn style set parameter state to dictionary of params. 
@@ -222,11 +231,12 @@ class Lasso: """ if not params: return self - variables = ['alpha', 'fit_intercept', 'normalize', 'max_iter', 'tol', 'selection'] + variables = ['alpha', 'fit_intercept', 'normalize', 'max_iter', 'tol', + 'selection'] for key, value in params.items(): if key not in variables: raise ValueError('Invalid parameter for estimator') else: setattr(self, key, value) - + return self diff --git a/python/cuml/linear_model/linear_regression.pyx b/python/cuml/linear_model/linear_regression.pyx index dd34fbc6d6..02c3757cb1 100644 --- a/python/cuml/linear_model/linear_regression.pyx +++ b/python/cuml/linear_model/linear_regression.pyx @@ -70,12 +70,12 @@ cdef extern from "glm/glm.hpp" namespace "ML::GLM": class LinearRegression(Base): """ - LinearRegression is a simple machine learning model where the response y is modelled by a - linear combination of the predictors in X. + LinearRegression is a simple machine learning model where the response y is + modelled by a linear combination of the predictors in X. - cuML's LinearRegression expects either a cuDF DataFrame or a NumPy matrix and provides 2 - algorithms SVD and Eig to fit a linear model. SVD is more stable, but Eig (default) - is much more faster. + cuML's LinearRegression expects either a cuDF DataFrame or a NumPy matrix + and provides 2 algorithms SVD and Eig to fit a linear model. SVD is more + stable, but Eig (default) is much faster. 
Examples --------- @@ -89,7 +89,8 @@ class LinearRegression(Base): from cuml import LinearRegression from cuml.linear_model import LinearRegression - lr = LinearRegression(fit_intercept = True, normalize = False, algorithm = "eig") + lr = LinearRegression(fit_intercept = True, normalize = False, + algorithm = "eig") X = cudf.DataFrame() X['col1'] = np.array([1,1,2,2], dtype = np.float32) @@ -130,13 +131,15 @@ class LinearRegression(Base): Parameters ----------- algorithm : 'eig' or 'svd' (default = 'eig') - Eig uses a eigendecomposition of the covariance matrix, and is much faster. - SVD is slower, but is guaranteed to be stable. + Eig uses a eigendecomposition of the covariance matrix, and is much + faster. + SVD is slower, but guaranteed to be stable. fit_intercept : boolean (default = True) If True, LinearRegression tries to correct for the global mean of y. If False, the model expects that you have centered the data. normalize : boolean (default = False) - If True, the predictors in X will be normalized by dividing by it's L2 norm. + If True, the predictors in X will be normalized by dividing by it's + L2 norm. If False, no scaling will be done. Attributes @@ -148,23 +151,28 @@ class LinearRegression(Base): Notes ------ - LinearRegression suffers from multicollinearity (when columns are correlated with each other), - and variance explosions from outliers. Consider using Ridge Regression to fix the multicollinearity - problem,and consider maybe first DBSCAN to remove the outliers, or using leverage statistics to - filter possible outliers. + LinearRegression suffers from multicollinearity (when columns are + correlated with each other), and variance explosions from outliers. + Consider using Ridge Regression to fix the multicollinearity problem, and + consider maybe first DBSCAN to remove the outliers, or statistical analysis + to filter possible outliers. 
**Applications of LinearRegression** - LinearRegression is used in regression tasks where one wants to predict say sales or house prices. - It is also used in extrapolation or time series tasks, dynamic systems modelling and many other - machine learning tasks. This model should be first tried if the machine learning problem is a - regression task (predicting a continuous variable). + LinearRegression is used in regression tasks where one wants to predict + say sales or house prices. It is also used in extrapolation or time + series tasks, dynamic systems modelling and many other machine learning + tasks. This model should be first tried if the machine learning problem + is a regression task (predicting a continuous variable). + + For additional docs, see `scikitlearn's OLS + `_. + + For an additional example see `the OLS notebook + `_. - For additional docs, see `scikitlearn's OLS `_. - """ - # For an additional example see `the OLS notebook `_. - # New link: https://github.com/rapidsai/cuml/blob/master/python/notebooks/linear_regression_demo.ipynb + """ def __init__(self, algorithm='eig', fit_intercept=True, normalize=False): @@ -173,9 +181,12 @@ class LinearRegression(Base): Parameters ---------- - algorithm : Type: string. 'eig' (default) and 'svd' are supported algorithms. - fit_intercept: boolean. For more information, see `scikitlearn's OLS `_. - normalize: boolean. For more information, see `scikitlearn's OLS `_. + algorithm : Type: string. 'eig' (default) and 'svd' are supported + algorithms. + fit_intercept: boolean. For more information, see `scikitlearn's OLS + `_. + normalize: boolean. For more information, see `scikitlearn's OLS + `_. """ self.coef_ = None @@ -237,7 +248,10 @@ class LinearRegression(Base): raise TypeError(msg) if self.n_cols == 1: - self.algo = 0 # eig based method doesn't work when there is only one column. + # TODO: Throw algorithm when this changes algorithm from the user's + # choice. 
Github issue #602 + # eig based method doesn't work when there is only one column. + self.algo = 0 X_ptr = self._get_dev_array_ptr(X_m) @@ -259,31 +273,30 @@ class LinearRegression(Base): if self.gdf_datatype.type == np.float32: olsFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept1, - self.fit_intercept, - self.normalize, - self.algo) + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept1, + self.fit_intercept, + self.normalize, + self.algo) self.intercept_ = c_intercept1 else: olsFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept2, - self.fit_intercept, - self.normalize, - self.algo) + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept2, + self.fit_intercept, + self.normalize, + self.algo) self.intercept_ = c_intercept2 return self - def predict(self, X): """ Predicts the y for X. @@ -325,24 +338,23 @@ class LinearRegression(Base): if pred_datatype.type == np.float32: olsPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr) + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr) else: olsPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr) + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr) del(X_m) return preds - def get_params(self, deep=True): """ Sklearn style return parameter state @@ -352,13 +364,12 @@ class LinearRegression(Base): deep : boolean (default = True) """ params = dict() - variables = ['algorithm','fit_intercept','normalize'] + variables = ['algorithm', 'fit_intercept', 'normalize'] for key in variables: - var_value = getattr(self,key,None) + var_value = getattr(self, key, None) params[key] = var_value return params - def set_params(self, **params): """ Sklearn style set parameter state to dictionary of params. 
@@ -369,7 +380,7 @@ class LinearRegression(Base): """ if not params: return self - variables = ['algorithm','fit_intercept','normalize'] + variables = ['algorithm', 'fit_intercept', 'normalize'] for key, value in params.items(): if key not in variables: raise ValueError('Invalid parameter %s for estimator') diff --git a/python/cuml/linear_model/linear_regression_mg.pyx b/python/cuml/linear_model/linear_regression_mg.pyx index db75b67e6e..396a4648fa 100644 --- a/python/cuml/linear_model/linear_regression_mg.pyx +++ b/python/cuml/linear_model/linear_regression_mg.pyx @@ -126,7 +126,7 @@ class LinearRegressionMG: """ Single Process, Multi-GPU Linear Regression - For using with Numpy: + For using with Numpy, assuming 2 GPUs: .. code-block:: python @@ -135,26 +135,29 @@ class LinearRegressionMG: X = np.array([ - [ 1.0, 11.0, 21.0, 31.0, 41.0, 51.0, 1.0, 11.0, 21.0, 31.0, 41.0, 51.0], - [ 2.0, 12.0, 22.0, 32.0, 42.0, 52.0, 2.0, 12.0, 22.0, 32.0, 42.0, 52.0], - [ 3.0, 13.0, 23.0, 33.0, 43.0, 53.0, 3.0, 13.0, 23.0, 33.0, 43.0, 53.0], - [ 4.0, 14.0, 24.0, 34.0, 44.0, 54.0, 4.0, 14.0, 24.0, 34.0, 44.0, 54.0], - [ 1.0, 11.0, 21.0, 31.0, 41.0, 51.0, 1.0, 11.0, 21.0, 31.0, 41.0, 51.0], - [ 2.0, 12.0, 22.0, 32.0, 42.0, 52.0, 2.0, 12.0, 22.0, 32.0, 42.0, 52.0], - [ 3.0, 13.0, 23.0, 33.0, 43.0, 53.0, 3.0, 13.0, 23.0, 33.0, 43.0, 53.0], - [ 4.0, 14.0, 24.0, 34.0, 44.0, 54.0, 4.0, 14.0, 24.0, 34.0, 44.0, 54.0], - [ 1.0, 11.0, 21.0, 31.0, 41.0, 51.0, 1.0, 11.0, 21.0, 31.0, 41.0, 51.0], - [ 2.0, 12.0, 22.0, 32.0, 42.0, 52.0, 2.0, 12.0, 22.0, 32.0, 42.0, 52.0], - [ 3.0, 13.0, 23.0, 33.0, 43.0, 53.0, 3.0, 13.0, 23.0, 33.0, 43.0, 53.0], - [ 4.0, 14.0, 24.0, 34.0, 44.0, 54.0, 4.0, 14.0, 24.0, 34.0, 44.0, 54.0], - [ 1.0, 11.0, 21.0, 31.0, 41.0, 51.0, 1.0, 11.0, 21.0, 31.0, 41.0, 51.0], - [ 2.0, 12.0, 22.0, 32.0, 42.0, 52.0, 2.0, 12.0, 22.0, 32.0, 42.0, 52.0], - [ 3.0, 13.0, 23.0, 33.0, 43.0, 53.0, 3.0, 13.0, 23.0, 33.0, 43.0, 53.0], - [ 4.0, 14.0, 24.0, 34.0, 44.0, 54.0, 4.0, 14.0, 24.0, 
34.0, 44.0, 54.0]], dtype=np.float32) - - - - y = np.array([60.0, 61.0, 62.0, 63.0, 60.0, 61.0, 62.0, 63.0, 60.0, 61.0, 62.0, 63.0, 60.0, 61.0, 62.0, 63.0], dtype=np.float32) + [1.0, 11.0, 21.0, 31.0, 41.0, 51.0, 1.0, 11.0, 21.0, 31.0, 41.0, 51.0], + [2.0, 12.0, 22.0, 32.0, 42.0, 52.0, 2.0, 12.0, 22.0, 32.0, 42.0, 52.0], + [3.0, 13.0, 23.0, 33.0, 43.0, 53.0, 3.0, 13.0, 23.0, 33.0, 43.0, 53.0], + [4.0, 14.0, 24.0, 34.0, 44.0, 54.0, 4.0, 14.0, 24.0, 34.0, 44.0, 54.0], + [1.0, 11.0, 21.0, 31.0, 41.0, 51.0, 1.0, 11.0, 21.0, 31.0, 41.0, 51.0], + [2.0, 12.0, 22.0, 32.0, 42.0, 52.0, 2.0, 12.0, 22.0, 32.0, 42.0, 52.0], + [3.0, 13.0, 23.0, 33.0, 43.0, 53.0, 3.0, 13.0, 23.0, 33.0, 43.0, 53.0], + [4.0, 14.0, 24.0, 34.0, 44.0, 54.0, 4.0, 14.0, 24.0, 34.0, 44.0, 54.0], + [1.0, 11.0, 21.0, 31.0, 41.0, 51.0, 1.0, 11.0, 21.0, 31.0, 41.0, 51.0], + [2.0, 12.0, 22.0, 32.0, 42.0, 52.0, 2.0, 12.0, 22.0, 32.0, 42.0, 52.0], + [3.0, 13.0, 23.0, 33.0, 43.0, 53.0, 3.0, 13.0, 23.0, 33.0, 43.0, 53.0], + [4.0, 14.0, 24.0, 34.0, 44.0, 54.0, 4.0, 14.0, 24.0, 34.0, 44.0, 54.0], + [1.0, 11.0, 21.0, 31.0, 41.0, 51.0, 1.0, 11.0, 21.0, 31.0, 41.0, 51.0], + [2.0, 12.0, 22.0, 32.0, 42.0, 52.0, 2.0, 12.0, 22.0, 32.0, 42.0, 52.0], + [3.0, 13.0, 23.0, 33.0, 43.0, 53.0, 3.0, 13.0, 23.0, 33.0, 43.0, 53.0], + [4.0, 14.0, 24.0, 34.0, 44.0, 54.0, 4.0, 14.0, 24.0, 34.0, 44.0, 54.0] + ], dtype=np.float32) + + + + y = np.array([60.0, 61.0, 62.0, 63.0, 60.0, 61.0, 62.0, 63.0, 60.0, + 61.0, 62.0, 63.0, 60.0, 61.0, 62.0, 63.0], + dtype=np.float32) lr = LinearRegression() @@ -163,7 +166,6 @@ class LinearRegressionMG: To use with Dask, please see the LinearRegression in dask-cuml. - """ def __init__(self, algorithm='eig', fit_intercept=True, normalize=False): @@ -173,9 +175,12 @@ class LinearRegressionMG: Parameters ---------- - algorithm : Type: string. 'eig' (default) and 'svd' are supported algorithms. - fit_intercept: boolean. For more information, see `scikitlearn's OLS `_. - normalize: boolean. 
For more information, see `scikitlearn's OLS `_. + algorithm : Type: string. 'eig' (default) and 'svd' are supported + algorithms. + fit_intercept: boolean. For more information, see `scikitlearn's OLS + `_. + normalize: boolean. For more information, see `scikitlearn's OLS + `_. """ self.coef_ = None @@ -218,7 +223,8 @@ class LinearRegressionMG: Dense vector (floats or doubles) of shape (n_samples, 1) n_gpus : int - Number of gpus to be used for prediction. If gpu_ids parameter has more than element, this parameter is ignored. + Number of gpus to be used for prediction. If gpu_ids parameter + has more than element, this parameter is ignored. gpu_ids: int array GPU ids to be used for prediction. @@ -245,7 +251,8 @@ class LinearRegressionMG: Dense matrix (floats or doubles) of shape (n_samples, n_features) n_gpus : int - Number of gpus to be used for prediction. If gpu_ids parameter has more than element, this parameter is ignored. + Number of gpus to be used for prediction. If gpu_ids parameter + has more than element, this parameter is ignored. gpu_ids: int array GPU ids to be used for prediction. @@ -267,17 +274,18 @@ class LinearRegressionMG: raise ValueError('Number of GPUS should be 2 or more' 'For single GPU, use the normal LinearRegression') - def _fit_spmg(self, X, y, gpu_ids): # Using numpy ctypes pointer to avoid cimport numpy for abi issues # Future improvement change saving this coefs as distributed in gpus if (not isinstance(X, np.ndarray)): - msg = "X matrix must be a Numpy ndarray. Dask will be supported in the next version." + msg = "X matrix must be a Numpy ndarray." \ + " Dask will be supported in the next version." raise TypeError(msg) if (not isinstance(y, np.ndarray)): - msg = "y matrix must be a Numpy ndarray. Dask will be supported in the next version." + msg = "y matrix must be a Numpy ndarray." \ + " Dask will be supported in the next version." 
raise TypeError(msg) n_gpus = len(gpu_ids) @@ -353,7 +361,6 @@ class LinearRegressionMG: return self - def _predict_spmg(self, X, gpu_ids): n_gpus = len(gpu_ids) @@ -389,7 +396,7 @@ class LinearRegressionMG: n_rows, n_cols, coef_ptr, - self.intercept_, + self.intercept_, pred_ptr, gpu_ids_ptr, n_gpus) @@ -399,14 +406,13 @@ class LinearRegressionMG: n_rows, n_cols, coef_ptr, - self.intercept_, + self.intercept_, pred_ptr, gpu_ids_ptr, n_gpus) return pred - def _fit_mg(self, alloc_info, params): self.fit_intercept = params['fit_intercept'] @@ -469,17 +475,17 @@ class LinearRegressionMG: idx = idx + 1 spmgOlsFit( input32, - input_cols, - n_rows, - n_cols, - labels32, - label_rows, - coef32, - coef_cols, - &intercept_f32, - self.fit_intercept, - self.normalize, - n_allocs) + input_cols, + n_rows, + n_cols, + labels32, + label_rows, + coef32, + coef_cols, + &intercept_f32, + self.fit_intercept, + self.normalize, + n_allocs) return intercept_f32 @@ -507,21 +513,20 @@ class LinearRegressionMG: idx = idx + 1 spmgOlsFit( input64, - input_cols, - n_rows, - n_cols, - labels64, - label_rows, - coef64, - coef_cols, - &intercept_f64, - self.fit_intercept, - self.normalize, - n_allocs) + input_cols, + n_rows, + n_cols, + labels64, + label_rows, + coef64, + coef_cols, + &intercept_f64, + self.fit_intercept, + self.normalize, + n_allocs) return intercept_f64 - def _predict_mg(self, alloc_info, intercept, params): self.fit_intercept = params['fit_intercept'] @@ -580,15 +585,15 @@ class LinearRegressionMG: idx = idx + 1 spmgOlsPredict(input32, - input_cols, - n_rows, - n_cols, - coef32, - coef_cols, - intercept, - pred32, - pred_rows, - n_allocs) + input_cols, + n_rows, + n_cols, + coef32, + coef_cols, + intercept, + pred32, + pred_rows, + n_allocs) else: @@ -614,12 +619,12 @@ class LinearRegressionMG: idx = idx + 1 spmgOlsPredict(input64, - input_cols, - n_rows, - n_cols, - coef64, - coef_cols, - intercept, - pred64, - pred_rows, - n_allocs) + input_cols, + n_rows, + n_cols, + 
coef64, + coef_cols, + intercept, + pred64, + pred_rows, + n_allocs) diff --git a/python/cuml/linear_model/ridge.pyx b/python/cuml/linear_model/ridge.pyx index a45f58d7c4..9b0d971ad2 100644 --- a/python/cuml/linear_model/ridge.pyx +++ b/python/cuml/linear_model/ridge.pyx @@ -76,13 +76,14 @@ cdef extern from "glm/glm.hpp" namespace "ML::GLM": class Ridge(Base, RegressorMixin): """ - Ridge extends LinearRegression by providing L2 regularization on the coefficients when - predicting response y with a linear combination of the predictors in X. It can reduce - the variance of the predictors, and improves the conditioning of the problem. + Ridge extends LinearRegression by providing L2 regularization on the + coefficients when predicting response y with a linear combination of the + predictors in X. It can reduce the variance of the predictors, and improves + the conditioning of the problem. - cuML's Ridge expects a cuDF DataFrame, and provides 3 algorithms SVD, Eig and CD to - fit a linear model. SVD is more stable, but Eig (default) is much more faster. CD uses - Coordinate Descent and can be faster if the data is large. + cuML's Ridge expects a cuDF DataFrame, and provides 3 algorithms SVD, Eig + and CD to fit a linear model. SVD is more stable, but Eig (default) is much + faster. CD uses Coordinate Descent and can be faster when data is large. Examples --------- @@ -97,7 +98,8 @@ class Ridge(Base, RegressorMixin): from cuml.linear_model import Ridge alpha = np.array([1.0]) - ridge = Ridge(alpha = alpha, fit_intercept = True, normalize = False, solver = "eig") + ridge = Ridge(alpha = alpha, fit_intercept = True, normalize = False, + solver = "eig") X = cudf.DataFrame() X['col1'] = np.array([1,1,2,2], dtype = np.float32) @@ -138,17 +140,20 @@ class Ridge(Base, RegressorMixin): Parameters ----------- alpha : float or double - Regularization strength - must be a positive float. Larger values specify - stronger regularization. Array input will be supported later. 
+ Regularization strength - must be a positive float. Larger values + specify stronger regularization. Array input will be supported later. solver : 'eig' or 'svd' or 'cd' (default = 'eig') - Eig uses a eigendecomposition of the covariance matrix, and is much faster. - SVD is slower, but is guaranteed to be stable. - CD or Coordinate Descent is very fast and is suitable for large problems. + Eig uses an eigendecomposition of the covariance matrix, and is much + faster. + SVD is slower, but guaranteed to be stable. + CD or Coordinate Descent is very fast and is suitable for large + problems. fit_intercept : boolean (default = True) If True, Ridge tries to correct for the global mean of y. If False, the model expects that you have centered the data. normalize : boolean (default = False) - If True, the predictors in X will be normalized by dividing by it's L2 norm. + If True, the predictors in X will be normalized by dividing by its L2 + norm. If False, no scaling will be done. Attributes @@ -160,34 +165,37 @@ class Ridge(Base, RegressorMixin): Notes ------ - Ridge provides L2 regularization. This means that the coefficients can shrink to become - very very small, but not zero. This can cause issues of interpretabiliy on the coefficients. + Ridge provides L2 regularization. This means that the coefficients can + shrink to become very small, but not zero. This can cause issues of + interpretability on the coefficients. Consider using Lasso, or thresholding small coefficients to zero. **Applications of Ridge** - Ridge Regression is used in the same way as LinearRegression, but is used more frequently - as it does not suffer from multicollinearity issues. Ridge is used in insurance premium - prediction, stock market analysis and much more. + Ridge Regression is used in the same way as LinearRegression, but is + used frequently as it does not suffer from multicollinearity issues. + Ridge is used in insurance premium prediction, stock market analysis + and much more.
- For additional docs, see `scikitlearn's Ridge `_. + For additional docs, see `scikitlearn's Ridge + `_. """ - # Link will work later - # For an additional example see `the Ridge notebook `_. - # New link : https://github.com/rapidsai/notebooks/blob/master/cuml/ridge_regression_demo.ipynb - - def __init__(self, alpha=1.0, solver='eig', fit_intercept=True, normalize=False): + def __init__(self, alpha=1.0, solver='eig', fit_intercept=True, + normalize=False): """ Initializes the linear ridge regression class. Parameters ---------- - solver : Type: string. 'eig' (default) and 'svd' are supported algorithms. - fit_intercept: boolean. For more information, see `scikitlearn's OLS `_. - normalize: boolean. For more information, see `scikitlearn's OLS `_. + solver : Type: string. 'eig' (default) and 'svd' are supported + algorithms. + fit_intercept: boolean. For more information, see `scikitlearn's OLS + `_. + normalize: boolean. For more information, see `scikitlearn's OLS + `_. """ # self._check_alpha(alpha) @@ -227,7 +235,6 @@ class Ridge(Base, RegressorMixin): def _get_column_ptr(self, obj): return self._get_ctype_ptr(obj._column._data.to_gpu_array()) - def fit(self, X, y): """ Fit the model with X and y. @@ -267,7 +274,9 @@ class Ridge(Base, RegressorMixin): raise TypeError(msg) if self.n_cols == 1: - self.algo = 0 # eig based method doesn't work when there is only one column. + # TODO: Throw algorithm when this changes algorithm from the user's + # choice. 
Github issue #602 + self.algo = 0 X_ptr = self._get_dev_array_ptr(X_m) @@ -283,7 +292,8 @@ class Ridge(Base, RegressorMixin): self.n_alpha = 1 - self.coef_ = cudf.Series(np.zeros(self.n_cols, dtype=self.gdf_datatype)) + self.coef_ = cudf.Series(np.zeros(self.n_cols, + dtype=self.gdf_datatype)) cdef uintptr_t coef_ptr = self._get_column_ptr(self.coef_) cdef float c_intercept1 @@ -293,31 +303,31 @@ class Ridge(Base, RegressorMixin): if self.gdf_datatype.type == np.float32: c_alpha1 = self.alpha ridgeFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - &c_alpha1, - self.n_alpha, - coef_ptr, - &c_intercept1, - self.fit_intercept, - self.normalize, - self.algo) + self.n_rows, + self.n_cols, + y_ptr, + &c_alpha1, + self.n_alpha, + coef_ptr, + &c_intercept1, + self.fit_intercept, + self.normalize, + self.algo) self.intercept_ = c_intercept1 else: c_alpha2 = self.alpha ridgeFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - &c_alpha2, - self.n_alpha, - coef_ptr, - &c_intercept2, - self.fit_intercept, - self.normalize, - self.algo) + self.n_rows, + self.n_cols, + y_ptr, + &c_alpha2, + self.n_alpha, + coef_ptr, + &c_intercept2, + self.fit_intercept, + self.normalize, + self.algo) self.intercept_ = c_intercept2 @@ -364,24 +374,23 @@ class Ridge(Base, RegressorMixin): if pred_datatype.type == np.float32: ridgePredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr) + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr) else: ridgePredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr) + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr) del(X_m) return preds - def get_params(self, deep=True): """ Sklearn style return parameter state @@ -397,7 +406,6 @@ class Ridge(Base, RegressorMixin): params[key] = var_value return params - def set_params(self, **params): """ Sklearn style set parameter state to dictionary of params. 
@@ -415,5 +423,5 @@ class Ridge(Base, RegressorMixin): else: setattr(self, key, value) if 'solver' in params.keys(): - self.algo=self._get_algorithm_int(self.solver) + self.algo = self._get_algorithm_int(self.solver) return self From d2e78a1a505c2f01bb658c2e4505a807fac01658 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 15:04:04 -0500 Subject: [PATCH 106/156] FIX variable name in kmeans --- python/cuml/cluster/kmeans.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/cluster/kmeans.pyx b/python/cuml/cluster/kmeans.pyx index b1a0749393..8dc98bd0e3 100644 --- a/python/cuml/cluster/kmeans.pyx +++ b/python/cuml/cluster/kmeans.pyx @@ -403,7 +403,7 @@ class KMeans(Base): ' passed.') self.handle.sync() - cluster_centers_gdf = cudf.DataFrame() + cc_df = cudf.DataFrame() for i in range(0, self.n_cols): n_c = self.n_cluster n_cols = self.n_cols From 7ff9835080d3d15371e69c92d44f068f1bda57de Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 15:19:17 -0500 Subject: [PATCH 107/156] FIX manifold methods pep8 changes --- python/cuml/manifold/umap.pyx | 87 +++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 39 deletions(-) diff --git a/python/cuml/manifold/umap.pyx b/python/cuml/manifold/umap.pyx index e5531073af..f75576e525 100644 --- a/python/cuml/manifold/umap.pyx +++ b/python/cuml/manifold/umap.pyx @@ -124,8 +124,9 @@ cdef class UMAP: relative to the ``spread`` value, which determines the scale at which embedded points will be spread out. spread: float (optional, default 1.0) - The effective scale of embedded points. In combination with ``min_dist`` - this determines how clustered/clumped the embedded points are. + The effective scale of embedded points. In combination with + ``min_dist`` this determines how clustered/clumped the embedded + points are. 
set_op_mix_ratio: float (optional, default 1.0) Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy @@ -185,7 +186,8 @@ cdef class UMAP: References ---------- * Leland McInnes, John Healy, James Melville - UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction + UMAP: Uniform Manifold Approximation and Projection for Dimension + Reduction https://arxiv.org/abs/1802.03426 """ @@ -214,13 +216,13 @@ cdef class UMAP: negative_sample_rate=5, transform_queue_size=4.0, init="spectral", - verbose = False, - a = None, - b = None, - target_n_neighbors = -1, - target_weights = 0.5, - target_metric = "euclidean", - should_downcast = True): + verbose=False, + a=None, + b=None, + target_n_neighbors=-1, + target_weights=0.5, + target_metric="euclidean", + should_downcast=True): self.umap_params = new UMAPParams() @@ -236,7 +238,7 @@ cdef class UMAP: elif(init == "random"): self.umap_params.init = 0 else: - raise Exception("Initialization strategy not support: [init=%d]" % init) + raise Exception("Initialization strategy not supported: %d" % init) if a is not None: self.umap_params.a = a @@ -267,7 +269,6 @@ cdef class UMAP: self.umap = new UMAP_API(self.umap_params) - def __dealloc__(self): del self.umap_params del self.umap @@ -280,17 +281,22 @@ cdef class UMAP: if dtype != np.float32: if self._should_downcast: - new_cols = [(col,X._cols[col].astype(np.float32)) for col in X._cols] - overflowed = sum([len(colval[colval >= np.inf]) for colname, colval in new_cols]) + new_cols = [(col, X._cols[col].astype(np.float32)) + for col in X._cols] + overflowed = sum([len(colval[colval >= np.inf]) + for colname, colval in new_cols]) if overflowed > 0: - raise Exception("Downcast to single-precision resulted in data loss.") + raise Exception("Downcast to single-precision resulted" + " in data loss.") X = cudf.DataFrame(new_cols) else: - raise Exception("Input is double precision. 
Use 'should_downcast=True' " - "if you'd like it to be automatically casted to single precision.") + raise Exception("Input is double precision. Use " + "'should_downcast=True' " + "if you'd like it to be automatically " + "casted to single precision.") X = numba_utils.row_matrix(X) elif isinstance(X, np.ndarray): @@ -300,11 +306,14 @@ cdef class UMAP: if self._should_downcast: X = X.astype(np.float32) if len(X[X == np.inf]) > 0: - raise Exception("Downcast to single-precision resulted in data loss.") + raise Exception("Downcast to single-precision resulted" + " in data loss.") else: - raise Exception("Input is double precision. Use 'should_downcast=True' " - "if you'd like it to be automatically casted to single precision.") + raise Exception("Input is double precision. Use" + " 'should_downcast=True' " + "if you'd like it to be automatically " + "casted to single precision.") X = cuda.to_device(X) else: @@ -312,8 +321,7 @@ cdef class UMAP: return X - - def fit(self, X, y = None): + def fit(self, X, y=None): """Fit X into an embedded space. Parameters ---------- @@ -324,7 +332,7 @@ cdef class UMAP: """ assert len(X.shape) == 2, 'data should be two dimensional' - assert X.shape[0] > 1, 'need more than 1 sample to build nearest neighbors graph' + assert X.shape[0] > 1, 'need more than 1 sample to build nearest neighbors graph' # noqa E501 self.umap_params.n_neighbors = min(X.shape[0], self.umap_params.n_neighbors) @@ -336,8 +344,8 @@ cdef class UMAP: self.raw_data = X_m.device_ctypes_pointer.value self.arr_embed = cuda.to_device(np.zeros((X_m.shape[0], - self.umap_params.n_components), - order = "C", dtype=np.float32)) + self.umap_params.n_components), + order="C", dtype=np.float32)) self.embeddings = self.arr_embed.device_ctypes_pointer.value cdef uintptr_t y_raw @@ -363,7 +371,7 @@ cdef class UMAP: del X_m - def fit_transform(self, X, y = None): + def fit_transform(self, X, y=None): """Fit X into an embedded space and return that transformed output. 
Parameters @@ -380,19 +388,19 @@ cdef class UMAP: if isinstance(X, cudf.DataFrame): ret = cudf.DataFrame() for i in range(0, self.arr_embed.shape[1]): - ret[str(i)] = self.arr_embed[:,i] + ret[str(i)] = self.arr_embed[:, i] elif isinstance(X, np.ndarray): ret = np.asarray(self.arr_embed) return ret - def transform(self, X): """Transform X into the existing embedded space and return that transformed output. Please refer to the reference UMAP implementation for information - on the differences between fit_transform() and running fit() transform(). + on the differences between fit_transform() and running fit() + transform(). Specifically, the transform() function is stochastic: https://github.com/lmcinnes/umap/issues/158 @@ -408,29 +416,30 @@ cdef class UMAP: """ assert len(X.shape) == 2, 'data should be two dimensional' - assert X.shape[0] > 1, 'need more than 1 sample to build nearest neighbors graph' - assert X.shape[1] == self.n_dims, "n_features of X must match n_features of training data" + assert X.shape[0] > 1, 'need more than 1 sample to build nearest neighbors graph' # noqa E501 + assert X.shape[1] == self.n_dims, "n_features of X must match n_features of training data" # noqa E501 X_m = self._downcast(X) cdef uintptr_t x_ptr = X_m.device_ctypes_pointer.value - embedding = cuda.to_device(np.zeros((X_m.shape[0], self.umap_params.n_components), - order = "C", dtype=np.float32)) + embedding = cuda.to_device(np.zeros((X_m.shape[0], + self.umap_params.n_components), + order="C", dtype=np.float32)) cdef uintptr_t embed_ptr = embedding.device_ctypes_pointer.value self.umap.transform( x_ptr, - X_m.shape[0], - X_m.shape[1], - self.embeddings, - self.arr_embed.shape[0], - embed_ptr) + X_m.shape[0], + X_m.shape[1], + self.embeddings, + self.arr_embed.shape[0], + embed_ptr) if isinstance(X, cudf.DataFrame): ret = cudf.DataFrame() for i in range(0, embedding.shape[1]): - ret[str(i)] = embedding[:,i] + ret[str(i)] = embedding[:, i] elif isinstance(X, np.ndarray): ret = 
np.asarray(embedding) From 0945cdc2838982f428f6e336f883775dbef941c1 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 15:20:14 -0500 Subject: [PATCH 108/156] FIX metrics methods pep8 changes --- python/cuml/metrics/regression.pyx | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/python/cuml/metrics/regression.pyx b/python/cuml/metrics/regression.pyx index 69165f469f..fd7eba7a52 100644 --- a/python/cuml/metrics/regression.pyx +++ b/python/cuml/metrics/regression.pyx @@ -41,25 +41,18 @@ def r2_score(y, y_hat, handle=None): if y.dtype == 'float32': result_f32 = regression.r2_score_py(handle_[0], - y_ptr, - y_hat_ptr, - n) + y_ptr, + y_hat_ptr, + n) result = result_f32 else: result_f64 = regression.r2_score_py(handle_[0], - y_ptr, - y_hat_ptr, - n) + y_ptr, + y_hat_ptr, + n) result = result_f64 - return result - - - - - - From 50ea4b5e080c6e66a41c298b84b7a2b785f8aa36 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 15:29:47 -0500 Subject: [PATCH 109/156] FIX neighbors methods pep8 changes --- python/cuml/neighbors/nearest_neighbors.pyx | 136 +++++++++++--------- 1 file changed, 76 insertions(+), 60 deletions(-) diff --git a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx index e70b87b014..f17364cc93 100644 --- a/python/cuml/neighbors/nearest_neighbors.pyx +++ b/python/cuml/neighbors/nearest_neighbors.pyx @@ -62,15 +62,17 @@ cdef extern from "knn/knn.h" namespace "ML": cdef class NearestNeighbors: """ - NearestNeighbors is a unsupervised algorithm where if one wants to find the "closest" - datapoint(s) to new unseen data, one can calculate a suitable "distance" between - each and every point, and return the top K datapoints which have the smallest distance to it. 
- - cuML's KNN expects a cuDF DataFrame or a Numpy Array (where automatic chunking will be done - in to a Numpy Array in a future release), and fits a special data structure first to - approximate the distance calculations, allowing our querying times to be O(plogn) - and not the brute force O(np) [where p = no(features)]: - + NearestNeighbors is an unsupervised algorithm where if one wants to find the + "closest" datapoint(s) to new unseen data, one can calculate a suitable + "distance" between each and every point, and return the top K datapoints + which have the smallest distance to it. + + cuML's KNN expects a cuDF DataFrame or a Numpy Array (where automatic + chunking will be done in to a Numpy Array in a future release), and fits a + special data structure first to approximate the distance calculations, + allowing our querying times to be O(plogn) and not the brute force O(np) + [where p = no(features)]: + Examples --------- .. code-block:: python @@ -95,7 +97,8 @@ cdef class NearestNeighbors: nn_float = NearestNeighbors() nn_float.fit(gdf_float) - distances,indices = nn_float.kneighbors(gdf_float,k=3) #get 3 nearest neighbors + # get 3 nearest neighbors + distances,indices = nn_float.kneighbors(gdf_float,k=3) print(indices) print(distances) @@ -134,23 +137,24 @@ cdef class NearestNeighbors: Parameters ---------- n_neighbors: int (default = 5) - The top K closest datapoints you want the algorithm to return. If this number is large, - then expect the algorithm to run slower. + The top K closest datapoints you want the algorithm to return. + If this number is large, then expect the algorithm to run slower. should_downcast : bool (default = False) - Currently only single precision is supported in the underlying undex. Setting this to - true will allow single-precision input arrays to be automatically downcasted to single - precision. Default = False. - + Currently only single precision is supported in the underlying index.
+ Setting this to true will allow double-precision input arrays to be + automatically downcasted to single precision. + Notes ------ - NearestNeighbors is a generative model. This means the data X has to be stored in order - for inference to occur. - + NearestNeighbors is a generative model. This means the data X has to be + stored in order for inference to occur. + **Applications of NearestNeighbors** - - Applications of NearestNeighbors include recommendation systems where content or colloborative - filtering is used. Since NearestNeighbors is a relatively simple generative model, it is also - used in data visualization and regression / classification tasks. + + Applications of NearestNeighbors include recommendation systems where + content or collaborative filtering is used. Since NearestNeighbors is a + relatively simple generative model, it is also used in data + visualization and regression / classification tasks. For an additional example see `the NearestNeighbors notebook `_. @@ -158,7 +162,7 @@ cdef class NearestNeighbors: For additional docs, see `scikitlearn's NearestNeighbors `_. """ - + cpdef kNN *k cdef int num_gpus @@ -179,16 +183,17 @@ cdef class NearestNeighbors: cpdef kNNParams *input - def __cinit__(self, n_neighbors = 5, n_gpus = 1, devices = None, verbose = False, should_downcast = True): + def __cinit__(self, n_neighbors=5, n_gpus=1, devices=None, verbose=False, + should_downcast=True): """ Construct the NearestNeighbors object for training and querying. Parameters ---------- - should_downcast: Bool - Currently only single precision is supported in the underlying undex. Setting this to - true will allow single-precision input arrays to be automatically downcasted to single - precision. Default = False. + should_downcast: bool (default = True) + Currently only single precision is supported in the underlying + index. Setting this to true will allow double-precision input + arrays to be automatically downcasted to single precision.
""" self._verbose = verbose self.n_gpus = n_gpus @@ -214,7 +219,6 @@ cdef class NearestNeighbors: def _get_gdf_as_matrix_ptr(self, gdf): return self._get_ctype_ptr(gdf.as_gpu_matrix()) - def _downcast(self, X): if isinstance(X, cudf.DataFrame): @@ -223,17 +227,22 @@ cdef class NearestNeighbors: if dtype != np.float32: if self._should_downcast: - new_cols = [(col,X._cols[col].astype(np.float32)) for col in X._cols] - overflowed = sum([len(colval[colval >= np.inf]) for colname, colval in new_cols]) + new_cols = [(col, X._cols[col].astype(np.float32)) + for col in X._cols] + overflowed = sum([len(colval[colval >= np.inf]) + for colname, colval in new_cols]) if overflowed > 0: - raise Exception("Downcast to single-precision resulted in data loss.") + raise Exception("Downcast to single-precision resulted" + "in data loss.") X = cudf.DataFrame(new_cols) else: - raise Exception("Input is double precision. Use 'should_downcast=True' " - "if you'd like it to be automatically casted to single precision.") + raise Exception("Input is double precision. Use" + " 'should_downcast=True' " + "if you'd like it to be automatically" + " casted to single precision.") X_m = numba_utils.row_matrix(X) @@ -244,10 +253,13 @@ cdef class NearestNeighbors: if self._should_downcast: X = np.ascontiguousarray(X.astype(np.float32)) if len(X[X == np.inf]) > 0: - raise Exception("Downcast to single-precision resulted in data loss.") + raise Exception("Downcast to single-precision resulted" + " in data loss.") else: - raise Exception("Input is double precision. Use 'should_downcast=True' " - "if you'd like it to be automatically casted to single precision.") + raise Exception("Input is double precision. Use" + " 'should_downcast=True' " + "if you'd like it to be automatically" + " casted to single precision.") X_m = cuda.to_device(X) else: @@ -255,7 +267,6 @@ cdef class NearestNeighbors: return X_m - def fit(self, X): """ Fit GPU index for performing nearest neighbor queries. 
@@ -280,12 +291,15 @@ cdef class NearestNeighbors: if X.dtype != np.float32: if self._should_downcast: X = np.ascontiguousarray(X, np.float32) - if len(X[X==np.inf]) > 0: - raise Exception("Downcast to single-precision resulted in data loss.") + if len(X[X == np.inf]) > 0: + raise Exception("Downcast to single-precision resulted" + " in data loss.") else: - raise Exception("Only single precision floating point is supported for this" - "algorithm. Use 'should_downcast=True' if you'd like it to " - "be automatically casted to single precision.") + raise Exception("Only single precision floating point is" + " supported for this algorithm. Use " + "'should_downcast=True' if you'd like it " + "to be automatically casted to single " + "precision.") sys_devices = set([d.id for d in cuda.gpus]) @@ -300,7 +314,8 @@ cdef class NearestNeighbors: n_gpus = min(self.n_gpus, len(sys_devices)) final_devices = list(sys_devices)[:n_gpus] - final_devices = np.ascontiguousarray(np.array(final_devices), np.int32) + final_devices = np.ascontiguousarray(np.array(final_devices), + np.int32) X_ctype = X.ctypes.data dev_ptr = final_devices.ctypes.data @@ -328,9 +343,9 @@ cdef class NearestNeighbors: def _fit_mg(self, n_dims, alloc_info): """ - Fits a model using multiple GPUs. This method takes in a list of dict objects - representing the distribution of the underlying device pointers. The device - information can be extracted from the pointers. + Fits a model using multiple GPUs. This method takes in a list of dict + objects representing the distribution of the underlying device + pointers. The device information can be extracted from the pointers. 
:param n_dims the number of features for each vector @@ -342,10 +357,10 @@ cdef class NearestNeighbors: if self.k != NULL: del self.k - self.k = new kNN(n_dims, verbose = self._verbose) + self.k = new kNN(n_dims, verbose=self._verbose) del self.input - self.input = < kNNParams * > malloc(len(alloc_info) * sizeof(kNNParams)) + self.input = malloc(len(alloc_info) * sizeof(kNNParams)) cdef uintptr_t input_ptr for i in range(len(alloc_info)): @@ -360,8 +375,7 @@ cdef class NearestNeighbors: self.k.fit( < kNNParams * > self.input, < int > len(alloc_info)) - - def kneighbors(self, X, k = None): + def kneighbors(self, X, k=None): """ Query the GPU index for the k nearest neighbors of row vectors in X. @@ -376,7 +390,8 @@ cdef class NearestNeighbors: Returns ---------- distances: cuDF DataFrame or numpy ndarray - The distances of the k-nearest neighbors for each column vector in X + The distances of the k-nearest neighbors for each column vector + in X indices: cuDF DataFrame of numpy ndarray The indices of the k-nearest neighbors for each column vector in X @@ -390,9 +405,10 @@ cdef class NearestNeighbors: cdef uintptr_t X_ctype = self._get_ctype_ptr(X_m) N = len(X) - # Need to establish result matrices for indices (Nxk) and for distances (Nxk) - I_ndarr = cuda.to_device(np.zeros(N*k, dtype=np.int64, order = "C")) - D_ndarr = cuda.to_device(np.zeros(N*k, dtype=np.float32, order = "C")) + # Need to establish result matrices for indices (Nxk) + # and for distances (Nxk) + I_ndarr = cuda.to_device(np.zeros(N*k, dtype=np.int64, order="C")) + D_ndarr = cuda.to_device(np.zeros(N*k, dtype=np.float32, order="C")) cdef uintptr_t I_ptr = self._get_ctype_ptr(I_ndarr) cdef uintptr_t D_ptr = self._get_ctype_ptr(D_ndarr) @@ -405,11 +421,11 @@ cdef class NearestNeighbors: if isinstance(X, cudf.DataFrame): inds = cudf.DataFrame() for i in range(0, I_ndarr.shape[1]): - inds[str(i)] = I_ndarr[:,i] + inds[str(i)] = I_ndarr[:, i] dists = cudf.DataFrame() for i in range(0, D_ndarr.shape[1]): 
- dists[str(i)] = D_ndarr[:,i] + dists[str(i)] = D_ndarr[:, i] return dists, inds @@ -423,7 +439,6 @@ cdef class NearestNeighbors: return dists, inds - def _kneighbors(self, X_ctype, N, k, I_ptr, D_ptr): """ Query the GPU index for the k nearest neighbors of column vectors in X. @@ -448,7 +463,8 @@ cdef class NearestNeighbors: Returns ---------- distances: cuDF DataFrame or numpy ndarray - The distances of the k-nearest neighbors for each column vector in X + The distances of the k-nearest neighbors for each column vector + in X indices: cuDF DataFrame of numpy ndarray The indices of the k-nearest neighbors for each column vector in X From 5b8c2649105081898cf324877f84d51f4ab5be1b Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 15:40:36 -0500 Subject: [PATCH 110/156] FIX solver methods pep8 changes --- python/cuml/solvers/cd.pyx | 201 ++++++++++++++++++------------------ python/cuml/solvers/sgd.pyx | 191 ++++++++++++++++++---------------- 2 files changed, 202 insertions(+), 190 deletions(-) diff --git a/python/cuml/solvers/cd.pyx b/python/cuml/solvers/cd.pyx index b89f17f568..c72852135e 100644 --- a/python/cuml/solvers/cd.pyx +++ b/python/cuml/solvers/cd.pyx @@ -33,59 +33,60 @@ from cuml.common.base import Base cdef extern from "solver/solver_c.h" namespace "ML::Solver": cdef void cdFit(float *input, - int n_rows, - int n_cols, - float *labels, - float *coef, - float *intercept, - bool fit_intercept, - bool normalize, - int epochs, - int loss, - float alpha, - float l1_ratio, - bool shuffle, - float tol) - + int n_rows, + int n_cols, + float *labels, + float *coef, + float *intercept, + bool fit_intercept, + bool normalize, + int epochs, + int loss, + float alpha, + float l1_ratio, + bool shuffle, + float tol) cdef void cdFit(double *input, - int n_rows, - int n_cols, - double *labels, - double *coef, - double *intercept, - bool fit_intercept, - bool normalize, - int epochs, - int loss, - double alpha, - double l1_ratio, - bool shuffle, - 
double tol) + int n_rows, + int n_cols, + double *labels, + double *coef, + double *intercept, + bool fit_intercept, + bool normalize, + int epochs, + int loss, + double alpha, + double l1_ratio, + bool shuffle, + double tol) cdef void cdPredict(const float *input, - int n_rows, - int n_cols, - const float *coef, - float intercept, - float *preds, - int loss) + int n_rows, + int n_cols, + const float *coef, + float intercept, + float *preds, + int loss) cdef void cdPredict(const double *input, - int n_rows, - int n_cols, - const double *coef, - double intercept, - double *preds, - int loss) + int n_rows, + int n_cols, + const double *coef, + double intercept, + double *preds, + int loss) + class CD(Base): """ - Coordinate Descent (CD) is a very common optimization algorithm that minimizes along - coordinate directions to find the minimum of a function. + Coordinate Descent (CD) is a very common optimization algorithm that + minimizes along coordinate directions to find the minimum of a function. - cuML's CD algorithm accepts a numpy matrix or a cuDF DataFrame as the input dataset. - The CD algorithm currently works with linear regression and ridge, lasso, and elastic-net penalties. + cuML's CD algorithm accepts a numpy matrix or a cuDF DataFrame as the + input dataset.algorithm The CD algorithm currently works with linear + regression and ridge, lasso, and elastic-net penalties. Examples --------- @@ -141,25 +142,33 @@ class CD(Base): 'squared_loss' uses linear regression alpha: float (default = 0.0001) The constant value which decides the degree of regularization. - 'alpha = 0' is equivalent to an ordinary least square, solved by the LinearRegression object. + 'alpha = 0' is equivalent to an ordinary least square, solved by the + LinearRegression object. l1_ratio: float (default = 0.15) - The ElasticNet mixing parameter, with 0 <= l1_ratio <= 1. For l1_ratio = 0 the penalty is an L2 penalty. - For l1_ratio = 1 it is an L1 penalty. 
For 0 < l1_ratio < 1, the penalty is a combination of L1 and L2. + The ElasticNet mixing parameter, with 0 <= l1_ratio <= 1. For + l1_ratio = 0 the penalty is an L2 penalty. + For l1_ratio = 1 it is an L1 penalty. For 0 < l1_ratio < 1, + the penalty is a combination of L1 and L2. fit_intercept : boolean (default = True) If True, the model tries to correct for the global mean of y. If False, the model expects that you have centered the data. max_iter : int (default = 1000) - The number of times the model should iterate through the entire dataset during training (default = 1000) + The number of times the model should iterate through the entire + dataset during training (default = 1000) tol : float (default = 1e-3) - The tolerance for the optimization: if the updates are smaller than tol, solver stops. + The tolerance for the optimization: if the updates are smaller than tol, + solver stops. shuffle : boolean (default = True) - If set to ‘True’, a random coefficient is updated every iteration rather than looping over features sequentially by default. - This (setting to ‘True’) often leads to significantly faster convergence especially when tol is higher than 1e-4. + If set to ‘True’, a random coefficient is updated every iteration rather + than looping over features sequentially by default. + This (setting to ‘True’) often leads to significantly faster convergence + especially when tol is higher than 1e-4. """ def __init__(self, loss='squared_loss', alpha=0.0001, l1_ratio=0.15, - fit_intercept=True, normalize=False, max_iter=1000, tol=1e-3, shuffle=True): + fit_intercept=True, normalize=False, max_iter=1000, tol=1e-3, + shuffle=True): if loss in ['squared_loss']: self.loss = self._get_loss_int(loss) @@ -189,25 +198,16 @@ class CD(Base): 'squared_loss': 0, }[loss] - def _get_ctype_ptr(self, obj): - # The manner to access the pointers in the gdf's might change, so - # encapsulating access in the following 3 methods. They might also be - # part of future gdf versions. 
- return obj.device_ctypes_pointer.value - - def _get_column_ptr(self, obj): - return self._get_ctype_ptr(obj._column._data.to_gpu_array()) - def fit(self, X, y): """ Fit the model with X and y. Parameters ---------- - X : cuDF DataFrame + X : cuDF DataFrame or numpy array Dense matrix (floats or doubles) of shape (n_samples, n_features) - y: cuDF DataFrame + y: cuDF DataFrame or numpy array Dense vector (floats or doubles) of shape (n_samples, 1) """ @@ -251,36 +251,36 @@ class CD(Base): if self.gdf_datatype.type == np.float32: cdFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept1, - self.fit_intercept, - self.normalize, - self.max_iter, - self.loss, - self.alpha, - self.l1_ratio, - self.shuffle, - self.tol) + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept1, + self.fit_intercept, + self.normalize, + self.max_iter, + self.loss, + self.alpha, + self.l1_ratio, + self.shuffle, + self.tol) self.intercept_ = c_intercept1 else: cdFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept2, - self.fit_intercept, - self.normalize, - self.max_iter, - self.loss, - self.alpha, - self.l1_ratio, - self.shuffle, - self.tol) + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept2, + self.fit_intercept, + self.normalize, + self.max_iter, + self.loss, + self.alpha, + self.l1_ratio, + self.shuffle, + self.tol) self.intercept_ = c_intercept2 @@ -327,22 +327,21 @@ class CD(Base): if pred_datatype.type == np.float32: cdPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) else: cdPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) del(X_m) return preds - diff --git a/python/cuml/solvers/sgd.pyx b/python/cuml/solvers/sgd.pyx index b38cebe416..7a13d84eba 100644 --- 
a/python/cuml/solvers/sgd.pyx +++ b/python/cuml/solvers/sgd.pyx @@ -52,7 +52,6 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": float tol, int n_iter_no_change) - cdef void sgdFit(double *input, int n_rows, int n_cols, @@ -90,29 +89,32 @@ cdef extern from "solver/solver_c.h" namespace "ML::Solver": int loss) cdef void sgdPredictBinaryClass(const float *input, - int n_rows, - int n_cols, - const float *coef, - float intercept, - float *preds, - int loss) + int n_rows, + int n_cols, + const float *coef, + float intercept, + float *preds, + int loss) cdef void sgdPredictBinaryClass(const double *input, - int n_rows, - int n_cols, - const double *coef, - double intercept, - double *preds, - int loss) + int n_rows, + int n_cols, + const double *coef, + double intercept, + double *preds, + int loss) + class SGD(Base): """ - Stochastic Gradient Descent is a very common machine learning algorithm where one optimizes - some cost function via gradient steps. This makes SGD very attractive for large problems - when the exact solution is hard or even impossible to find. + Stochastic Gradient Descent is a very common machine learning algorithm + where one optimizes some cost function via gradient steps. This makes SGD + very attractive for large problems when the exact solution is hard or even + impossible to find. - cuML's SGD algorithm accepts a numpy matrix or a cuDF DataFrame as the input dataset. - The SGD algorithm currently works with linear regression, ridge regression and SVM models. + cuML's SGD algorithm accepts a numpy matrix or a cuDF DataFrame as the + input dataset. The SGD algorithm currently works with linear regression, + ridge regression and SVM models. 
Examples --------- @@ -160,16 +162,20 @@ class SGD(Base): 'squared_loss' uses linear regression penalty: 'none', 'l1', 'l2', 'elasticnet' (default = 'none') 'none' does not perform any regularization - 'l1' performs L1 norm (Lasso) which minimizes the sum of the abs value of coefficients - 'l2' performs L2 norm (Ridge) which minimizes the sum of the square of the coefficients - 'elasticnet' performs Elastic Net regularization which is a weighted average of L1 and L2 norms + 'l1' performs L1 norm (Lasso) which minimizes the sum of the abs value + of coefficients + 'l2' performs L2 norm (Ridge) which minimizes the sum of the square of + the coefficients + 'elasticnet' performs Elastic Net regularization which is a weighted + average of L1 and L2 norms alpha: float (default = 0.0001) The constant value which decides the degree of regularization fit_intercept : boolean (default = True) If True, the model tries to correct for the global mean of y. If False, the model expects that you have centered the data. epochs : int (default = 1000) - The number of times the model should iterate through the entire dataset during training (default = 1000) + The number of times the model should iterate through the entire dataset + during training (default = 1000) tol : float (default = 1e-3) The training process will stop if current_loss > previous_loss - tol shuffle : boolean (default = True) @@ -179,21 +185,26 @@ class SGD(Base): Initial learning rate power_t : float (default = 0.5) The exponent used for calculating the invscaling learning rate - learning_rate : 'optimal', 'constant', 'invscaling', 'adaptive' (default = 'constant') + learning_rate : 'optimal', 'constant', 'invscaling', + 'adaptive' (default = 'constant') optimal option supported in the next version constant keeps the learning rate constant - adaptive changes the learning rate if the training loss or the validation accuracy does not improve for n_iter_no_change epochs. 
+ adaptive changes the learning rate if the training loss or the + validation accuracy does not improve for n_iter_no_change epochs. The old learning rate is generally divide by 5 n_iter_no_change : int (default = 5) the number of epochs to train without any imporvement in the model Notes ------ - For additional docs, see `scikitlearn's OLS + For additional docs, see `scikitlearn's OLS + """ - def __init__(self, loss='squared_loss', penalty='none', alpha=0.0001, l1_ratio=0.15, - fit_intercept=True, epochs=1000, tol=1e-3, shuffle=True, learning_rate='constant', eta0=0.0, power_t=0.5, batch_size=32, n_iter_no_change=5): + def __init__(self, loss='squared_loss', penalty='none', alpha=0.0001, + l1_ratio=0.15, fit_intercept=True, epochs=1000, tol=1e-3, + shuffle=True, learning_rate='constant', eta0=0.0, power_t=0.5, + batch_size=32, n_iter_no_change=5): if loss in ['hinge', 'log', 'squared_loss']: self.loss = self._get_loss_int(loss) @@ -226,12 +237,13 @@ class SGD(Base): if learning_rate == 'optimal': self.lr_type = 0 - raise TypeError("This option will be supported in the coming versions") + raise TypeError("This option will be supported in the future") if self.alpha == 0: raise ValueError("alpha must be > 0 since " - "learning_rate is 'optimal'. alpha is used " - "to compute the optimal learning rate.") + "learning_rate is 'optimal'. 
alpha is " + "used to compute the optimal learning " + " rate.") elif learning_rate == 'constant': self.lr_type = 1 @@ -316,7 +328,8 @@ class SGD(Base): self.n_alpha = 1 - self.coef_ = cudf.Series(np.zeros(self.n_cols, dtype=self.gdf_datatype)) + self.coef_ = cudf.Series(np.zeros(self.n_cols, + dtype=self.gdf_datatype)) cdef uintptr_t coef_ptr = self._get_cudf_column_ptr(self.coef_) cdef float c_intercept1 @@ -324,45 +337,45 @@ class SGD(Base): if self.gdf_datatype.type == np.float32: sgdFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept1, - self.fit_intercept, - self.batch_size, - self.epochs, - self.lr_type, - self.eta0, - self.power_t, - self.loss, - self.penalty, - self.alpha, - self.l1_ratio, - self.shuffle, - self.tol, + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept1, + self.fit_intercept, + self.batch_size, + self.epochs, + self.lr_type, + self.eta0, + self.power_t, + self.loss, + self.penalty, + self.alpha, + self.l1_ratio, + self.shuffle, + self.tol, self.n_iter_no_change) self.intercept_ = c_intercept1 else: sgdFit(X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - coef_ptr, - &c_intercept2, - self.fit_intercept, - self.batch_size, - self.epochs, - self.lr_type, - self.eta0, - self.power_t, - self.loss, - self.penalty, - self.alpha, - self.l1_ratio, - self.shuffle, - self.tol, + self.n_rows, + self.n_cols, + y_ptr, + coef_ptr, + &c_intercept2, + self.fit_intercept, + self.batch_size, + self.epochs, + self.lr_type, + self.eta0, + self.power_t, + self.loss, + self.penalty, + self.alpha, + self.l1_ratio, + self.shuffle, + self.tol, self.n_iter_no_change) self.intercept_ = c_intercept2 @@ -410,20 +423,20 @@ class SGD(Base): if pred_datatype.type == np.float32: sgdPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) else: sgdPredict(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - 
self.loss) + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) del(X_m) @@ -470,20 +483,20 @@ class SGD(Base): if pred_datatype.type == np.float32: sgdPredictBinaryClass(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) else: sgdPredictBinaryClass(X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) del(X_m) From 8be6563fc6aee4e65f5cec02f6cc6f5bdfb5924a Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 16:01:36 -0500 Subject: [PATCH 111/156] FIX Fix remaining pep8 errors overall --- python/cuml/cluster/kmeans.pyx | 39 ++++++++------- python/cuml/common/base.pyx | 3 -- python/cuml/common/cuda.pyx | 2 +- python/cuml/decomposition/pca.pyx | 34 +++++++------- python/cuml/decomposition/tsvd.pyx | 27 ++++++----- python/cuml/decomposition/tsvd_mg.pyx | 47 +++++++++---------- python/cuml/decomposition/utils.pxd | 2 +- python/cuml/linear_model/lasso.pyx | 3 +- .../cuml/linear_model/linear_regression.pyx | 3 +- .../linear_model/linear_regression_mg.pyx | 4 +- python/cuml/linear_model/ridge.pyx | 2 +- python/cuml/manifold/umap.pyx | 13 +++-- python/cuml/metrics/regression.pxd | 12 ++--- python/cuml/neighbors/nearest_neighbors.pyx | 6 +-- python/cuml/solvers/cd.pyx | 3 +- python/cuml/utils/pointer_utils.pyx | 7 +-- 16 files changed, 104 insertions(+), 103 deletions(-) diff --git a/python/cuml/cluster/kmeans.pyx b/python/cuml/cluster/kmeans.pyx index 8dc98bd0e3..7cf561651e 100644 --- a/python/cuml/cluster/kmeans.pyx +++ b/python/cuml/cluster/kmeans.pyx @@ -65,7 +65,7 @@ cdef extern from "kmeans/kmeans.hpp" namespace "ML::kmeans": int n_features, double *centroids, int *labels, - int verbose); + int verbose) cdef void fit(cumlHandle& handle, int n_clusters, @@ -334,11 +334,11 @@ class KMeans(Base): 'does not match 
the number of clusters %i' % (self.init.shape, self.n_clusters)) init_value = Array - self.cluster_centers_ = cuda.device_array( - self.n_clusters * self.n_cols, - dtype=self.gdf_datatype) - self.cluster_centers_.copy_to_device( - numba_utils.row_matrix(self.init)) + dim_cc = self.n_clusters * self.n_cols + self.cluster_centers_ = cuda.device_array(dim_cc, + dtype=self.gdf_datatype) + si = self.init + self.cluster_centers_.copy_to_device(numba_utils.row_matrix(si)) elif (isinstance(self.init, np.ndarray)): if(self.init.shape[0] != self.n_clusters): @@ -363,9 +363,8 @@ class KMeans(Base): else: raise TypeError('initialization method not supported') - cdef uintptr_t cluster_centers_ptr = self._get_dev_array_ptr( - self.cluster_centers_) - + c_c = self.cluster_centers_ + cdef uintptr_t cluster_centers_ptr = self._get_dev_array_ptr(c_c) if self.gdf_datatype.type == np.float32: fit_predict( @@ -385,17 +384,17 @@ class KMeans(Base): elif self.gdf_datatype.type == np.float64: fit_predict( handle_[0], - self.n_clusters, # n_clusters - 0, # distance metric as squared L2: @todo - support other metrics # noqa: E501 - init_value, # init method - self.max_iter, # max_iterations - self.tol, # threshold - self.random_state, # seed - input_ptr, # srcdata - self.n_rows, # n_samples (rows) - self.n_cols, # n_features (cols) - cluster_centers_ptr, # pred_centroids); - labels_ptr, # pred_labels + self.n_clusters, # n_clusters + 0, # distance metric as squared L2: @todo - support other metrics # noqa: E501 + init_value, # init method + self.max_iter, # max_iterations + self.tol, # threshold + self.random_state, # seed + input_ptr, # srcdata + self.n_rows, # n_samples (rows) + self.n_cols, # n_features (cols) + cluster_centers_ptr, # pred_centroids); + labels_ptr, # pred_labels self.verbose) else: raise TypeError('KMeans supports only float32 and float64 input,' diff --git a/python/cuml/common/base.pyx b/python/cuml/common/base.pyx index 03a1f34db5..9429bcb285 100644 --- 
a/python/cuml/common/base.pyx +++ b/python/cuml/common/base.pyx @@ -82,7 +82,6 @@ class Base: self.handle = cuml.common.handle.Handle() if handle is None else handle self.verbose = verbose - def get_param_names(self): """ Returns a list of hyperparameter names owned by this class. It is @@ -92,7 +91,6 @@ class Base: """ return [] - def get_params(self, deep=True): """ Returns a dict of all params owned by this class. If the child class @@ -107,7 +105,6 @@ class Base: params[key] = var_value return params - def set_params(self, **params): """ Accepts a dict of params and updates the corresponding ones owned by diff --git a/python/cuml/common/cuda.pyx b/python/cuml/common/cuda.pyx index ac3845807f..2b209510a7 100644 --- a/python/cuml/common/cuda.pyx +++ b/python/cuml/common/cuda.pyx @@ -57,7 +57,7 @@ cdef class Stream: def __cinit__(self): if self.s != 0: return - cdef _Stream stream; + cdef _Stream stream cdef _Error e = cudaStreamCreate(&stream) if e != 0: raise CudaRuntimeError("Stream create") diff --git a/python/cuml/decomposition/pca.pyx b/python/cuml/decomposition/pca.pyx index 7ceba58baf..4dc5151a78 100644 --- a/python/cuml/decomposition/pca.pyx +++ b/python/cuml/decomposition/pca.pyx @@ -321,11 +321,9 @@ class PCA(Base): dtype=self.gdf_datatype)) self.components_ = cuda.to_device(np.zeros(n_components*n_cols, dtype=self.gdf_datatype)) - self.explained_variance_ = cudf.Series( - np.zeros(n_components, + self.explained_variance_ = cudf.Series(np.zeros(n_components, dtype=self.gdf_datatype)) - self.explained_variance_ratio_ = cudf.Series( - np.zeros(n_components, + self.explained_variance_ratio_ = cudf.Series(np.zeros(n_components, dtype=self.gdf_datatype)) self.mean_ = cudf.Series(np.zeros(n_cols, dtype=self.gdf_datatype)) self.singular_values_ = cudf.Series(np.zeros(n_components, @@ -385,15 +383,20 @@ class PCA(Base): cdef uintptr_t comp_ptr = self._get_dev_array_ptr(self.components_) - cdef uintptr_t explained_var_ptr = self._get_cudf_column_ptr( - 
self.explained_variance_) - cdef uintptr_t explained_var_ratio_ptr = self._get_cudf_column_ptr( - self.explained_variance_ratio_) - cdef uintptr_t singular_vals_ptr = self._get_cudf_column_ptr( - self.singular_values_) + cdef uintptr_t explained_var_ptr = \ + self._get_cudf_column_ptr(self.explained_variance_) + + cdef uintptr_t explained_var_ratio_ptr = \ + self._get_cudf_column_ptr(self.explained_variance_ratio_) + + cdef uintptr_t singular_vals_ptr = \ + self._get_cudf_column_ptr(self.singular_values_) + cdef uintptr_t mean_ptr = self._get_cudf_column_ptr(self.mean_) - cdef uintptr_t noise_vars_ptr = self._get_cudf_column_ptr( - self.noise_variance_) + + cdef uintptr_t noise_vars_ptr = \ + self._get_cudf_column_ptr(self.noise_variance_) + cdef uintptr_t t_input_ptr = self._get_dev_array_ptr(self.trans_input_) cdef cumlHandle* handle_ = self.handle.getHandle() @@ -565,7 +568,6 @@ class PCA(Base): """ - cdef uintptr_t input_ptr if (isinstance(X, cudf.DataFrame)): gdf_datatype = np.dtype(X[X.columns[0]]._column.dtype) @@ -591,9 +593,9 @@ class PCA(Base): params.n_cols = n_cols params.whiten = self.whiten - t_input_data = cuda.to_device( - np.zeros(params.n_rows*params.n_components, - dtype=gdf_datatype.type)) + t_input_data = \ + cuda.to_device(np.zeros(params.n_rows*params.n_components, + dtype=gdf_datatype.type)) cdef uintptr_t trans_input_ptr = self._get_dev_array_ptr(t_input_data) cdef uintptr_t components_ptr = self.components_ptr diff --git a/python/cuml/decomposition/tsvd.pyx b/python/cuml/decomposition/tsvd.pyx index a1e7a88cd2..d5192ab01d 100644 --- a/python/cuml/decomposition/tsvd.pyx +++ b/python/cuml/decomposition/tsvd.pyx @@ -258,11 +258,9 @@ class TruncatedSVD(Base): dtype=self.gdf_datatype)) self.components_ = cuda.to_device(np.zeros(n_components*n_cols, dtype=self.gdf_datatype)) - self.explained_variance_ = cudf.Series( - np.zeros(n_components, + self.explained_variance_ = cudf.Series(np.zeros(n_components, dtype=self.gdf_datatype)) - 
self.explained_variance_ratio_ = cudf.Series( - np.zeros(n_components, + self.explained_variance_ratio_ = cudf.Series(np.zeros(n_components, dtype=self.gdf_datatype)) self.mean_ = cudf.Series(np.zeros(n_cols, dtype=self.gdf_datatype)) self.singular_values_ = cudf.Series(np.zeros(n_components, @@ -314,12 +312,15 @@ class TruncatedSVD(Base): cdef uintptr_t comp_ptr = self._get_dev_array_ptr(self.components_) - cdef uintptr_t explained_var_ptr = self._get_cudf_column_ptr( - self.explained_variance_) - cdef uintptr_t explained_var_ratio_ptr = self._get_cudf_column_ptr( - self.explained_variance_ratio_) - cdef uintptr_t singular_vals_ptr = self._get_cudf_column_ptr( - self.singular_values_) + cdef uintptr_t explained_var_ptr = \ + self._get_cudf_column_ptr(self.explained_variance_) + + cdef uintptr_t explained_var_ratio_ptr = \ + self._get_cudf_column_ptr(self.explained_variance_ratio_) + + cdef uintptr_t singular_vals_ptr = \ + self._get_cudf_column_ptr(self.singular_values_) + cdef uintptr_t t_input_ptr = self._get_dev_array_ptr(self.trans_input_) if self.n_components> self.n_cols: @@ -498,9 +499,9 @@ class TruncatedSVD(Base): params.n_rows = len(X) params.n_cols = self.n_cols - t_input_data = cuda.to_device( - np.zeros(params.n_rows*params.n_components, - dtype=gdf_datatype.type)) + t_input_data = \ + cuda.to_device(np.zeros(params.n_rows*params.n_components, + dtype=gdf_datatype.type)) cdef uintptr_t trans_input_ptr = self._get_dev_array_ptr(t_input_data) cdef uintptr_t components_ptr = self.components_ptr diff --git a/python/cuml/decomposition/tsvd_mg.pyx b/python/cuml/decomposition/tsvd_mg.pyx index 23b433a882..093b867099 100644 --- a/python/cuml/decomposition/tsvd_mg.pyx +++ b/python/cuml/decomposition/tsvd_mg.pyx @@ -35,35 +35,35 @@ cdef extern from "tsvd/tsvd_spmg.h" namespace "ML": cdef void tsvdFitSPMG(float *h_input, float *h_components, float *h_singular_vals, - paramsTSVD prms, + paramsTSVD prms, int *gpu_ids, int n_gpus) cdef void tsvdFitSPMG(double 
*h_input, double *h_components, double *h_singular_vals, - paramsTSVD prms, + paramsTSVD prms, int *gpu_ids, int n_gpus) cdef void tsvdFitTransformSPMG(float *h_input, float *h_trans_input, - float *h_components, + float *h_components, float *h_explained_var, - float *h_explained_var_ratio, + float *h_explained_var_ratio, float *h_singular_vals, paramsTSVD prms, - int *gpu_ids, + int *gpu_ids, int n_gpus) cdef void tsvdFitTransformSPMG(double *h_input, double *h_trans_input, - double *h_components, + double *h_components, double *h_explained_var, - double *h_explained_var_ratio, + double *h_explained_var_ratio, double *h_singular_vals, paramsTSVD prms, - int *gpu_ids, + int *gpu_ids, int n_gpus) cdef void tsvdInverseTransformSPMG(float *h_trans_input, @@ -98,8 +98,10 @@ cdef extern from "tsvd/tsvd_spmg.h" namespace "ML": int *gpu_ids, int n_gpus) + class TSVDparams: - def __init__(self,n_components,tol,iterated_power,random_state,svd_solver): + def __init__(self, n_components, tol, iterated_power, random_state, + svd_solver): self.n_components = n_components self.svd_solver = svd_solver self.tol = tol @@ -108,6 +110,7 @@ class TSVDparams: self.n_cols = None self.n_rows = None + class TruncatedSVDSPMG: """ Create a DataFrame, fill it with data, and compute Truncated Singular Value @@ -194,11 +197,9 @@ class TruncatedSVDSPMG: self.explained_variance_ratio_ptr = None self.singular_values_ptr = None - self.algo_dict = { - 'full': COV_EIG_DQ, - 'auto': COV_EIG_DQ, - 'jacobi': COV_EIG_JACOBI - } + self.algo_dict = {'full': COV_EIG_DQ, + 'auto': COV_EIG_DQ, + 'jacobi': COV_EIG_JACOBI} def _get_algorithm_c_name(self, algorithm): return self.algo_dict[algorithm] @@ -211,20 +212,19 @@ class TruncatedSVDSPMG: self.components_ = cudf.utils.cudautils.zeros(n_cols*n_components, self.gdf_datatype) - self.explained_variance_ = cudf.Series(cudf.utils.cudautils.zeros( - n_components, - self.gdf_datatype)) + self.explained_variance_ = \ + 
cudf.Series(cudf.utils.cudautils.zeros(n_components, + self.gdf_datatype)) - self.explained_variance_ratio_ = cudf.Series(np.zeros( - n_components, - self.gdf_datatype)) + self.explained_variance_ratio_ = \ + cudf.Series(np.zeros(n_components, self.gdf_datatype)) self.mean_ = cudf.Series(cudf.utils.cudautils.zeros(n_cols, self.gdf_datatype)) - self.singular_values_ = cudf.Series(cudf.utils.cudautils.zeros( - n_components, - self.gdf_datatype)) + self.singular_values_ = \ + cudf.Series(cudf.utils.cudautils.zeros(n_components, + self.gdf_datatype)) self.noise_variance_ = cudf.Series(np.zeros(1, dtype=self.gdf_datatype)) @@ -595,4 +595,3 @@ class TruncatedSVDSPMG: else: raise ValueError('Number of GPUS should be 2 or more' 'For single GPU, use the normal TruncatedSVD') - diff --git a/python/cuml/decomposition/utils.pxd b/python/cuml/decomposition/utils.pxd index c15bc50a3a..d1bcfefb97 100644 --- a/python/cuml/decomposition/utils.pxd +++ b/python/cuml/decomposition/utils.pxd @@ -42,7 +42,7 @@ cdef extern from "ml_utils.h" namespace "ML": cdef cppclass paramsTSVD(paramsSolver): int n_components int max_sweeps - solver algorithm #= solver::COV_EIG_DQ + solver algorithm # = solver::COV_EIG_DQ bool trans_input cdef cppclass paramsPCA(paramsTSVD): diff --git a/python/cuml/linear_model/lasso.pyx b/python/cuml/linear_model/lasso.pyx index 62b1dff800..054c58f15f 100644 --- a/python/cuml/linear_model/lasso.pyx +++ b/python/cuml/linear_model/lasso.pyx @@ -134,7 +134,8 @@ class Lasso: tol: float or double. selection : str, ‘cyclic’, or 'random' - For additional docs, see `scikitlearn's Lasso `_. + For additional docs, see `scikitlearn's Lasso + `_. 
""" self._check_alpha(alpha) self.alpha = alpha diff --git a/python/cuml/linear_model/linear_regression.pyx b/python/cuml/linear_model/linear_regression.pyx index 02c3757cb1..0159e693d3 100644 --- a/python/cuml/linear_model/linear_regression.pyx +++ b/python/cuml/linear_model/linear_regression.pyx @@ -265,7 +265,8 @@ class LinearRegression(Base): msg = "y vector must be a cuDF series or Numpy ndarray" raise TypeError(msg) - self.coef_ = cudf.Series(np.zeros(self.n_cols, dtype=self.gdf_datatype)) + self.coef_ = cudf.Series(np.zeros(self.n_cols, + dtype=self.gdf_datatype)) cdef uintptr_t coef_ptr = self._get_cudf_column_ptr(self.coef_) cdef float c_intercept1 diff --git a/python/cuml/linear_model/linear_regression_mg.pyx b/python/cuml/linear_model/linear_regression_mg.pyx index 396a4648fa..c0f2318dda 100644 --- a/python/cuml/linear_model/linear_regression_mg.pyx +++ b/python/cuml/linear_model/linear_regression_mg.pyx @@ -58,7 +58,7 @@ cdef extern from "glm/glm_spmg.h" namespace "ML::GLM": int n_rows, int n_cols, float *h_coef, - float intercept, + float intercept, float *preds, int *gpu_ids, int n_gpus) @@ -67,7 +67,7 @@ cdef extern from "glm/glm_spmg.h" namespace "ML::GLM": int n_rows, int n_cols, double *h_coef, - double intercept, + double intercept, double *preds, int *gpu_ids, int n_gpus) diff --git a/python/cuml/linear_model/ridge.pyx b/python/cuml/linear_model/ridge.pyx index 9b0d971ad2..a7f4ffbbe6 100644 --- a/python/cuml/linear_model/ridge.pyx +++ b/python/cuml/linear_model/ridge.pyx @@ -402,7 +402,7 @@ class Ridge(Base, RegressorMixin): params = dict() variables = ['alpha', 'fit_intercept', 'normalize', 'solver'] for key in variables: - var_value = getattr(self,key,None) + var_value = getattr(self, key, None) params[key] = var_value return params diff --git a/python/cuml/manifold/umap.pyx b/python/cuml/manifold/umap.pyx index f75576e525..ac3f20fc25 100644 --- a/python/cuml/manifold/umap.pyx +++ b/python/cuml/manifold/umap.pyx @@ -428,13 +428,12 @@ cdef 
class UMAP: order="C", dtype=np.float32)) cdef uintptr_t embed_ptr = embedding.device_ctypes_pointer.value - self.umap.transform( - x_ptr, - X_m.shape[0], - X_m.shape[1], - self.embeddings, - self.arr_embed.shape[0], - embed_ptr) + self.umap.transform(x_ptr, + X_m.shape[0], + X_m.shape[1], + self.embeddings, + self.arr_embed.shape[0], + embed_ptr) if isinstance(X, cudf.DataFrame): ret = cudf.DataFrame() diff --git a/python/cuml/metrics/regression.pxd b/python/cuml/metrics/regression.pxd index 41aa7092de..86012fe163 100644 --- a/python/cuml/metrics/regression.pxd +++ b/python/cuml/metrics/regression.pxd @@ -24,11 +24,11 @@ from cuml.common.handle cimport cumlHandle cdef extern from "metrics/metrics.hpp" namespace "ML::Metrics": float r2_score_py(const cumlHandle& handle, - float *y, - float *y_hat, - int n) + float *y, + float *y_hat, + int n) double r2_score_py(const cumlHandle& handle, - double *y, - double *y_hat, - int n) + double *y, + double *y_hat, + int n) diff --git a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx index f17364cc93..a2c8efd698 100644 --- a/python/cuml/neighbors/nearest_neighbors.pyx +++ b/python/cuml/neighbors/nearest_neighbors.pyx @@ -282,7 +282,7 @@ cdef class NearestNeighbors: del self.k n_dims = X.shape[1] - self.k = new kNN(n_dims, verbose = self._verbose) + self.k = new kNN(n_dims, verbose=self._verbose) cdef uintptr_t X_ctype = -1 cdef uintptr_t dev_ptr = -1 @@ -372,8 +372,8 @@ cdef class NearestNeighbors: self.input[i] = deref(params) - self.k.fit( < kNNParams * > self.input, - < int > len(alloc_info)) + self.k.fit( self.input, + len(alloc_info)) def kneighbors(self, X, k=None): """ diff --git a/python/cuml/solvers/cd.pyx b/python/cuml/solvers/cd.pyx index c72852135e..e597e84cc4 100644 --- a/python/cuml/solvers/cd.pyx +++ b/python/cuml/solvers/cd.pyx @@ -243,7 +243,8 @@ class CD(Base): self.n_alpha = 1 - self.coef_ = cudf.Series(np.zeros(self.n_cols, dtype=self.gdf_datatype)) + self.coef_ = 
cudf.Series(np.zeros(self.n_cols, + dtype=self.gdf_datatype)) cdef uintptr_t coef_ptr = self._get_cudf_column_ptr(self.coef_) cdef float c_intercept1 diff --git a/python/cuml/utils/pointer_utils.pyx b/python/cuml/utils/pointer_utils.pyx index 7726cc2b67..85af87b513 100644 --- a/python/cuml/utils/pointer_utils.pyx +++ b/python/cuml/utils/pointer_utils.pyx @@ -23,8 +23,9 @@ from libc.stdint cimport uintptr_t cdef extern from "ml_cuda_utils.h" namespace "ML": - cdef int get_device(void *ptr) + cdef int get_device(void *ptr) + def device_of_gpu_matrix(g): - cdef uintptr_t cptr = g.device_ctypes_pointer.value - return get_device( cptr) \ No newline at end of file + cdef uintptr_t cptr = g.device_ctypes_pointer.value + return get_device( cptr) From 55413738399d0be196747aecc4e1d4eab393b163 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Sat, 11 May 2019 16:03:07 -0500 Subject: [PATCH 112/156] DOC Add entry to changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f06e2ea21..57944829dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - PR #590: QN Recover from numeric errors - PR #482: Introduce cumlHandle for pca and tsvd - PR #573: Remove use of unnecessary cuDF column and series copies +- PR #601: Cython PEP8 cleanup and CI integration ## Bug Fixes - PR #584: Added missing virtual destructor to deviceAllocator and hostAllocator From 9b3e9d4e8750342661a51396510cef12ab8804e3 Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 13 May 2019 11:54:04 -0400 Subject: [PATCH 113/156] Initial add of cuml handle to knn, spectral embedding/clustering, and umap --- cuML/src/knn/knn.cu | 70 ++++++++------------ cuML/src/knn/knn.h | 10 ++- cuML/src/spectral/spectral.h | 82 +++++++++++++----------- cuML/src/umap/init_embed/runner.h | 5 +- cuML/src/umap/init_embed/spectral_algo.h | 5 +- cuML/src/umap/runner.h | 14 ++-- cuML/src/umap/supervised.h | 3 +- cuML/src/umap/umap.cu | 20 +++--- cuML/src/umap/umap.h | 8 ++- cuML/test/knn_test.cu | 3 +- cuML/test/spectral_test.cu | 20 +++--- cuML/test/umap_test.cu | 12 ++-- cuML/test_mg/knn_test.cu | 3 +- 13 files changed, 132 insertions(+), 123 deletions(-) diff --git a/cuML/src/knn/knn.cu b/cuML/src/knn/knn.cu index 5f0a042927..518278fa17 100644 --- a/cuML/src/knn/knn.cu +++ b/cuML/src/knn/knn.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "common/cumlHandle.hpp" + #include "cuda_utils.h" #include "knn.h" #include @@ -23,6 +25,7 @@ #include #include + #include #include #include @@ -35,7 +38,10 @@ namespace ML { * Build a kNN object for training and querying a k-nearest neighbors model. 
* @param D number of features in each vector */ - kNN::kNN(int D, bool verbose): D(D), total_n(0), indices(0), verbose(verbose), owner(false){} + kNN::kNN(const cumlHandle &handle, int D, bool verbose): + D(D), total_n(0), indices(0), verbose(verbose), owner(false) { + this->handle = const_cast(&handle.getImpl()); + } kNN::~kNN() { try { @@ -84,10 +90,8 @@ namespace ML { */ void kNN::fit(kNNParams *input, int N) { - - if(this->owner) { + if(this->owner) for(kNNParams p : knn_params) { CUDA_CHECK(cudaFree(p.ptr)); } - } if(this->verbose) std::cout << "N=" << N << std::endl; @@ -107,6 +111,16 @@ namespace ML { } } + template + void ASSERT_MEM(T *ptr, std::string name) { + cudaPointerAttributes s_att; + cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr); + + if(s_err != 0 || s_att.device == -1) + std::cout << "Invalid device pointer encountered in " << name << + ". device=" << s_att.device << ", err=" << s_err << std::endl; + } + /** * Search the kNN for the k-nearest neighbors of a set of query vectors * @param search_items set of vectors to query for neighbors @@ -124,40 +138,9 @@ namespace ML { float *all_D = new float[indices*k*size_t(n)]; long *all_I = new long[indices*k*size_t(n)]; - cudaPointerAttributes s_att; - cudaError_t s_err = cudaPointerGetAttributes(&s_att, search_items); - - if(s_err != 0 || s_att.device == -1) - std::cout << "Invalid device pointer encountered in knn search: " << search_items << std::endl; - - s_err = cudaPointerGetAttributes(&s_att, res_I); - - if(s_err != 0 || s_att.device == -1) - std::cout << "Invalid index results pointer encountered in knn search: " << search_items << std::endl; - - s_err = cudaPointerGetAttributes(&s_att, res_D); - - if(s_err != 0 || s_att.device == -1) - std::cout << "Invalid distance results pointer encountered in knn search: " << search_items << std::endl; - - - /** - * Initial verification of memory - */ - for(int i = 0; i < indices; i++) { - kNNParams params = knn_params[i]; - - 
cudaPointerAttributes att; - cudaError_t err = cudaPointerGetAttributes(&att, params.ptr); - - if(err == 0 && att.device > -1) { - CUDA_CHECK(cudaSetDevice(att.device)); - - if(!verify_size(size_t(params.N)*size_t(this->D)*4l, att.device)) - return; - } - } - + ASSERT_MEM(search_items, "search items"); + ASSERT_MEM(res_I, "output index array"); + ASSERT_MEM(res_D, "output distance array"); #pragma omp parallel { @@ -170,9 +153,12 @@ namespace ML { cudaError_t err = cudaPointerGetAttributes(&att, params.ptr); if(err == 0 && att.device > -1) { + CUDA_CHECK(cudaSetDevice(att.device)); + CUDA_CHECK(cudaPeekAtLastError()); try { + faiss::gpu::StandardGpuResources gpu_res; cudaStream_t stream; @@ -203,7 +189,6 @@ namespace ML { std::cout << "Exception occurred: " << e.what() << std::endl; } - } else { std::stringstream ss; ss << "Input memory for " << ¶ms << " failed. isDevice?=" << att.devicePointer << ", N=" << params.N; @@ -215,8 +200,8 @@ namespace ML { merge_tables>(long(n), k, indices, result_D, result_I, all_D, all_I, id_ranges.data()); - MLCommon::updateDevice(res_D, result_D, k*size_t(n), 0); - MLCommon::updateDevice(res_I, result_I, k*size_t(n), 0); + MLCommon::updateDevice(res_D, result_D, k*size_t(n), handle->getStream()); + MLCommon::updateDevice(res_I, result_I, k*size_t(n), handle->getStream()); delete all_D; delete all_I; @@ -274,7 +259,8 @@ namespace ML { float *ptr_d; MLCommon::allocate(ptr_d, size_t(length)*size_t(D)); - MLCommon::updateDevice(ptr_d, ptr+(size_t(chunk_size)*i), size_t(length)*size_t(D), 0); + MLCommon::updateDevice(ptr_d, ptr+(size_t(chunk_size)*i), + size_t(length)*size_t(D), handle->getStream()); kNNParams p; p.N = length; diff --git a/cuML/src/knn/knn.h b/cuML/src/knn/knn.h index 5d56fe027b..9d692b4e7b 100644 --- a/cuML/src/knn/knn.h +++ b/cuML/src/knn/knn.h @@ -14,11 +14,16 @@ * limitations under the License. 
*/ +#include "common/cumlHandle.hpp" + #include #include -#include #include +#include + + + #pragma once namespace ML { @@ -48,6 +53,7 @@ namespace ML { bool verbose; bool owner; + cumlHandle_impl *handle; public: @@ -55,7 +61,7 @@ namespace ML { * Build a kNN object for training and querying a k-nearest neighbors model. * @param D number of features in each vector */ - kNN(int D, bool verbose = false); + kNN(const cumlHandle &handle, int D, bool verbose = false); ~kNN(); void reset(); diff --git a/cuML/src/spectral/spectral.h b/cuML/src/spectral/spectral.h index 163f012c34..324fd4fc96 100644 --- a/cuML/src/spectral/spectral.h +++ b/cuML/src/spectral/spectral.h @@ -16,6 +16,8 @@ #pragma once +#include "cuML.hpp" + #include #include "sparse/nvgraph_wrappers.h" @@ -43,12 +45,12 @@ namespace ML { */ template - void fit_clusters(int *rows, int *cols, T *vals, int nnz, + void fit_clusters(const cumlHandle &handle, int *rows, int *cols, T *vals, int nnz, int n, int n_clusters, float eigen_tol, int *out) { - nvgraphHandle_t handle; + nvgraphHandle_t graphHandle; cudaDataType_t edge_dimT = CUDA_R_32F; - NVGRAPH_CHECK(nvgraphCreate (&handle)); + NVGRAPH_CHECK(nvgraphCreate (&graphHandle)); /** * Convert COO to CSR @@ -73,7 +75,7 @@ namespace ML { CSR_input->nvertices = n; CSR_input->source_offsets = src_offsets; - NVGRAPH_CHECK(nvgraphConvertTopology(handle, + NVGRAPH_CHECK(nvgraphConvertTopology(graphHandle, NVGRAPH_COO_32, (void*)COO_input, (void*)vals, &edge_dimT, NVGRAPH_CSR_32, (void*)CSR_input, (void*)vals)); @@ -94,17 +96,17 @@ namespace ML { clustering_params.kmean_max_iter = 0; nvgraphGraphDescr_t graph; - NVGRAPH_CHECK(nvgraphCreateGraphDescr(handle, &graph)); - NVGRAPH_CHECK(nvgraphSetGraphStructure(handle, graph, + NVGRAPH_CHECK(nvgraphCreateGraphDescr(graphHandle, &graph)); + NVGRAPH_CHECK(nvgraphSetGraphStructure(graphHandle, graph, (void*)CSR_input, NVGRAPH_CSR_32)); - NVGRAPH_CHECK(nvgraphAllocateEdgeData(handle, graph, 1, &edge_dimT)); - 
NVGRAPH_CHECK(nvgraphSetEdgeData(handle, graph, (void*)vals, 0)); + NVGRAPH_CHECK(nvgraphAllocateEdgeData(graphHandle, graph, 1, &edge_dimT)); + NVGRAPH_CHECK(nvgraphSetEdgeData(graphHandle, graph, (void*)vals, 0)); - NVGRAPH_CHECK(nvgraphSpectralClustering(handle, graph, weight_index, + NVGRAPH_CHECK(nvgraphSpectralClustering(graphHandle, graph, weight_index, &clustering_params, out, eigVals, embedding)); - NVGRAPH_CHECK(nvgraphDestroyGraphDescr(handle, graph)); - NVGRAPH_CHECK(nvgraphDestroy(handle)); + NVGRAPH_CHECK(nvgraphDestroyGraphDescr(graphHandle, graph)); + NVGRAPH_CHECK(nvgraphDestroy(graphHandle)); CUDA_CHECK(cudaFree(src_offsets)); CUDA_CHECK(cudaFree(dst_indices)); @@ -128,7 +130,7 @@ namespace ML { * @param out output array for labels (size m) */ template - void fit_clusters(long *knn_indices, T *knn_dists, int m, int n_neighbors, + void fit_clusters(const cumlHandle &handle, long *knn_indices, T *knn_dists, int m, int n_neighbors, int n_clusters, float eigen_tol, int *out) { int *rows, *cols; @@ -143,7 +145,7 @@ namespace ML { // todo: Need to symmetrize the knn to create the knn graph - fit_clusters(rows, cols, vals, m*n_neighbors, m, n_clusters, eigen_tol, out); + fit_clusters(handle, rows, cols, vals, m*n_neighbors, m, n_clusters, eigen_tol, out); CUDA_CHECK(cudaFree(rows)); CUDA_CHECK(cudaFree(cols)); @@ -163,10 +165,10 @@ namespace ML { * @param out output array for labels (size m) */ template - void fit_clusters(T *X, int m, int n, int n_neighbors, + void fit_clusters(const cumlHandle &handle, T *X, int m, int n, int n_neighbors, int n_clusters, float eigen_tol, int *out) { - kNN *knn = new kNN(n); + kNN knn(handle, n); long *knn_indices; float *knn_dists; @@ -179,16 +181,14 @@ namespace ML { params[0].ptr = X; - knn->fit(*¶ms, 1); - knn->search(X, m, knn_indices, knn_dists, n_neighbors); + knn.fit(params, 1); + knn.search(X, m, knn_indices, knn_dists, n_neighbors); - fit_clusters(knn_indices, knn_dists, m, n_neighbors, + fit_clusters(handle, 
knn_indices, knn_dists, m, n_neighbors, n_clusters, eigen_tol, out); CUDA_CHECK(cudaFree(knn_indices)); CUDA_CHECK(cudaFree(knn_dists)); - - delete knn; } /*** @@ -205,12 +205,12 @@ namespace ML { * @param out output array for labels (size m) */ template - void fit_embedding(int *rows, int*cols, T *vals, int nnz, int n, + void fit_embedding(const cumlHandle &handle, int *rows, int*cols, T *vals, int nnz, int n, int n_components, T *out) { - nvgraphHandle_t handle; + nvgraphHandle_t grapHandle; cudaDataType_t edge_dimT = CUDA_R_32F; - NVGRAPH_CHECK(nvgraphCreate (&handle)); + NVGRAPH_CHECK(nvgraphCreate (&grapHandle)); // Allocate csr arrays int *src_offsets, *dst_indices; @@ -229,7 +229,7 @@ namespace ML { CSR_input->nvertices = n; CSR_input->source_offsets = src_offsets; - NVGRAPH_CHECK(nvgraphConvertTopology(handle, NVGRAPH_COO_32, + NVGRAPH_CHECK(nvgraphConvertTopology(grapHandle, NVGRAPH_COO_32, (void*)COO_input, (void*)vals, &edge_dimT, NVGRAPH_CSR_32, (void*)CSR_input, (void*)vals)); @@ -251,17 +251,17 @@ namespace ML { clustering_params.kmean_max_iter = 1; nvgraphGraphDescr_t graph; - NVGRAPH_CHECK(nvgraphCreateGraphDescr(handle, &graph)); - NVGRAPH_CHECK(nvgraphSetGraphStructure(handle, graph, + NVGRAPH_CHECK(nvgraphCreateGraphDescr(grapHandle, &graph)); + NVGRAPH_CHECK(nvgraphSetGraphStructure(grapHandle, graph, (void*)CSR_input, NVGRAPH_CSR_32)); - NVGRAPH_CHECK(nvgraphAllocateEdgeData(handle, graph, 1, &edge_dimT)); - NVGRAPH_CHECK(nvgraphSetEdgeData(handle, graph, (void*)vals, 0)); + NVGRAPH_CHECK(nvgraphAllocateEdgeData(grapHandle, graph, 1, &edge_dimT)); + NVGRAPH_CHECK(nvgraphSetEdgeData(grapHandle, graph, (void*)vals, 0)); - NVGRAPH_CHECK(nvgraphSpectralClustering(handle, graph, weight_index, + NVGRAPH_CHECK(nvgraphSpectralClustering(grapHandle, graph, weight_index, &clustering_params, labels, eigVals, out)); - NVGRAPH_CHECK(nvgraphDestroyGraphDescr(handle, graph)); - NVGRAPH_CHECK(nvgraphDestroy(handle)); + 
NVGRAPH_CHECK(nvgraphDestroyGraphDescr(grapHandle, graph)); + NVGRAPH_CHECK(nvgraphDestroy(grapHandle)); CUDA_CHECK(cudaFree(src_offsets)); CUDA_CHECK(cudaFree(dst_indices)); @@ -284,7 +284,7 @@ namespace ML { * @param out output array for labels (size m) */ template - void fit_embedding(long *knn_indices, float *knn_dists, int m, int n_neighbors, + void fit_embedding(const cumlHandle &handle, long *knn_indices, float *knn_dists, int m, int n_neighbors, int n_components, T *out) { int *rows, *cols; @@ -300,7 +300,7 @@ namespace ML { // todo: Need to symmetrize the knn graph here. UMAP works here because // it has already done this. - fit_embedding(rows, cols, vals, m*n_neighbors, m, n_components, out); + fit_embedding(handle, rows, cols, vals, m*n_neighbors, m, n_components, out); CUDA_CHECK(cudaFree(rows)); CUDA_CHECK(cudaFree(cols)); @@ -319,11 +319,11 @@ namespace ML { * @param out output array for labels (size m) */ template - void fit_embedding(T *X, int m, int n, + void fit_embedding(const cumlHandle &handle, T *X, int m, int n, int n_neighbors, int n_components, T *out) { - kNN *knn = new kNN(n); + kNN knn(handle, n); long *knn_indices; float *knn_dists; @@ -335,16 +335,20 @@ namespace ML { params[0].N = m; params[0].ptr = X; - knn->fit(*¶ms, 1); - knn->search(X, m, knn_indices, knn_dists, n_neighbors); + knn.fit(*¶ms, 1); + knn.search(X, m, knn_indices, knn_dists, n_neighbors); - fit_embedding(knn_indices, knn_dists, m, n_neighbors, + std::cout << "About to call" << std::endl; + + fit_embedding(handle, knn_indices, knn_dists, m, n_neighbors, n_components, out); + std::cout << "DONE!" << std::endl; + CUDA_CHECK(cudaFree(knn_indices)); CUDA_CHECK(cudaFree(knn_dists)); - delete knn; + std::cout << "DONE!" 
<< std::endl; } } } diff --git a/cuML/src/umap/init_embed/runner.h b/cuML/src/umap/init_embed/runner.h index 27cea5bb67..7b6feaf17c 100644 --- a/cuML/src/umap/init_embed/runner.h +++ b/cuML/src/umap/init_embed/runner.h @@ -30,7 +30,8 @@ namespace UMAPAlgo { using namespace ML; template - void run(const T *X, int n, int d, + void run(const cumlHandle &handle, + const T *X, int n, int d, const long *knn_indices, const T *knn_dists, MLCommon::Sparse::COO *coo, UMAPParams *params, T* embedding, cudaStream_t stream, @@ -46,7 +47,7 @@ namespace UMAPAlgo { break; case 1: - SpectralInit::launcher(X, n, d, + SpectralInit::launcher(handle, X, n, d, knn_indices, knn_dists, coo, params, embedding); diff --git a/cuML/src/umap/init_embed/spectral_algo.h b/cuML/src/umap/init_embed/spectral_algo.h index cbb71f7ef7..11e62aa2e4 100644 --- a/cuML/src/umap/init_embed/spectral_algo.h +++ b/cuML/src/umap/init_embed/spectral_algo.h @@ -36,13 +36,14 @@ namespace UMAPAlgo { * Performs a spectral layout initialization */ template - void launcher(const T *X, int n, int d, + void launcher(const cumlHandle &handle, + const T *X, int n, int d, const long *knn_indices, const T *knn_dists, MLCommon::Sparse::COO *coo, UMAPParams *params, T *embedding) { - Spectral::fit_embedding(coo->rows, coo->cols, coo->vals, coo->nnz, n, params->n_components, embedding); + Spectral::fit_embedding(handle,coo->rows, coo->cols, coo->vals, coo->nnz, n, params->n_components, embedding); } } } diff --git a/cuML/src/umap/runner.h b/cuML/src/umap/runner.h index 84c128fa9d..70d711c4fe 100644 --- a/cuML/src/umap/runner.h +++ b/cuML/src/umap/runner.h @@ -98,7 +98,8 @@ namespace UMAPAlgo { } template - size_t _fit(T *X, // input matrix + size_t _fit(const cumlHandle &handle, + T *X, // input matrix int n, // rows int d, // cols kNN *knn, @@ -147,7 +148,7 @@ namespace UMAPAlgo { /** * Run initialization method */ - InitEmbed::run(X, n, d, + InitEmbed::run(handle, X, n, d, knn_indices, knn_dists, &cgraph_coo, params, 
embeddings, stream, @@ -168,7 +169,8 @@ namespace UMAPAlgo { } template - size_t _fit(T *X, // input matrix + size_t _fit(const cumlHandle &handle, + T *X, // input matrix T *y, // labels int n, int d, @@ -238,6 +240,7 @@ namespace UMAPAlgo { if(params->verbose) std::cout << "Performing general intersection" << std::endl; Supervised::perform_general_intersection( + handle, y, &rgraph_coo, &final_coo, params, stream); @@ -254,7 +257,7 @@ namespace UMAPAlgo { /** * Initialize embeddings */ - InitEmbed::run(X, n, d, + InitEmbed::run(handle, X, n, d, knn_indices, knn_dists, &ocoo, params, embeddings, stream, params->init); @@ -281,7 +284,8 @@ namespace UMAPAlgo { * */ template - size_t _transform(const float *X, + size_t _transform(const cumlHandle &handle, + const float *X, int n, int d, T *embedding, diff --git a/cuML/src/umap/supervised.h b/cuML/src/umap/supervised.h index 93a5c03799..8e3bd21685 100644 --- a/cuML/src/umap/supervised.h +++ b/cuML/src/umap/supervised.h @@ -280,6 +280,7 @@ namespace UMAPAlgo { template void perform_general_intersection( + const cumlHandle &handle, T *y, COO *rgraph_coo, COO *final_coo, UMAPParams *params, @@ -288,7 +289,7 @@ namespace UMAPAlgo { /** * Calculate kNN for Y */ - kNN y_knn(1); + kNN y_knn(handle, 1); long *y_knn_indices; T *y_knn_dists; diff --git a/cuML/src/umap/umap.cu b/cuML/src/umap/umap.cu index 0a1239a148..a906420147 100644 --- a/cuML/src/umap/umap.cu +++ b/cuML/src/umap/umap.cu @@ -43,24 +43,23 @@ namespace ML { * @param embeddings * an array to return the output embeddings of size (n_samples, n_components) */ - void UMAP_API::fit(float *X, int n, int d, float *embeddings) { - this->knn = new kNN(d); + void UMAP_API::fit(const cumlHandle &handle, float *X, int n, int d, float *embeddings) { + this->knn = new kNN(handle, d); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - UMAPAlgo::_fit(X, n, d, knn, get_params(), embeddings, stream); + UMAPAlgo::_fit(handle, X, n, d, knn, get_params(), embeddings, 
stream); CUDA_CHECK(cudaStreamDestroy(stream)); } - void UMAP_API::fit(float *X, float *y, int n, int d, float *embeddings) { - this->knn = new kNN(d); + void UMAP_API::fit(const cumlHandle &handle, float *X, float *y, int n, int d, float *embeddings) { + this->knn = new kNN(handle, d); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - UMAPAlgo::_fit(X, y, n, d, knn, get_params(), embeddings, stream); + UMAPAlgo::_fit(handle, X, y, n, d, knn, get_params(), embeddings, stream); CUDA_CHECK(cudaStreamDestroy(stream)); } - /** * Project a set of X vectors into the embedding space. * @param X @@ -76,12 +75,11 @@ namespace ML { * @param out * pointer to array for storing output embeddings (n, n_components) */ - void UMAP_API::transform(float *X, int n, int d, - float *embedding, int embedding_n, - float *out) { + void UMAP_API::transform(const cumlHandle &handle, float *X, int n, int d, + float *embedding, int embedding_n, float *out) { cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); - UMAPAlgo::_transform(X, n, d, + UMAPAlgo::_transform(handle, X, n, d, embedding, embedding_n, knn, get_params(), out, stream); CUDA_CHECK(cudaStreamDestroy(stream)); diff --git a/cuML/src/umap/umap.h b/cuML/src/umap/umap.h index f658c1bcbd..25d4827d23 100644 --- a/cuML/src/umap/umap.h +++ b/cuML/src/umap/umap.h @@ -16,6 +16,8 @@ #pragma once +#include "common/cumlHandle.hpp" + #include "umapparams.h" #include "knn/knn.h" @@ -44,7 +46,7 @@ namespace ML { * @param embeddings * an array to return the output embeddings of size (n_samples, n_components) */ - void fit(float *X, int n, int d, float *embeddings); + void fit(const cumlHandle &handle, float *X, int n, int d, float *embeddings); /** * Fits a supervised UMAP model @@ -59,7 +61,7 @@ namespace ML { * @param embeddings * an array to return the output embeddings of size (n_samples, n_components) */ - void fit(float *X, float *y, int n, int d, float *embeddings); + void fit(const cumlHandle &handle, float *X, float 
*y, int n, int d, float *embeddings); /** * Project a set of X vectors into the embedding space. @@ -76,7 +78,7 @@ namespace ML { * @param out * pointer to array for storing output embeddings (n, n_components) */ - void transform(float *X, int n, int d, + void transform(const cumlHandle &handle, float *X, int n, int d, float *embedding, int embedding_n, float *out); diff --git a/cuML/test/knn_test.cu b/cuML/test/knn_test.cu index 97ba8438c4..5c9d6a527a 100644 --- a/cuML/test/knn_test.cu +++ b/cuML/test/knn_test.cu @@ -94,7 +94,8 @@ protected: long *d_ref_I; T* d_ref_D; - kNN *knn = new kNN(d); + cumlHandle handle; + kNN *knn = new kNN(handle, d); }; diff --git a/cuML/test/spectral_test.cu b/cuML/test/spectral_test.cu index 038562a020..807a7512af 100644 --- a/cuML/test/spectral_test.cu +++ b/cuML/test/spectral_test.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "cuML.hpp" + #include "spectral/spectral.h" #include "random/rng.h" @@ -48,18 +50,18 @@ TEST_F(TestSpectralClustering, Fit) { int k = 3; float *X; - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); MLCommon::allocate(X, n*d); + cumlHandle handle; + Random::Rng r(150, MLCommon::Random::GenTaps); - r.uniform(X, n*d, -1.0f, 1.0f, stream); + r.uniform(X, n*d, -1.0f, 1.0f, handle.getStream()); int *out; MLCommon::allocate(out, n, true); - ML::Spectral::fit_clusters(X, n, d, k, 10, 1e-3f, out); - CUDA_CHECK(cudaStreamDestroy(stream)); + + ML::Spectral::fit_clusters(handle, X, n, d, k, 10, 1e-3f, out); } typedef SpectralTest TestSpectralEmbedding; @@ -70,18 +72,16 @@ TEST_F(TestSpectralEmbedding, Fit) { int k = 3; float *X; - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); + cumlHandle handle; MLCommon::allocate(X, n*d); Random::Rng r(150, MLCommon::Random::GenTaps); - r.uniform(X, n*d, -1.0f, 1.0f, stream); + r.uniform(X, n*d, -1.0f, 1.0f, handle.getStream()); float *out; MLCommon::allocate(out, n*2, true); - ML::Spectral::fit_embedding(X, n, d, k, 2, out); - 
CUDA_CHECK(cudaStreamDestroy(stream)); + ML::Spectral::fit_embedding(handle, X, n, d, k, 2, out); } diff --git a/cuML/test/umap_test.cu b/cuML/test/umap_test.cu index 5a0debdd4c..7fcc2a7249 100644 --- a/cuML/test/umap_test.cu +++ b/cuML/test/umap_test.cu @@ -16,6 +16,8 @@ #include +#include "cuML.hpp" + #include "umap/umapparams.h" #include "umap/runner.h" #include "knn/knn.h" @@ -38,12 +40,14 @@ class UMAPTest: public ::testing::Test { protected: void basicTest() { + cumlHandle handle; + umap_params = new UMAPParams(); umap_params->n_neighbors = k; umap_params->verbose = true; umap_params->target_metric = UMAPParams::MetricType::CATEGORICAL; - kNN *knn = new kNN(d); + kNN *knn = new kNN(handle, d); cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); UMAPAlgo::find_ab(umap_params, stream); @@ -69,7 +73,7 @@ protected: std::cout << "Performing fit()" << std::endl; - UMAPAlgo::_fit(X_d, n, d, knn, umap_params, embeddings, stream); + UMAPAlgo::_fit(handle, X_d, n, d, knn, umap_params, embeddings, stream); std::cout << "done." << std::endl; @@ -78,13 +82,13 @@ protected: float *xformed; MLCommon::allocate(xformed, n*umap_params->n_components); - UMAPAlgo::_transform(X_d, n, d, embeddings, n, knn, umap_params, xformed, stream); + UMAPAlgo::_transform(handle, X_d, n, d, embeddings, n, knn, umap_params, xformed, stream); std::cout << "Done." << std::endl; std::cout << "Performing supervised fit" << std::endl; - UMAPAlgo::_fit(X_d, Y_d, n, d, knn, umap_params, embeddings, stream); + UMAPAlgo::_fit(handle, X_d, Y_d, n, d, knn, umap_params, embeddings, stream); std::cout << "Done." 
<< std::endl; diff --git a/cuML/test_mg/knn_test.cu b/cuML/test_mg/knn_test.cu index b817d25e94..376e13c475 100644 --- a/cuML/test_mg/knn_test.cu +++ b/cuML/test_mg/knn_test.cu @@ -105,7 +105,8 @@ protected: long *d_ref_I; T* d_ref_D; - kNN *knn = new kNN(d); + cumlHandle handle; + kNN *knn = new kNN(handle, d); }; From abe0805f3ede9c5d5dabdbf59f80752450a3fb0f Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 13 May 2019 12:12:28 -0400 Subject: [PATCH 114/156] Fixing nearestNeighbors so that it can extend Base --- cuML/src/knn/knn.cu | 1 + cuML/src/spectral/spectral.h | 6 --- cuML/src/umap/init_embed/runner.h | 2 +- cuML/src/umap/umap.cu | 18 ++------- python/cuml/neighbors/nearest_neighbors.pyx | 44 ++++++++++----------- 5 files changed, 28 insertions(+), 43 deletions(-) diff --git a/cuML/src/knn/knn.cu b/cuML/src/knn/knn.cu index 518278fa17..174bdab954 100644 --- a/cuML/src/knn/knn.cu +++ b/cuML/src/knn/knn.cu @@ -42,6 +42,7 @@ namespace ML { D(D), total_n(0), indices(0), verbose(verbose), owner(false) { this->handle = const_cast(&handle.getImpl()); } + kNN::~kNN() { try { diff --git a/cuML/src/spectral/spectral.h b/cuML/src/spectral/spectral.h index 324fd4fc96..4ea0d4c4ec 100644 --- a/cuML/src/spectral/spectral.h +++ b/cuML/src/spectral/spectral.h @@ -338,17 +338,11 @@ namespace ML { knn.fit(*¶ms, 1); knn.search(X, m, knn_indices, knn_dists, n_neighbors); - std::cout << "About to call" << std::endl; - fit_embedding(handle, knn_indices, knn_dists, m, n_neighbors, n_components, out); - std::cout << "DONE!" << std::endl; - CUDA_CHECK(cudaFree(knn_indices)); CUDA_CHECK(cudaFree(knn_dists)); - - std::cout << "DONE!" 
<< std::endl; } } } diff --git a/cuML/src/umap/init_embed/runner.h b/cuML/src/umap/init_embed/runner.h index 7b6feaf17c..4317498ef7 100644 --- a/cuML/src/umap/init_embed/runner.h +++ b/cuML/src/umap/init_embed/runner.h @@ -43,7 +43,7 @@ namespace UMAPAlgo { */ case 0: RandomInit::launcher(X, n, d, knn_indices, knn_dists, params, embedding, - stream); + handle); break; case 1: diff --git a/cuML/src/umap/umap.cu b/cuML/src/umap/umap.cu index a906420147..c9f74ae5d9 100644 --- a/cuML/src/umap/umap.cu +++ b/cuML/src/umap/umap.cu @@ -45,19 +45,13 @@ namespace ML { */ void UMAP_API::fit(const cumlHandle &handle, float *X, int n, int d, float *embeddings) { this->knn = new kNN(handle, d); - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - UMAPAlgo::_fit(handle, X, n, d, knn, get_params(), embeddings, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + UMAPAlgo::_fit(handle, X, n, d, knn, get_params(), embeddings, handle.getStream()); } void UMAP_API::fit(const cumlHandle &handle, float *X, float *y, int n, int d, float *embeddings) { this->knn = new kNN(handle, d); - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - UMAPAlgo::_fit(handle, X, y, n, d, knn, get_params(), embeddings, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + UMAPAlgo::_fit(handle, X, y, n, d, knn, get_params(), embeddings, handle.getStream()); } /** @@ -77,12 +71,8 @@ namespace ML { */ void UMAP_API::transform(const cumlHandle &handle, float *X, int n, int d, float *embedding, int embedding_n, float *out) { - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - UMAPAlgo::_transform(handle, X, n, d, - embedding, embedding_n, knn, - get_params(), out, stream); - CUDA_CHECK(cudaStreamDestroy(stream)); + UMAPAlgo::_transform(handle, X, n, d, embedding, embedding_n, knn, + get_params(), out, handle.getStream()); } /** diff --git a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx index e70b87b014..aeb110d895 100644 --- 
a/python/cuml/neighbors/nearest_neighbors.pyx +++ b/python/cuml/neighbors/nearest_neighbors.pyx @@ -60,7 +60,7 @@ cdef extern from "knn/knn.h" namespace "ML": ) -cdef class NearestNeighbors: +class NearestNeighbors: """ NearestNeighbors is a unsupervised algorithm where if one wants to find the "closest" datapoint(s) to new unseen data, one can calculate a suitable "distance" between @@ -159,27 +159,27 @@ cdef class NearestNeighbors: `_. """ - cpdef kNN *k - - cdef int num_gpus - - cdef uintptr_t X_ctype - - cdef uintptr_t I_ptr - cdef uintptr_t D_ptr - - cdef object X_m - - cdef bool _should_downcast - cdef object n_gpus - cdef object devices - cdef bool _verbose - - cdef object n_neighbors - - cpdef kNNParams *input - - def __cinit__(self, n_neighbors = 5, n_gpus = 1, devices = None, verbose = False, should_downcast = True): + # cpdef kNN *k + # + # cdef int num_gpus + # + # cdef uintptr_t X_ctype + # + # cdef uintptr_t I_ptr + # cdef uintptr_t D_ptr + # + # cdef object X_m + # + # cdef bool _should_downcast + # cdef object n_gpus + # cdef object devices + # cdef bool _verbose + # + # cdef object n_neighbors + # + # cpdef kNNParams *input + + def __init__(self, n_neighbors = 5, n_gpus = 1, devices = None, verbose = False, should_downcast = True): """ Construct the NearestNeighbors object for training and querying. 
From a0901b09834440e6f3f8c9eb64d202492004559a Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 13 May 2019 13:39:30 -0500 Subject: [PATCH 115/156] FIX Correct kmeans typo --- python/cuml/cluster/kmeans.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/cluster/kmeans.pyx b/python/cuml/cluster/kmeans.pyx index 7cf561651e..2aac26ac08 100644 --- a/python/cuml/cluster/kmeans.pyx +++ b/python/cuml/cluster/kmeans.pyx @@ -404,7 +404,7 @@ class KMeans(Base): self.handle.sync() cc_df = cudf.DataFrame() for i in range(0, self.n_cols): - n_c = self.n_cluster + n_c = self.n_clusters n_cols = self.n_cols cc_df[str(i)] = self.cluster_centers_[i:n_c*n_cols:n_cols] self.cluster_centers_ = cc_df From 4432f5f4dd5ab37324351586f5bf54ea2a4bd499 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Mon, 13 May 2019 14:40:49 -0400 Subject: [PATCH 116/156] Adding cumlhandle to mlprims level due to updates in sgd --- ml-prims/test/hinge.cu | 19 +++++++++++-------- ml-prims/test/linearReg.cu | 19 +++++++++++-------- ml-prims/test/logisticReg.cu | 19 +++++++++++-------- 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/ml-prims/test/hinge.cu b/ml-prims/test/hinge.cu index 3ebedcbc0f..384bb0ed76 100644 --- a/ml-prims/test/hinge.cu +++ b/ml-prims/test/hinge.cu @@ -32,6 +32,8 @@ protected: cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); + allocator.reset(new defaultDeviceAllocator); + allocate(in, len); allocate(out, 1); allocate(out_lasso, 1); @@ -90,38 +92,38 @@ protected: T l1_ratio = 0.5; hingeLoss(in, params.n_rows, params.n_cols, labels, coef, out, penalty::NONE, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); hingeLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_grad, penalty::NONE, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); 
hingeLoss(in, params.n_rows, params.n_cols, labels, coef, out_lasso, penalty::L1, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); hingeLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_lasso_grad, penalty::L1, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); hingeLoss(in, params.n_rows, params.n_cols, labels, coef, out_ridge, penalty::L2, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); hingeLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_ridge_grad, penalty::L2, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); hingeLoss(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); hingeLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet_grad, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); @@ -159,6 +161,7 @@ protected: T *out_ref, *out_lasso_ref, *out_ridge_ref, *out_elasticnet_ref; T *out_grad, *out_lasso_grad, *out_ridge_grad, *out_elasticnet_grad; T *out_grad_ref, *out_lasso_grad_ref, *out_ridge_grad_ref, *out_elasticnet_grad_ref; + std::shared_ptr allocator; }; const std::vector > inputsf = { diff --git a/ml-prims/test/linearReg.cu b/ml-prims/test/linearReg.cu index 558a8b1fee..0254ba17f1 100644 --- a/ml-prims/test/linearReg.cu +++ b/ml-prims/test/linearReg.cu @@ -32,6 +32,8 @@ protected: cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); + allocator.reset(new defaultDeviceAllocator); + allocate(in, len); allocate(out, 1); allocate(out_lasso, 1); @@ -90,38 +92,38 @@ protected: T 
l1_ratio = 0.5; linearRegLoss(in, params.n_rows, params.n_cols, labels, coef, out, penalty::NONE, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); linearRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_grad, penalty::NONE, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); linearRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_lasso, penalty::L1, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); linearRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_lasso_grad, penalty::L1, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); linearRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_ridge, penalty::L2, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); linearRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_ridge_grad, penalty::L2, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); linearRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); linearRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet_grad, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); @@ -159,6 +161,7 @@ protected: T *out_ref, *out_lasso_ref, *out_ridge_ref, *out_elasticnet_ref; T *out_grad, *out_lasso_grad, *out_ridge_grad, *out_elasticnet_grad; T *out_grad_ref, *out_lasso_grad_ref, *out_ridge_grad_ref, 
*out_elasticnet_grad_ref; + std::shared_ptr allocator; }; const std::vector > inputsf = { diff --git a/ml-prims/test/logisticReg.cu b/ml-prims/test/logisticReg.cu index d8c31b9ab7..2a61cc9d1e 100644 --- a/ml-prims/test/logisticReg.cu +++ b/ml-prims/test/logisticReg.cu @@ -31,6 +31,8 @@ protected: cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); + allocator.reset(new defaultDeviceAllocator); + allocate(in, len); allocate(out, 1); allocate(out_lasso, 1); @@ -89,38 +91,38 @@ protected: T l1_ratio = 0.5; logisticRegLoss(in, params.n_rows, params.n_cols, labels, coef, out, penalty::NONE, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); logisticRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_grad, penalty::NONE, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); logisticRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_lasso, penalty::L1, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); logisticRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_lasso_grad, penalty::L1, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); logisticRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_ridge, penalty::L2, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); logisticRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_ridge_grad, penalty::L2, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); logisticRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, 
cublas_handle, allocator, stream); logisticRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet_grad, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); @@ -158,6 +160,7 @@ protected: T *out_ref, *out_lasso_ref, *out_ridge_ref, *out_elasticnet_ref; T *out_grad, *out_lasso_grad, *out_ridge_grad, *out_elasticnet_grad; T *out_grad_ref, *out_lasso_grad_ref, *out_ridge_grad_ref, *out_elasticnet_grad_ref; + std::shared_ptr allocator; }; const std::vector > inputsf = { From 12b1673693636230e420195b5f0eac19a97945bf Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Mon, 13 May 2019 13:59:27 -0500 Subject: [PATCH 117/156] DOC Added documentation of rules skipped for cython flake8 --- python/.flake8.cython | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/.flake8.cython b/python/.flake8.cython index 4eb437c8ea..a25d52614e 100644 --- a/python/.flake8.cython +++ b/python/.flake8.cython @@ -18,3 +18,11 @@ filename = *.pyx, *.pxd exclude = *.egg, build, docs, .git ignore = E999, E225, E226, E227, W503, W504 + +# Rules ignored: +# E999: invalid syntax (works for Python, not Cython) +# E225: Missing whitespace around operators (breaks cython casting syntax like ) +# E226: Missing whitespace around arithmetic operators (breaks cython pointer syntax like int*) +# E227: Missing whitespace around bitwise or shift operator (Can also break casting syntax) +# W503: line break before binary operator (breaks lines that start with a pointer) +# W504: line break after binary operator (breaks lines that end with a pointer) From ad4d4e51cadb4656bbe51af37eead366f1f4c02b Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Mon, 13 May 2019 15:06:39 -0400 Subject: [PATCH 118/156] Removing all the cudafree and allocate functions that were oversighted before. 
--- cuML/src/solver/cd.h | 6 +++--- cuML/src/solver/sgd.h | 4 ++-- ml-prims/src/functions/hinge.h | 28 ++++++++++++---------------- ml-prims/src/functions/linearReg.h | 28 ++++++++++++---------------- ml-prims/src/functions/logisticReg.h | 28 ++++++++++++---------------- 5 files changed, 41 insertions(+), 53 deletions(-) diff --git a/cuML/src/solver/cd.h b/cuML/src/solver/cd.h index 25aecd534c..82b85acc84 100644 --- a/cuML/src/solver/cd.h +++ b/cuML/src/solver/cd.h @@ -101,10 +101,10 @@ void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, std::vector h_coef(n_cols, math_t(0)); if (fit_intercept) { - mu_input.reserve(n_cols, stream); - mu_labels.reserve(1, stream); + mu_input.resize(n_cols, stream); + mu_labels.resize(1, stream); if (normalize) { - norm2_input.reserve(n_cols, stream); + norm2_input.resize(n_cols, stream); } GLM::preProcessData(handle, input, n_rows, n_cols, labels, diff --git a/cuML/src/solver/sgd.h b/cuML/src/solver/sgd.h index 7dbf4d7b90..2539aae0da 100644 --- a/cuML/src/solver/sgd.h +++ b/cuML/src/solver/sgd.h @@ -110,8 +110,8 @@ void sgdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, device_buffer norm2_input(allocator, stream, 0); if (fit_intercept) { - mu_input.reserve(n_cols, stream); - mu_labels.reserve(1, stream); + mu_input.resize(n_cols, stream); + mu_labels.resize(1, stream); GLM::preProcessData(handle, input, n_rows, n_cols, labels, intercept, mu_input.data(), mu_labels.data(), norm2_input.data(), diff --git a/ml-prims/src/functions/hinge.h b/ml-prims/src/functions/hinge.h index dc3aeccac9..10c60dd4ac 100644 --- a/ml-prims/src/functions/hinge.h +++ b/ml-prims/src/functions/hinge.h @@ -90,23 +90,21 @@ void hingeLossGrads(math_t *input, int n_rows, int n_cols, hingeLossGradMult(input, labels, labels_pred.data(), n_rows, n_cols, stream); Stats::mean(grads, input, n_cols, n_rows, false, false, stream); - math_t *pen_grads = NULL; + device_buffer pen_grads(allocator, stream, 0); if (pen != 
penalty::NONE) - allocate(pen_grads, n_cols); + pen_grads.resize(n_cols, stream); if (pen == penalty::L1) { - lassoGrad(pen_grads, coef, n_cols, alpha, stream); + lassoGrad(pen_grads.data(), coef, n_cols, alpha, stream); } else if (pen == penalty::L2) { - ridgeGrad(pen_grads, coef, n_cols, alpha, stream); + ridgeGrad(pen_grads.data(), coef, n_cols, alpha, stream); } else if (pen == penalty::ELASTICNET) { - elasticnetGrad(pen_grads, coef, n_cols, alpha, l1_ratio, stream); + elasticnetGrad(pen_grads.data(), coef, n_cols, alpha, l1_ratio, stream); } if (pen != penalty::NONE) { - LinAlg::add(grads, grads, pen_grads, n_cols, stream); - if (pen_grads != NULL) - CUDA_CHECK(cudaFree(pen_grads)); + LinAlg::add(grads, grads, pen_grads.data(), n_cols, stream); } } @@ -129,23 +127,21 @@ void hingeLoss(math_t *input, int n_rows, int n_cols, Stats::sum(loss, labels_pred.data(), 1, n_rows, false, stream); - math_t *pen_val = NULL; + device_buffer pen_val(allocator, stream, 0); if (pen != penalty::NONE) - allocate(pen_val, 1); + pen_val.resize(1, stream); if (pen == penalty::L1) { - lasso(pen_val, coef, n_cols, alpha, stream); + lasso(pen_val.data(), coef, n_cols, alpha, stream); } else if (pen == penalty::L2) { - ridge(pen_val, coef, n_cols, alpha, stream); + ridge(pen_val.data(), coef, n_cols, alpha, stream); } else if (pen == penalty::ELASTICNET) { - elasticnet(pen_val, coef, n_cols, alpha, l1_ratio, stream); + elasticnet(pen_val.data(), coef, n_cols, alpha, l1_ratio, stream); } if (pen != penalty::NONE) { - LinAlg::add(loss, loss, pen_val, 1, stream); - if (pen_val != NULL) - CUDA_CHECK(cudaFree(pen_val)); + LinAlg::add(loss, loss, pen_val.data(), 1, stream); } } diff --git a/ml-prims/src/functions/linearReg.h b/ml-prims/src/functions/linearReg.h index 9fd84ab9ba..2d3430b963 100644 --- a/ml-prims/src/functions/linearReg.h +++ b/ml-prims/src/functions/linearReg.h @@ -62,23 +62,21 @@ void linearRegLossGrads(math_t *input, int n_rows, int n_cols, Stats::mean(grads, input, n_cols, 
n_rows, false, false, stream); LinAlg::scalarMultiply(grads, grads, math_t(2), n_cols, stream); - math_t *pen_grads = NULL; + device_buffer pen_grads(allocator, stream, 0); if (pen != penalty::NONE) - allocate(pen_grads, n_cols); + pen_grads.resize(n_cols, stream); if (pen == penalty::L1) { - lassoGrad(pen_grads, coef, n_cols, alpha, stream); + lassoGrad(pen_grads.data(), coef, n_cols, alpha, stream); } else if (pen == penalty::L2) { - ridgeGrad(pen_grads, coef, n_cols, alpha, stream); + ridgeGrad(pen_grads.data(), coef, n_cols, alpha, stream); } else if (pen == penalty::ELASTICNET) { - elasticnetGrad(pen_grads, coef, n_cols, alpha, l1_ratio, stream); + elasticnetGrad(pen_grads.data(), coef, n_cols, alpha, l1_ratio, stream); } if (pen != penalty::NONE) { - LinAlg::add(grads, grads, pen_grads, n_cols, stream); - if (pen_grads != NULL) - CUDA_CHECK(cudaFree(pen_grads)); + LinAlg::add(grads, grads, pen_grads.data(), n_cols, stream); } } @@ -98,23 +96,21 @@ void linearRegLoss(math_t *input, int n_rows, int n_cols, Matrix::power(labels_pred.data(), n_rows, stream); Stats::mean(loss, labels_pred.data(), 1, n_rows, false, false, stream); - math_t *pen_val = NULL; + device_buffer pen_val(allocator, stream, 0); if (pen != penalty::NONE) - allocate(pen_val, 1); + pen_val.resize(1, stream); if (pen == penalty::L1) { - lasso(pen_val, coef, n_cols, alpha, stream); + lasso(pen_val.data(), coef, n_cols, alpha, stream); } else if (pen == penalty::L2) { - ridge(pen_val, coef, n_cols, alpha, stream); + ridge(pen_val.data(), coef, n_cols, alpha, stream); } else if (pen == penalty::ELASTICNET) { - elasticnet(pen_val, coef, n_cols, alpha, l1_ratio, stream); + elasticnet(pen_val.data(), coef, n_cols, alpha, l1_ratio, stream); } if (pen != penalty::NONE) { - LinAlg::add(loss, loss, pen_val, 1, stream); - if (pen_val != NULL) - CUDA_CHECK(cudaFree(pen_val)); + LinAlg::add(loss, loss, pen_val.data(), 1, stream); } } diff --git a/ml-prims/src/functions/logisticReg.h 
b/ml-prims/src/functions/logisticReg.h index df4173a303..afa60119f4 100644 --- a/ml-prims/src/functions/logisticReg.h +++ b/ml-prims/src/functions/logisticReg.h @@ -63,23 +63,21 @@ void logisticRegLossGrads(math_t *input, int n_rows, int n_cols, Stats::mean(grads, input, n_cols, n_rows, false, false, stream); - math_t *pen_grads = NULL; + device_buffer pen_grads(allocator, stream, 0); if (pen != penalty::NONE) - allocate(pen_grads, n_cols); + pen_grads.resize(n_cols, stream); if (pen == penalty::L1) { - lassoGrad(pen_grads, coef, n_cols, alpha, stream); + lassoGrad(pen_grads.data(), coef, n_cols, alpha, stream); } else if (pen == penalty::L2) { - ridgeGrad(pen_grads, coef, n_cols, alpha, stream); + ridgeGrad(pen_grads.data(), coef, n_cols, alpha, stream); } else if (pen == penalty::ELASTICNET) { - elasticnetGrad(pen_grads, coef, n_cols, alpha, l1_ratio, stream); + elasticnetGrad(pen_grads.data(), coef, n_cols, alpha, l1_ratio, stream); } if (pen != penalty::NONE) { - LinAlg::add(grads, grads, pen_grads, n_cols, stream); - if (pen_grads != NULL) - CUDA_CHECK(cudaFree(pen_grads)); + LinAlg::add(grads, grads, pen_grads.data(), n_cols, stream); } } @@ -114,23 +112,21 @@ void logisticRegLoss(math_t *input, int n_rows, int n_cols, Stats::mean(loss, labels_pred.data(), 1, n_rows, false, false, stream); - math_t *pen_val = NULL; + device_buffer pen_val(allocator, stream, 0); if (pen != penalty::NONE) - allocate(pen_val, 1); + pen_val.resize(1, stream); if (pen == penalty::L1) { - lasso(pen_val, coef, n_cols, alpha, stream); + lasso(pen_val.data(), coef, n_cols, alpha, stream); } else if (pen == penalty::L2) { - ridge(pen_val, coef, n_cols, alpha, stream); + ridge(pen_val.data(), coef, n_cols, alpha, stream); } else if (pen == penalty::ELASTICNET) { - elasticnet(pen_val, coef, n_cols, alpha, l1_ratio, stream); + elasticnet(pen_val.data(), coef, n_cols, alpha, l1_ratio, stream); } if (pen != penalty::NONE) { - LinAlg::add(loss, loss, pen_val, 1, stream); - if (pen_val != 
NULL) - CUDA_CHECK(cudaFree(pen_val)); + LinAlg::add(loss, loss, pen_val.data(), 1, stream); } } From ca73fdaf6fa25ee0235486ea10513555cf976452 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 13 May 2019 12:31:26 -0700 Subject: [PATCH 119/156] Updating cython for cuml handle --- cuML/src/knn/.knn.cu.swo | Bin 0 -> 24576 bytes cuML/src/umap/init_embed/runner.h | 2 +- python/cuml/common/base.pyx | 2 +- python/cuml/neighbors/nearest_neighbors.pyx | 74 +++++++++++++++----- 4 files changed, 58 insertions(+), 20 deletions(-) create mode 100644 cuML/src/knn/.knn.cu.swo diff --git a/cuML/src/knn/.knn.cu.swo b/cuML/src/knn/.knn.cu.swo new file mode 100644 index 0000000000000000000000000000000000000000..ce2e311c95113f9886d033abc1f7bacf32c77d5f GIT binary patch literal 24576 zcmeI4dyFJUdBEE*?7>gjArFj6-CA5`A2)k?@aG* z-_CT8yL)cEKA&GAIK&2vnB)(m00OKC1&b&sVDb<_kw}7on1_&voq%K~5JkXW5mB(? z{JyH{$IQ;2*NG8IGA;e4XS(b8Rn=GZeO0?lyKk!=QNLEX-r>5!aVkIf+B@EV-e)ho z=XV{a9)wLVkr!vb8Y}ngdHdOe=(ppM>o>;hCv~#p>tVDOx81roUhlM;d>ok_kE8nd zS`Y}8dS@F+4uT9MFp$7@B+yBgcfI}6#q0L&*)74Z9onI;x^i(lF$ZA=5*SEeAc27d z1`-%ZU?72k1O^fqNZ|j71d{go&I8otTw9;h_I^&!{Q~>DV(<6$yg%Rm-e>QB*z^8B zUoqE)ef~nvd;4>68%SUvfq?`D5*SEeAc27d1`-%ZU?72k1O^fqNMIm=pMnJ3hU5Gk z>ApoA0PO##_W#S5JI>eP@8B`G1+IZN!}l+9oPUB3!5wfZeDPAp`8a$8J`9h*DR>9$ zhD+f4mpIO|@HBi9J_zrDQ_z6~JZL~2-U(O0cQ1CF=iokQK@(g!1iu3R{6@!l5*~;5 z!7^M34!m4)oR{Ex@K^9LxF2e86Z|S%2YbPR7cX+0zkol7kAeq>;g{gwFLa#e;VF0& z9)bs94OU?Vmf)>$E`0h8j`J>f02bjmEWi!$LmVnE!VB_>>aROBU{DDZ<7wMK$2u@cqrUfi2rRY?h6b9#QzAlI(c0@YI)U3Ubgin1u? 
zUwlrWL?m#^t0%?!>l$TZ(Xkh_xK*<1iTHH@{e4$j*(|su=%y)aai-}zrSXvwM${RH z+UqO-CO?x^Ki)N&go)d%1#0ww(hU|@=5QeD;P#_CmOS(uwa5)tytvZt#H+O>x4u@k z=?w34Y&3oIm&?A|pQS9Fs`%Ozfulw>o440$H#v`Fk~GhHQZ!L)+l-EOXNRngNw14- zj*J06Xm^qmJ|iT{nGRHzKJN2+_dag>)f~6cQ7Inc=b}S-Ntvw8Pk9t4jfshR*h$p> z{c33Lz>wVM9?DnTJcYL z!RCQr)Q3uAo|P~6TI?}h@; zYj}+VLt6$Al^AsbI!mfbD{9NoMt63l)zg#1M0r8jSy|OpNwq9(X4P(DY~t@I=V8Zh`DM_ zBWEfvQ?Xh4*{e(&>Q@OX2hr|mk%@`>`9wk}KmAjF$%Me#NB6}v(FrwEo*A|swHB0j z^WCZ${oRHur{Twx&a&+d%*|0h< zTsp~u!K?u|zN=R~ZqP7mCy^PdvCq0dt4j2zUIaLS+w$~LUv*?w;c8+TTeZb)ukJ7Vb=73i)Nxms zmO{FWZpgM5wUDDul`2Rww^xtfa&+;yx_xSXeroP`_27a!IyVn7 zyF=Ytouk8gK3PWSh&Ej*R;-PE)=<9J&_$9papW&`5_HC-{3WjNXXlg-K z7lzc$Qw!CFF)7eu(3GSj@9vH-%7^LKZIegcU1ExDy6cjwyTS-=$kar4UXOg4oX1og zE+^|O&q+p7l4~_d+7sjB>+9%`I;RB_p-ltzb2+3IxVrY}nkwT|qHsc4be z7A?waN|!Tj>oIkk7fFjL%BWcCd1}Z;8j`s}*L0dPMi@GYD%XeA-raj{7*lh%RcESG zwB-D;qw`bJmKA4w#1Z>{fzAGf^B9)c|LOkxSFrORheu!nZh&8gZ({3z3H}zIfzLu6 zZiGn~f*tTC_*ZQH@4%h;k(%P&%l%LAS}TG z+yXN&4I^+BTn_(&eg7r+7~Btc!eMwD`~ds@d3YKghBY_|H^VNt6uyN`{~7or_!PVs zLI_|M_JD#5;K$hX{{UZrr{K3?5st$(a3#D*`+X7~fd}DU=)m33f+qN|3M(M(E&hSQ z^}0)7e4Hh1Llq}I@;D>r93Lf?k9Z~Yhy6@%JjTMGsrkVPQJhSVBPv#-s(lx_jKQtn zmrAQlX*IEZ_fJo%^604kEz7+zrN5rSXduEWGg8fO#CS#Yg8RhfKF#M*}>+0B?p zSw&9l&R)qPX{TzFq_y!1(d1pso2=G_c(g}5Y^x=4)39=FVf~pBp5Z5YnuVCYY}V5g z6M@&EcuiS+n^(3fyY|=zBFjt?>VU}2CL{hKy_`F#SNQ6rNPM3|SVV{;>Gih6In0U}KpHdz|m&f!SU9#TZ($!N;!@bsyB6aI&v%892BNx7|p)QfG zwXT;m0&b0%Y6rS8_+F^Hv{ONFVj@XBjAzdZhOS8>ZnIhIC!a1Spo|q8wvVtRWoi5< zY;@|fCb17>(v(6lvA1KzaOtoPX6+!)b(z&Vs~WwWVzI&cfp~ch3yzy0bKDT(+IHlw zwD32y+Oqi4k&II`AC{DLsk6MdB8~~$yR?yEmc09Q+fV8ss%*q(x}b;+U62lGl1Uwk zqp6rBKNBo{rSsYnmo4lv(-&FgH5mHjqZ`gXCVzL zmT1?g5F^O6ZgJSBV23bT=3}}J;c~;bITYK92X{WZDCpW zbY5Lqo8sB@TD`BcieZ}PiAa-)@U?-Zj!4DCYl!2C6=xK-a7?krcblfmh><5{A%$Rt z<){fMZrh4>4(6^W@zxo~=a>p=_0>+WCKHdNi^oSEQl6NQxHqLg7DDNF;whBpNmOfe z`!L#-IbJlqU?o}YeuF7|%z99Jk~=ax9kyCwFfq|-GkCli?UXO;)W;-~QR}iP>q2WM z&hUtj$!t1p*MmQ-peI!ZlZ`pTJi#0t40k1MqFH&8dMK;xf|I~gHYO-tx$uKH9``Sk%q08)ENt#V}KVl{>>_4<$ 
z$6XG|FP~h{oHfzBwbd|A%DE;;wJ8gpE{-<64YPTQmtytUtvPbOER!8sr zuhWx{eA}-$|8I`L6B{QxQGtI&uRZMXCsrIfS0#H$>2k)=k=S{VOZKW*t590j7nrp1 zF#9?U_CfS1iXbqjDA?>Ufv3gB#5C2Jn2L7V#$F8r7B&%1nqWg6KDLO>-tah+)$#+g z4Ww%wg{Lr^*}CFNPaKs>oYNqvHFRrL@kw+)nqi%jJrgs$7)E}Z0QvH?%j{&)DF@e# z9WCDqr{RoUSt^|+>P!}uZ=0f0u2RV+orfe7T`Y6sSAGo2IFqp6N~NNUW53=2otT(D z;s?gFq?etOliKycxjpihC;H&Zn21AG&iKo+t9vCak2gP&r`?;XP= zOvWCVGAYOzv)-KLx2}dbw}KTu*_2jsyg6j%G|s&u#l-qcC`~;(!IcRqJ5_P^0~Ez9 z((l_jPmzZ0z4zF`$2@OsDw%cT%DJfAxCP(L-m{^eBILWNk#;YV<8hpf_y4 z-?~rti_hNgHm!oqNZ!88#KUH)wUrjSlTn*vWu5-uG4HT=rJ|y&PiccA5Jr(eIvW#vWqLfIIRt+zZdhqHF=(Z`zY8B$<5v3dqa@qgC3%hv_cD4The|F&`=l(we zABKCO0=vM04`Ay*0e6C&{~v?TVe3Bw55WEK>+l?Q{@=s5;T!NN_yoKQZi7Xrz>C=X zFTkI`{cr|G;c_?!UZ!j>!B60M_!c|^?}6Wf+h7r-4p+iO@Tb)2bMOev!mY3eeuVA+ zd3YFZf*V2h^S_NR;1h5fsxSoqhF|2X@HiZWIk*b0gy-0oe>b!t-M3%l`zSoketZMw z;bxeG1K_}4v*-RrcodGoQMeJtp#oRKMQ{O}4=-^x|7my-HsCrq2maFjg9hDWGMA5; z`(hpr){KKS<6zA=STm+GioFlkj2Jymk5Kyl9IP3OVi~L%|DUZH^TzMC?vLd!%(tG` z$LA;1W=o=Xdyt*K{7)6SH!--}m5Y7~rHZ@Da`t9?yi_{PXIj|e zds!;$11+h`BMr2bd!yerw8YtBD>^mvva<$OlMIhs-^6>9c_@a5H_Pe2VPU*R>*DpZ zEY4DOvV)xy5pNeaF z+lw)X>6e$$1m*C?mS&!gt{sQ9nO^Tq|JpHRynFtaRjskygTR1M){a4uiBm5UI4WwcK literal 0 HcmV?d00001 diff --git a/cuML/src/umap/init_embed/runner.h b/cuML/src/umap/init_embed/runner.h index 4317498ef7..ce7779fe68 100644 --- a/cuML/src/umap/init_embed/runner.h +++ b/cuML/src/umap/init_embed/runner.h @@ -43,7 +43,7 @@ namespace UMAPAlgo { */ case 0: RandomInit::launcher(X, n, d, knn_indices, knn_dists, params, embedding, - handle); + handle.getStream()); break; case 1: diff --git a/python/cuml/common/base.pyx b/python/cuml/common/base.pyx index 1027fa4f5c..7b625f950c 100644 --- a/python/cuml/common/base.pyx +++ b/python/cuml/common/base.pyx @@ -26,7 +26,7 @@ import cuml.common.cuda import cudf -class Base: +cdef class Base: """ Base class for all the ML algos. It handles some of the common operations across all algos. 
Every ML algo class exposed at cython level must inherit diff --git a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx index aeb110d895..532b50ba0a 100644 --- a/python/cuml/neighbors/nearest_neighbors.pyx +++ b/python/cuml/neighbors/nearest_neighbors.pyx @@ -23,6 +23,11 @@ import numpy as np import pandas as pd import cudf import ctypes +import cuml +from libcpp.memory cimport shared_ptr +cimport cuml.common.handle +cimport cuml.common.cuda +from cuml.common.base import Base from libcpp cimport bool @@ -36,6 +41,16 @@ from libc.stdlib cimport calloc, malloc, free from cuml import numba_utils +cdef extern from "cuML.hpp" namespace "ML" nogil: + cdef cppclass deviceAllocator: + pass + + cdef cppclass cumlHandle: + cumlHandle() except + + void setStream(cuml.common.cuda._Stream s) + void setDeviceAllocator(shared_ptr[deviceAllocator] a) + cuml.common.cuda._Stream getStream() + cdef extern from "knn/knn.h" namespace "ML": cdef cppclass kNNParams: @@ -43,8 +58,8 @@ cdef extern from "knn/knn.h" namespace "ML": int N cdef cppclass kNN: - kNN(int D, bool verbose) except + - void search(const float *search_items, + kNN(const cumlHandle &handle, int D, bool verbose) except + + void search(float *search_items, int search_items_size, long *res_I, float *res_D, @@ -60,7 +75,7 @@ cdef extern from "knn/knn.h" namespace "ML": ) -class NearestNeighbors: +cdef class NearestNeighborsImpl: """ NearestNeighbors is a unsupervised algorithm where if one wants to find the "closest" datapoint(s) to new unseen data, one can calculate a suitable "distance" between @@ -159,27 +174,29 @@ class NearestNeighbors: `_. 
""" - # cpdef kNN *k + cpdef kNN *k # - # cdef int num_gpus + cdef int num_gpus # - # cdef uintptr_t X_ctype + cdef uintptr_t X_ctype # - # cdef uintptr_t I_ptr - # cdef uintptr_t D_ptr + cdef uintptr_t I_ptr + cdef uintptr_t D_ptr # - # cdef object X_m + cdef object X_m # - # cdef bool _should_downcast - # cdef object n_gpus - # cdef object devices - # cdef bool _verbose + cdef bool _should_downcast + cdef object n_gpus + cdef object devices + cdef bool _verbose # - # cdef object n_neighbors + cdef object n_neighbors # - # cpdef kNNParams *input + cpdef kNNParams *input - def __init__(self, n_neighbors = 5, n_gpus = 1, devices = None, verbose = False, should_downcast = True): + cpdef object handle + + def __cinit__(self, n_neighbors = 5, n_gpus = 1, devices = None, verbose = False, should_downcast = True, handle = None): """ Construct the NearestNeighbors object for training and querying. @@ -197,6 +214,7 @@ class NearestNeighbors: self._should_downcast = should_downcast self.input = malloc(sizeof(kNNParams)) self.k = NULL + self.handle = handle def __dealloc__(self): del self.k @@ -271,7 +289,7 @@ class NearestNeighbors: del self.k n_dims = X.shape[1] - self.k = new kNN(n_dims, verbose = self._verbose) + self.k = new kNN(self.handle, n_dims, verbose = self._verbose) cdef uintptr_t X_ctype = -1 cdef uintptr_t dev_ptr = -1 @@ -342,7 +360,7 @@ class NearestNeighbors: if self.k != NULL: del self.k - self.k = new kNN(n_dims, verbose = self._verbose) + self.k = new kNN(self.handle, n_dims, verbose = self._verbose) del self.input self.input = < kNNParams * > malloc(len(alloc_info) * sizeof(kNNParams)) @@ -463,3 +481,23 @@ class NearestNeighbors: inds, dists, k) + + + +class NearestNeighbors(Base): + + def __init__(self, n_neighbors = 5, n_gpus = 1, devices = None, verbose = False, should_downcast = True, handle = None): + super(NearestNeighbors, self).__init__(handle, verbose) + + self._impl = NearestNeighborsImpl(n_neighbors, n_gpus, devices, verbose, should_downcast, 
self.handle) + + def fit(self, X): + return self._impl.fit(X) + + + def kneighbors(self, X, k = None): + return self._impl.kneighbors(X, k) + + + def _fit_mg(self, n_dims, alloc_info): + return self._impl._fit_mg(n_dims, alloc_info) From 939362864047eaf258f8d802e4029b8d627be000 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 13 May 2019 15:46:17 -0400 Subject: [PATCH 120/156] Adding cumlHandle support for UMAP cython --- python/cuml/manifold/umap.pyx | 93 +++++- python/cuml/neighbors/nearest_neighbors.pyx | 301 ++++++++++---------- 2 files changed, 231 insertions(+), 163 deletions(-) diff --git a/python/cuml/manifold/umap.pyx b/python/cuml/manifold/umap.pyx index e5531073af..680609c868 100644 --- a/python/cuml/manifold/umap.pyx +++ b/python/cuml/manifold/umap.pyx @@ -24,6 +24,13 @@ import pandas as pd import cudf import ctypes +import cuml +from libcpp.memory cimport shared_ptr +cimport cuml.common.handle +cimport cuml.common.cuda +from cuml.common.base import Base + + from cuml import numba_utils from numba import cuda @@ -32,6 +39,16 @@ from libcpp cimport bool from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free +cdef extern from "cuML.hpp" namespace "ML" nogil: + cdef cppclass deviceAllocator: + pass + + cdef cppclass cumlHandle: + cumlHandle() except + + void setStream(cuml.common.cuda._Stream s) + void setDeviceAllocator(shared_ptr[deviceAllocator] a) + cuml.common.cuda._Stream getStream() + cdef extern from "umap/umapparams.h" namespace "ML::UMAPParams": @@ -67,18 +84,21 @@ cdef extern from "umap/umap.h" namespace "ML": UMAP_API(UMAPParams *p) except + - void fit(float *X, + void fit(const cumlHandle &handle, + float *X, int n, int d, float *embeddings) - void fit(float *X, + void fit(const cumlHandle &handle, + float *X, float *y, int n, int d, float *embeddings) - void transform(float *X, + void transform(const cumlHandle &handle, + float *X, int n, int d, float *embedding, @@ -86,7 +106,7 @@ cdef extern from 
"umap/umap.h" namespace "ML": float *out) -cdef class UMAP: +cdef class UMAPImpl: """Uniform Manifold Approximation and Projection Finds a low dimensional embedding of the data that approximates @@ -220,7 +240,8 @@ cdef class UMAP: target_n_neighbors = -1, target_weights = 0.5, target_metric = "euclidean", - should_downcast = True): + should_downcast = True, + handle = None): self.umap_params = new UMAPParams() @@ -345,6 +366,7 @@ cdef class UMAP: y_m = self._downcast(y) y_raw = y_m.device_ctypes_pointer.value self.umap.fit( + self.handle, self.raw_data, y_raw, X_m.shape[0], @@ -355,6 +377,7 @@ cdef class UMAP: else: self.umap.fit( + self.handle, self.raw_data, X_m.shape[0], X_m.shape[1], @@ -420,7 +443,8 @@ cdef class UMAP: cdef uintptr_t embed_ptr = embedding.device_ctypes_pointer.value self.umap.transform( - x_ptr, + self.handle, + x_ptr, X_m.shape[0], X_m.shape[1], self.embeddings, @@ -437,3 +461,60 @@ cdef class UMAP: del X_m return ret + + +class UMAP(Base): + + def __init__(self, + n_neighbors=15, + n_components=2, + n_epochs=500, + learning_rate=1.0, + min_dist=0.1, + spread=1.0, + set_op_mix_ratio=1.0, + local_connectivity=1.0, + repulsion_strength=1.0, + negative_sample_rate=5, + transform_queue_size=4.0, + init="spectral", + verbose = False, + a = None, + b = None, + target_n_neighbors = -1, + target_weights = 0.5, + target_metric = "euclidean", + should_downcast = True, + handle = None): + + super(UMAP, self).__init__(handle, verbose) + + self._impl = UMAPImpl(n_neighbors, + n_components, + n_epochs, + learning_rate, + min_dist, + spread, + set_op_mix_ratio, + local_connectivity, + repulsion_strength, + negative_sample_rate, + transform_queue_size, + init, + verbose, + a, b, + target_n_neighbors, + target_weights, + target_metric, + should_downcast, + self.handle) + + + def fit(self, X, y = None): + return self._impl.fit(X, y) + + def transform(self, X): + return self._impl.transform(X) + + def fit_transform(self, X, y=None): + return 
self._impl.fit_transform(X, y) \ No newline at end of file diff --git a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx index 532b50ba0a..a51ffdb2ad 100644 --- a/python/cuml/neighbors/nearest_neighbors.pyx +++ b/python/cuml/neighbors/nearest_neighbors.pyx @@ -76,103 +76,7 @@ cdef extern from "knn/knn.h" namespace "ML": cdef class NearestNeighborsImpl: - """ - NearestNeighbors is a unsupervised algorithm where if one wants to find the "closest" - datapoint(s) to new unseen data, one can calculate a suitable "distance" between - each and every point, and return the top K datapoints which have the smallest distance to it. - - cuML's KNN expects a cuDF DataFrame or a Numpy Array (where automatic chunking will be done - in to a Numpy Array in a future release), and fits a special data structure first to - approximate the distance calculations, allowing our querying times to be O(plogn) - and not the brute force O(np) [where p = no(features)]: - - Examples - --------- - .. code-block:: python - - import cudf - from cuml.neighbors import NearestNeighbors - import numpy as np - - np_float = np.array([ - [1,2,3], # Point 1 - [1,2,4], # Point 2 - [2,2,4] # Point 3 - ]).astype('float32') - - gdf_float = cudf.DataFrame() - gdf_float['dim_0'] = np.ascontiguousarray(np_float[:,0]) - gdf_float['dim_1'] = np.ascontiguousarray(np_float[:,1]) - gdf_float['dim_2'] = np.ascontiguousarray(np_float[:,2]) - - print('n_samples = 3, n_dims = 3') - print(gdf_float) - - nn_float = NearestNeighbors() - nn_float.fit(gdf_float) - distances,indices = nn_float.kneighbors(gdf_float,k=3) #get 3 nearest neighbors - - print(indices) - print(distances) - Output: - - .. 
code-block:: python - - import cudf - - # Both import methods supported - # from cuml.neighbors import NearestNeighbors - from cuml import NearestNeighbors - - n_samples = 3, n_dims = 3 - - dim_0 dim_1 dim_2 - - 0 1.0 2.0 3.0 - 1 1.0 2.0 4.0 - 2 2.0 2.0 4.0 - - # indices: - - index_neighbor_0 index_neighbor_1 index_neighbor_2 - 0 0 1 2 - 1 1 0 2 - 2 2 1 0 - # distances: - - distance_neighbor_0 distance_neighbor_1 distance_neighbor_2 - 0 0.0 1.0 2.0 - 1 0.0 1.0 1.0 - 2 0.0 1.0 2.0 - - Parameters - ---------- - n_neighbors: int (default = 5) - The top K closest datapoints you want the algorithm to return. If this number is large, - then expect the algorithm to run slower. - should_downcast : bool (default = False) - Currently only single precision is supported in the underlying undex. Setting this to - true will allow single-precision input arrays to be automatically downcasted to single - precision. Default = False. - - Notes - ------ - NearestNeighbors is a generative model. This means the data X has to be stored in order - for inference to occur. - - **Applications of NearestNeighbors** - - Applications of NearestNeighbors include recommendation systems where content or colloborative - filtering is used. Since NearestNeighbors is a relatively simple generative model, it is also - used in data visualization and regression / classification tasks. - - For an additional example see `the NearestNeighbors notebook - `_. - - For additional docs, see `scikitlearn's NearestNeighbors - `_. - """ cpdef kNN *k # @@ -275,14 +179,6 @@ cdef class NearestNeighborsImpl: def fit(self, X): - """ - Fit GPU index for performing nearest neighbor queries. 
- - Parameters - ---------- - X : cuDF DataFrame or numpy ndarray - Dense matrix (floats or doubles) of shape (n_samples, n_features) - """ assert len(X.shape) == 2, 'data should be two dimensional' if self.k != NULL: @@ -345,17 +241,7 @@ cdef class NearestNeighborsImpl: 1) def _fit_mg(self, n_dims, alloc_info): - """ - Fits a model using multiple GPUs. This method takes in a list of dict objects - representing the distribution of the underlying device pointers. The device - information can be extracted from the pointers. - :param n_dims - the number of features for each vector - :param alloc_info - a list of __cuda_array_interface__ dicts - :return: - """ if self.k != NULL: del self.k @@ -380,25 +266,7 @@ cdef class NearestNeighborsImpl: def kneighbors(self, X, k = None): - """ - Query the GPU index for the k nearest neighbors of row vectors in X. - - Parameters - ---------- - X : cuDF DataFrame or numpy ndarray - Dense matrix (floats or doubles) of shape (n_samples, n_features) - k: Integer - The number of neighbors - - Returns - ---------- - distances: cuDF DataFrame or numpy ndarray - The distances of the k-nearest neighbors for each column vector in X - - indices: cuDF DataFrame of numpy ndarray - The indices of the k-nearest neighbors for each column vector in X - """ if k is None: k = self.n_neighbors @@ -443,6 +311,139 @@ cdef class NearestNeighborsImpl: def _kneighbors(self, X_ctype, N, k, I_ptr, D_ptr): + + + cdef uintptr_t inds = I_ptr + cdef uintptr_t dists = D_ptr + cdef uintptr_t x = X_ctype + + self.k.search(x, + N, + inds, + dists, + k) + + + +class NearestNeighbors(Base): + """ + NearestNeighbors is a unsupervised algorithm where if one wants to find the "closest" + datapoint(s) to new unseen data, one can calculate a suitable "distance" between + each and every point, and return the top K datapoints which have the smallest distance to it. 
+ + cuML's KNN expects a cuDF DataFrame or a Numpy Array (where automatic chunking will be done + in to a Numpy Array in a future release), and fits a special data structure first to + approximate the distance calculations, allowing our querying times to be O(plogn) + and not the brute force O(np) [where p = no(features)]: + + Examples + --------- + .. code-block:: python + + import cudf + from cuml.neighbors import NearestNeighbors + import numpy as np + + np_float = np.array([ + [1,2,3], # Point 1 + [1,2,4], # Point 2 + [2,2,4] # Point 3 + ]).astype('float32') + + gdf_float = cudf.DataFrame() + gdf_float['dim_0'] = np.ascontiguousarray(np_float[:,0]) + gdf_float['dim_1'] = np.ascontiguousarray(np_float[:,1]) + gdf_float['dim_2'] = np.ascontiguousarray(np_float[:,2]) + + print('n_samples = 3, n_dims = 3') + print(gdf_float) + + nn_float = NearestNeighbors() + nn_float.fit(gdf_float) + distances,indices = nn_float.kneighbors(gdf_float,k=3) #get 3 nearest neighbors + + print(indices) + print(distances) + + Output: + + .. code-block:: python + + import cudf + + # Both import methods supported + # from cuml.neighbors import NearestNeighbors + from cuml import NearestNeighbors + + n_samples = 3, n_dims = 3 + + dim_0 dim_1 dim_2 + + 0 1.0 2.0 3.0 + 1 1.0 2.0 4.0 + 2 2.0 2.0 4.0 + + # indices: + + index_neighbor_0 index_neighbor_1 index_neighbor_2 + 0 0 1 2 + 1 1 0 2 + 2 2 1 0 + # distances: + + distance_neighbor_0 distance_neighbor_1 distance_neighbor_2 + 0 0.0 1.0 2.0 + 1 0.0 1.0 1.0 + 2 0.0 1.0 2.0 + + Parameters + ---------- + n_neighbors: int (default = 5) + The top K closest datapoints you want the algorithm to return. If this number is large, + then expect the algorithm to run slower. + should_downcast : bool (default = True) + Currently only single precision is supported in the underlying index. Setting this to + true will allow double-precision input arrays to be automatically downcasted to single + precision. Default = True.
+ + Notes + ------ + NearestNeighbors is a generative model. This means the data X has to be stored in order + for inference to occur. + + **Applications of NearestNeighbors** + + Applications of NearestNeighbors include recommendation systems where content or collaborative + filtering is used. Since NearestNeighbors is a relatively simple generative model, it is also + used in data visualization and regression / classification tasks. + + For an additional example see `the NearestNeighbors notebook + `_. + + For additional docs, see `scikit-learn's NearestNeighbors + `_. + """ + + def __init__(self, n_neighbors = 5, n_gpus = 1, devices = None, verbose = False, + should_downcast = True, handle = None): + super(NearestNeighbors, self).__init__(handle, verbose) + self._impl = NearestNeighborsImpl(n_neighbors, n_gpus, devices, verbose, + should_downcast, self.handle) + + def fit(self, X): + """ + Fit GPU index for performing nearest neighbor queries. + + Parameters + ---------- + X : cuDF DataFrame or numpy ndarray + Dense matrix (floats or doubles) of shape (n_samples, n_features) + """ + return self._impl.fit(X) + + + def kneighbors(self, X, k = None): + + """ Query the GPU index for the k nearest neighbors of column vectors in X.
@@ -471,33 +472,19 @@ cdef class NearestNeighborsImpl: indices: cuDF DataFrame of numpy ndarray The indices of the k-nearest neighbors for each column vector in X """ - - cdef uintptr_t inds = I_ptr - cdef uintptr_t dists = D_ptr - cdef uintptr_t x = X_ctype - - self.k.search(x, - N, - inds, - dists, - k) - - - -class NearestNeighbors(Base): - - def __init__(self, n_neighbors = 5, n_gpus = 1, devices = None, verbose = False, should_downcast = True, handle = None): - super(NearestNeighbors, self).__init__(handle, verbose) - - self._impl = NearestNeighborsImpl(n_neighbors, n_gpus, devices, verbose, should_downcast, self.handle) - - def fit(self, X): - return self._impl.fit(X) - - - def kneighbors(self, X, k = None): return self._impl.kneighbors(X, k) def _fit_mg(self, n_dims, alloc_info): + """ + Fits a model using multiple GPUs. This method takes in a list of dict objects + representing the distribution of the underlying device pointers. The device + information can be extracted from the pointers. 
+ + :param n_dims + the number of features for each vector + :param alloc_info + a list of __cuda_array_interface__ dicts + :return: + """ return self._impl._fit_mg(n_dims, alloc_info) From 65e092f6ab51535eeedcca7b3670a36bcbff4292 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Mon, 13 May 2019 16:01:06 -0400 Subject: [PATCH 121/156] cudaStreamSync added to postProcessData function --- cuML/src/glm/preprocess.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cuML/src/glm/preprocess.h b/cuML/src/glm/preprocess.h index 8a903825af..99363afb3e 100644 --- a/cuML/src/glm/preprocess.h +++ b/cuML/src/glm/preprocess.h @@ -96,6 +96,8 @@ void postProcessData(const cumlHandle_impl& handle, math_t *input, int n_rows, stream); updateHost(intercept, d_intercept.data(), 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + Stats::meanAdd(input, input, mu_input, n_cols, n_rows, false, true, stream); Stats::meanAdd(labels, labels, mu_labels, 1, n_rows, false, true, stream); From 64eb5fcd57015d8b30f5ff615fe741cf7e9b712e Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Mon, 13 May 2019 16:16:35 -0400 Subject: [PATCH 122/156] Finally getting cython to build w/ cumlHandle changes --- cuML/src/umap/umap.cu | 6 ++-- cuML/src/umap/umap.h | 6 ++-- python/cuml/manifold/umap.pyx | 32 ++++++++++----------- python/cuml/neighbors/nearest_neighbors.pyx | 11 ++++--- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/cuML/src/umap/umap.cu b/cuML/src/umap/umap.cu index c9f74ae5d9..63164a03fa 100644 --- a/cuML/src/umap/umap.cu +++ b/cuML/src/umap/umap.cu @@ -43,13 +43,13 @@ namespace ML { * @param embeddings * an array to return the output embeddings of size (n_samples, n_components) */ - void UMAP_API::fit(const cumlHandle &handle, float *X, int n, int d, float *embeddings) { + void UMAP_API::fit(cumlHandle &handle, float *X, int n, int d, float *embeddings) { this->knn = new kNN(handle, d); UMAPAlgo::_fit(handle, X, n, d, knn, get_params(), embeddings, handle.getStream()); } - void UMAP_API::fit(const cumlHandle &handle, float *X, float *y, int n, int d, float *embeddings) { + void UMAP_API::fit(cumlHandle &handle, float *X, float *y, int n, int d, float *embeddings) { this->knn = new kNN(handle, d); UMAPAlgo::_fit(handle, X, y, n, d, knn, get_params(), embeddings, handle.getStream()); } @@ -69,7 +69,7 @@ namespace ML { * @param out * pointer to array for storing output embeddings (n, n_components) */ - void UMAP_API::transform(const cumlHandle &handle, float *X, int n, int d, + void UMAP_API::transform(cumlHandle &handle, float *X, int n, int d, float *embedding, int embedding_n, float *out) { UMAPAlgo::_transform(handle, X, n, d, embedding, embedding_n, knn, get_params(), out, handle.getStream()); diff --git a/cuML/src/umap/umap.h b/cuML/src/umap/umap.h index 25d4827d23..42077b880f 100644 --- a/cuML/src/umap/umap.h +++ b/cuML/src/umap/umap.h @@ -46,7 +46,7 @@ namespace ML { * @param embeddings * an array to return the output embeddings of size (n_samples, n_components) */ - void fit(const cumlHandle &handle, float 
*X, int n, int d, float *embeddings); + void fit(cumlHandle &handle, float *X, int n, int d, float *embeddings); /** * Fits a supervised UMAP model @@ -61,7 +61,7 @@ namespace ML { * @param embeddings * an array to return the output embeddings of size (n_samples, n_components) */ - void fit(const cumlHandle &handle, float *X, float *y, int n, int d, float *embeddings); + void fit(cumlHandle &handle, float *X, float *y, int n, int d, float *embeddings); /** * Project a set of X vectors into the embedding space. @@ -78,7 +78,7 @@ namespace ML { * @param out * pointer to array for storing output embeddings (n, n_components) */ - void transform(const cumlHandle &handle, float *X, int n, int d, + void transform(cumlHandle &handle, float *X, int n, int d, float *embedding, int embedding_n, float *out); diff --git a/python/cuml/manifold/umap.pyx b/python/cuml/manifold/umap.pyx index 680609c868..6df4785f3f 100644 --- a/python/cuml/manifold/umap.pyx +++ b/python/cuml/manifold/umap.pyx @@ -29,6 +29,7 @@ from libcpp.memory cimport shared_ptr cimport cuml.common.handle cimport cuml.common.cuda from cuml.common.base import Base +from cuml.common.handle cimport cumlHandle from cuml import numba_utils @@ -39,17 +40,6 @@ from libcpp cimport bool from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free -cdef extern from "cuML.hpp" namespace "ML" nogil: - cdef cppclass deviceAllocator: - pass - - cdef cppclass cumlHandle: - cumlHandle() except + - void setStream(cuml.common.cuda._Stream s) - void setDeviceAllocator(shared_ptr[deviceAllocator] a) - cuml.common.cuda._Stream getStream() - - cdef extern from "umap/umapparams.h" namespace "ML::UMAPParams": enum MetricType: @@ -84,20 +74,20 @@ cdef extern from "umap/umap.h" namespace "ML": UMAP_API(UMAPParams *p) except + - void fit(const cumlHandle &handle, + void fit(cumlHandle &handle, float *X, int n, int d, float *embeddings) - void fit(const cumlHandle &handle, + void fit(cumlHandle &handle, float *X, float 
*y, int n, int d, float *embeddings) - void transform(const cumlHandle &handle, + void transform(cumlHandle &handle, float *X, int n, int d, @@ -221,6 +211,8 @@ cdef class UMAPImpl: cdef bool _should_downcast + cdef object handle + def __cinit__(self, n_neighbors=15, n_components=2, @@ -243,6 +235,8 @@ cdef class UMAPImpl: should_downcast = True, handle = None): + self.handle = handle + self.umap_params = new UMAPParams() self.n_neighbors = n_neighbors @@ -361,12 +355,14 @@ cdef class UMAPImpl: order = "C", dtype=np.float32)) self.embeddings = self.arr_embed.device_ctypes_pointer.value + cdef cumlHandle * handle_ = < cumlHandle * > < size_t > self.handle.getHandle() + cdef uintptr_t y_raw if y is not None: y_m = self._downcast(y) y_raw = y_m.device_ctypes_pointer.value self.umap.fit( - self.handle, + handle_[0], self.raw_data, y_raw, X_m.shape[0], @@ -377,7 +373,7 @@ cdef class UMAPImpl: else: self.umap.fit( - self.handle, + handle_[0], self.raw_data, X_m.shape[0], X_m.shape[1], @@ -442,8 +438,10 @@ cdef class UMAPImpl: order = "C", dtype=np.float32)) cdef uintptr_t embed_ptr = embedding.device_ctypes_pointer.value + cdef cumlHandle * handle_ = < cumlHandle * > < size_t > self.handle.getHandle() + self.umap.transform( - self.handle, + handle_[0], x_ptr, X_m.shape[0], X_m.shape[1], diff --git a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx index a51ffdb2ad..61bcadacf1 100644 --- a/python/cuml/neighbors/nearest_neighbors.pyx +++ b/python/cuml/neighbors/nearest_neighbors.pyx @@ -58,7 +58,7 @@ cdef extern from "knn/knn.h" namespace "ML": int N cdef cppclass kNN: - kNN(const cumlHandle &handle, int D, bool verbose) except + + kNN(cumlHandle &handle, int D, bool verbose) except + void search(float *search_items, int search_items_size, long *res_I, @@ -100,7 +100,8 @@ cdef class NearestNeighborsImpl: cpdef object handle - def __cinit__(self, n_neighbors = 5, n_gpus = 1, devices = None, verbose = False, should_downcast = True, 
handle = None): + def __cinit__(self, n_neighbors = 5, n_gpus = 1, devices = None, verbose = False, + should_downcast = True, handle = None): """ Construct the NearestNeighbors object for training and querying. @@ -185,7 +186,8 @@ cdef class NearestNeighborsImpl: del self.k n_dims = X.shape[1] - self.k = new kNN(self.handle, n_dims, verbose = self._verbose) + cdef cumlHandle * handle_ = < cumlHandle * > < size_t > self.handle.getHandle() + self.k = new kNN(handle_[0], n_dims, verbose = self._verbose) cdef uintptr_t X_ctype = -1 cdef uintptr_t dev_ptr = -1 @@ -246,7 +248,8 @@ cdef class NearestNeighborsImpl: if self.k != NULL: del self.k - self.k = new kNN(self.handle, n_dims, verbose = self._verbose) + cdef cumlHandle * handle_ = < cumlHandle * > < size_t > self.handle.getHandle() + self.k = new kNN(handle_[0], n_dims, verbose = self._verbose) del self.input self.input = < kNNParams * > malloc(len(alloc_info) * sizeof(kNNParams)) From e39bbb54f5170d40e0488c2f216a2acdb93a3b02 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 13 May 2019 19:04:18 -0700 Subject: [PATCH 123/156] Updating changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f06e2ea21..ae58bd04e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -60,6 +60,7 @@ - PR #552: Re-enable assert in kmeans tests with xfail as needed - PR #581: Add shared memory fast col major to row major function back with bound checks - PR #592: More efficient matrix copy/reverse methods +- PR #604: Adding cumlHandle to kNN, spectral methods, and UMAP ## Bug Fixes From 2d04ad30ab5680d0648a2729f6344c61c133538a Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 14 May 2019 03:50:39 -0700 Subject: [PATCH 124/156] Removing python Base.pyx from cdef --- python/cuml/common/base.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/common/base.pyx b/python/cuml/common/base.pyx index 7b625f950c..1027fa4f5c 100644 --- a/python/cuml/common/base.pyx +++ b/python/cuml/common/base.pyx @@ -26,7 +26,7 @@ import cuml.common.cuda import cudf -cdef class Base: +class Base: """ Base class for all the ML algos. It handles some of the common operations across all algos. Every ML algo class exposed at cython level must inherit From 9b3e3f93596f521cb421653b82104cd5285374bf Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 14 May 2019 06:52:46 -0400 Subject: [PATCH 125/156] Moving docs for UMAP cython to the proper place --- python/cuml/manifold/umap.pyx | 285 +++++++++++++++++----------------- 1 file changed, 145 insertions(+), 140 deletions(-) diff --git a/python/cuml/manifold/umap.pyx b/python/cuml/manifold/umap.pyx index 6df4785f3f..005becf6d2 100644 --- a/python/cuml/manifold/umap.pyx +++ b/python/cuml/manifold/umap.pyx @@ -97,109 +97,6 @@ cdef extern from "umap/umap.h" namespace "ML": cdef class UMAPImpl: - - """Uniform Manifold Approximation and Projection - Finds a low dimensional embedding of the data that approximates - an underlying manifold. - - Adapted from https://github.com/lmcinnes/umap/blob/master/umap/umap_.py - - Parameters - ---------- - n_neighbors: float (optional, default 15) - The size of local neighborhood (in terms of number of neighboring - sample points) used for manifold approximation. Larger values - result in more global views of the manifold, while smaller - values result in more local data being preserved. In general - values should be in the range 2 to 100. - n_components: int (optional, default 2) - The dimension of the space to embed into. 
This defaults to 2 to - provide easy visualization, but can reasonably be set to any - n_epochs: int (optional, default None) - The number of training epochs to be used in optimizing the - low dimensional embedding. Larger values result in more accurate - embeddings. If None is specified a value will be selected based on - the size of the input dataset (200 for large datasets, 500 for small). - learning_rate: float (optional, default 1.0) - The initial learning rate for the embedding optimization. - init: string (optional, default 'spectral') - How to initialize the low dimensional embedding. Options are: - * 'spectral': use a spectral embedding of the fuzzy 1-skeleton - * 'random': assign initial embedding positions at random. - min_dist: float (optional, default 0.1) - The effective minimum distance between embedded points. Smaller values - will result in a more clustered/clumped embedding where nearby points - on the manifold are drawn closer together, while larger values will - result on a more even dispersal of points. The value should be set - relative to the ``spread`` value, which determines the scale at which - embedded points will be spread out. - spread: float (optional, default 1.0) - The effective scale of embedded points. In combination with ``min_dist`` - this determines how clustered/clumped the embedded points are. - set_op_mix_ratio: float (optional, default 1.0) - Interpolate between (fuzzy) union and intersection as the set operation - used to combine local fuzzy simplicial sets to obtain a global fuzzy - simplicial sets. Both fuzzy set operations use the product t-norm. - The value of this parameter should be between 0.0 and 1.0; a value of - 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy - intersection. - local_connectivity: int (optional, default 1) - The local connectivity required -- i.e. the number of nearest - neighbors that should be assumed to be connected at a local level. 
- The higher this value the more connected the manifold becomes - locally. In practice this should be not more than the local intrinsic - dimension of the manifold. - repulsion_strength: float (optional, default 1.0) - Weighting applied to negative samples in low dimensional embedding - optimization. Values higher than one will result in greater weight - being given to negative samples. - negative_sample_rate: int (optional, default 5) - The number of negative samples to select per positive sample - in the optimization process. Increasing this value will result - in greater repulsive force being applied, greater optimization - cost, but slightly more accuracy. - transform_queue_size: float (optional, default 4.0) - For transform operations (embedding new points using a trained model_ - this will control how aggressively to search for nearest neighbors. - Larger values will result in slower performance but more accurate - nearest neighbor evaluation. - a: float (optional, default None) - More specific parameters controlling the embedding. If None these - values are set automatically as determined by ``min_dist`` and - ``spread``. - b: float (optional, default None) - More specific parameters controlling the embedding. If None these - values are set automatically as determined by ``min_dist`` and - ``spread``. - verbose: bool (optional, default False) - Controls verbosity of logging. - - Notes - ----- - This module is heavily based on Leland McInnes' reference UMAP package. - However, there are a number of differences and features that are not yet - implemented in cuml.umap: - * Specifying the random seed - * Using a non-euclidean distance metric (support for a fixed set - of non-euclidean metrics is planned for an upcoming release). 
- * Using a pre-computed pairwise distance matrix (under consideration - for future releases) - * Manual initialization of initial embedding positions - - In addition to these missing features, you should expect to see - the final embeddings differing between cuml.umap and the reference - UMAP. In particular, the reference UMAP uses an approximate kNN - algorithm for large data sizes while cuml.umap always uses exact - kNN. - - References - ---------- - * Leland McInnes, John Healy, James Melville - UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction - https://arxiv.org/abs/1802.03426 - - """ - cpdef UMAPParams *umap_params cpdef UMAP_API *umap cdef uintptr_t embeddings @@ -329,14 +226,6 @@ cdef class UMAPImpl: def fit(self, X, y = None): - """Fit X into an embedded space. - Parameters - ---------- - X : array, shape (n_samples, n_features) - X contains a sample per row. - y : array, shape (n_samples) - y contains a label per row. - """ assert len(X.shape) == 2, 'data should be two dimensional' assert X.shape[0] > 1, 'need more than 1 sample to build nearest neighbors graph' @@ -383,17 +272,6 @@ cdef class UMAPImpl: del X_m def fit_transform(self, X, y = None): - """Fit X into an embedded space and return that transformed - output. - Parameters - ---------- - X : array, shape (n_samples, n_features) or (n_samples, n_samples) - X contains a sample per row. - Returns - ------- - X_new : array, shape (n_samples, n_components) - Embedding of the training data in low-dimensional space. - """ self.fit(X, y) if isinstance(X, cudf.DataFrame): @@ -407,24 +285,6 @@ cdef class UMAPImpl: def transform(self, X): - """Transform X into the existing embedded space and return that - transformed output. - - Please refer to the reference UMAP implementation for information - on the differences between fit_transform() and running fit() transform(). 
- - Specifically, the transform() function is stochastic: - https://github.com/lmcinnes/umap/issues/158 - - Parameters - ---------- - X : array, shape (n_samples, n_features) - New data to be transformed. - Returns - ------- - X_new : array, shape (n_samples, n_components) - Embedding of the new data in low-dimensional space. - """ assert len(X.shape) == 2, 'data should be two dimensional' assert X.shape[0] > 1, 'need more than 1 sample to build nearest neighbors graph' @@ -463,6 +323,110 @@ cdef class UMAPImpl: class UMAP(Base): + """Uniform Manifold Approximation and Projection + Finds a low dimensional embedding of the data that approximates + an underlying manifold. + + Adapted from https://github.com/lmcinnes/umap/blob/master/umap/umap_.py + + Parameters + ---------- + n_neighbors: float (optional, default 15) + The size of local neighborhood (in terms of number of neighboring + sample points) used for manifold approximation. Larger values + result in more global views of the manifold, while smaller + values result in more local data being preserved. In general + values should be in the range 2 to 100. + n_components: int (optional, default 2) + The dimension of the space to embed into. This defaults to 2 to + provide easy visualization, but can reasonably be set to any + n_epochs: int (optional, default None) + The number of training epochs to be used in optimizing the + low dimensional embedding. Larger values result in more accurate + embeddings. If None is specified a value will be selected based on + the size of the input dataset (200 for large datasets, 500 for small). + learning_rate: float (optional, default 1.0) + The initial learning rate for the embedding optimization. + init: string (optional, default 'spectral') + How to initialize the low dimensional embedding. Options are: + * 'spectral': use a spectral embedding of the fuzzy 1-skeleton + * 'random': assign initial embedding positions at random. 
+ min_dist: float (optional, default 0.1) + The effective minimum distance between embedded points. Smaller values + will result in a more clustered/clumped embedding where nearby points + on the manifold are drawn closer together, while larger values will + result in a more even dispersal of points. The value should be set + relative to the ``spread`` value, which determines the scale at which + embedded points will be spread out. + spread: float (optional, default 1.0) + The effective scale of embedded points. In combination with ``min_dist`` + this determines how clustered/clumped the embedded points are. + set_op_mix_ratio: float (optional, default 1.0) + Interpolate between (fuzzy) union and intersection as the set operation + used to combine local fuzzy simplicial sets to obtain a global fuzzy + simplicial set. Both fuzzy set operations use the product t-norm. + The value of this parameter should be between 0.0 and 1.0; a value of + 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy + intersection. + local_connectivity: int (optional, default 1) + The local connectivity required -- i.e. the number of nearest + neighbors that should be assumed to be connected at a local level. + The higher this value the more connected the manifold becomes + locally. In practice this should be not more than the local intrinsic + dimension of the manifold. + repulsion_strength: float (optional, default 1.0) + Weighting applied to negative samples in low dimensional embedding + optimization. Values higher than one will result in greater weight + being given to negative samples. + negative_sample_rate: int (optional, default 5) + The number of negative samples to select per positive sample + in the optimization process. Increasing this value will result + in greater repulsive force being applied, greater optimization + cost, but slightly more accuracy.
+ transform_queue_size: float (optional, default 4.0) + For transform operations (embedding new points using a trained model) + this will control how aggressively to search for nearest neighbors. + Larger values will result in slower performance but more accurate + nearest neighbor evaluation. + a: float (optional, default None) + More specific parameters controlling the embedding. If None these + values are set automatically as determined by ``min_dist`` and + ``spread``. + b: float (optional, default None) + More specific parameters controlling the embedding. If None these + values are set automatically as determined by ``min_dist`` and + ``spread``. + verbose: bool (optional, default False) + Controls verbosity of logging. + + Notes + ----- + This module is heavily based on Leland McInnes' reference UMAP package. + However, there are a number of differences and features that are not yet + implemented in cuml.umap: + * Specifying the random seed + * Using a non-euclidean distance metric (support for a fixed set + of non-euclidean metrics is planned for an upcoming release). + * Using a pre-computed pairwise distance matrix (under consideration + for future releases) + * Manual initialization of initial embedding positions + + In addition to these missing features, you should expect to see + the final embeddings differing between cuml.umap and the reference + UMAP. In particular, the reference UMAP uses an approximate kNN + algorithm for large data sizes while cuml.umap always uses exact + kNN. + + References + ---------- + * Leland McInnes, John Healy, James Melville + UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction + https://arxiv.org/abs/1802.03426 + + """ + + + def __init__(self, n_neighbors=15, n_components=2, @@ -509,10 +473,51 @@ class UMAP(Base): def fit(self, X, y = None): + """Fit X into an embedded space. + Parameters + ---------- + X : array, shape (n_samples, n_features) + X contains a sample per row.
+ y : array, shape (n_samples) + y contains a label per row. + """ + return self._impl.fit(X, y) def transform(self, X): + + """Transform X into the existing embedded space and return that + transformed output. + + Please refer to the reference UMAP implementation for information + on the differences between fit_transform() and running fit() transform(). + + Specifically, the transform() function is stochastic: + https://github.com/lmcinnes/umap/issues/158 + + Parameters + ---------- + X : array, shape (n_samples, n_features) + New data to be transformed. + Returns + ------- + X_new : array, shape (n_samples, n_components) + Embedding of the new data in low-dimensional space. + """ + return self._impl.transform(X) def fit_transform(self, X, y=None): + """Fit X into an embedded space and return that transformed + output. + Parameters + ---------- + X : array, shape (n_samples, n_features) or (n_samples, n_samples) + X contains a sample per row. + Returns + ------- + X_new : array, shape (n_samples, n_components) + Embedding of the training data in low-dimensional space. + """ + return self._impl.fit_transform(X, y) \ No newline at end of file From de5d549f81ec87c2a3f17d517fc5846461aaa82d Mon Sep 17 00:00:00 2001 From: "Corey J. 
Nolet" Date: Tue, 14 May 2019 09:17:01 -0400 Subject: [PATCH 126/156] Adding const to relevant parameters in epsilon neighborhood --- ml-prims/src/distance/distance.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ml-prims/src/distance/distance.h b/ml-prims/src/distance/distance.h index beb8d7616b..c24e2e275b 100644 --- a/ml-prims/src/distance/distance.h +++ b/ml-prims/src/distance/distance.h @@ -175,7 +175,7 @@ size_t getWorkspaceSize(InType* x, InType* y, int m, int n, int k) { template -void distance(InType *x, InType *y, OutType *dist, int m, int n, int k, +void distance(InType* const x, InType* const y, OutType *dist, int m, int n, int k, void *workspace, size_t worksize, FinalLambda fin_op, cudaStream_t stream) { DistanceImpl distImpl; @@ -204,7 +204,7 @@ void distance(InType *x, InType *y, OutType *dist, int m, int n, int k, */ template -void distance(InType *x, InType *y, OutType *dist, int m, int n, int k, +void distance(InType* const x, InType* const y, OutType *dist, int m, int n, int k, void *workspace, size_t worksize, cudaStream_t stream) { auto default_fin_op = @@ -241,7 +241,7 @@ void distance(InType *x, InType *y, OutType *dist, int m, int n, int k, templatevoid > -size_t epsilon_neighborhood(T *a, T *b, bool *adj, int m, int n, int k, T eps, +size_t epsilon_neighborhood(T* const a, T* const b, bool *adj, int m, int n, int k, T eps, void *workspace, size_t worksize, cudaStream_t stream, Lambda fused_op) { auto epsilon_op = [n, eps, fused_op] __device__ (T val, int global_c_idx) { bool acc = val <= eps; @@ -276,7 +276,7 @@ size_t epsilon_neighborhood(T *a, T *b, bool *adj, int m, int n, int k, T eps, */ template -size_t epsilon_neighborhood(T *a, T *b, bool *adj, int m, int n, int k, T eps, +size_t epsilon_neighborhood(T* const a, T* const b, bool *adj, int m, int n, int k, T eps, void *workspace, size_t worksize, cudaStream_t stream) { return epsilon_neighborhood( a, b, adj, m, n, k, eps, workspace, worksize, stream, From 
4fa5b8aa52da3ab482007437d0320b0c4351b0dc Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 14 May 2019 10:33:16 -0400 Subject: [PATCH 127/156] Removing comments --- python/cuml/neighbors/nearest_neighbors.pyx | 8 -------- 1 file changed, 8 deletions(-) diff --git a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx index 61bcadacf1..9e7a4a783f 100644 --- a/python/cuml/neighbors/nearest_neighbors.pyx +++ b/python/cuml/neighbors/nearest_neighbors.pyx @@ -79,25 +79,17 @@ cdef class NearestNeighborsImpl: cpdef kNN *k - # cdef int num_gpus - # cdef uintptr_t X_ctype - # cdef uintptr_t I_ptr cdef uintptr_t D_ptr - # cdef object X_m - # cdef bool _should_downcast cdef object n_gpus cdef object devices cdef bool _verbose - # cdef object n_neighbors - # cpdef kNNParams *input - cpdef object handle def __cinit__(self, n_neighbors = 5, n_gpus = 1, devices = None, verbose = False, From 0bede76f9eec535f72ec00e4d7627fa82d178863 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 14 May 2019 10:34:04 -0400 Subject: [PATCH 128/156] Removing accidental check-in --- cuML/src/knn/.knn.cu.swo | Bin 24576 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 cuML/src/knn/.knn.cu.swo diff --git a/cuML/src/knn/.knn.cu.swo b/cuML/src/knn/.knn.cu.swo deleted file mode 100644 index ce2e311c95113f9886d033abc1f7bacf32c77d5f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24576 zcmeI4dyFJUdBEE*?7>gjArFj6-CA5`A2)k?@aG* z-_CT8yL)cEKA&GAIK&2vnB)(m00OKC1&b&sVDb<_kw}7on1_&voq%K~5JkXW5mB(? 
z{JyH{$IQ;2*NG8IGA;e4XS(b8Rn=GZeO0?lyKk!=QNLEX-r>5!aVkIf+B@EV-e)ho z=XV{a9)wLVkr!vb8Y}ngdHdOe=(ppM>o>;hCv~#p>tVDOx81roUhlM;d>ok_kE8nd zS`Y}8dS@F+4uT9MFp$7@B+yBgcfI}6#q0L&*)74Z9onI;x^i(lF$ZA=5*SEeAc27d z1`-%ZU?72k1O^fqNZ|j71d{go&I8otTw9;h_I^&!{Q~>DV(<6$yg%Rm-e>QB*z^8B zUoqE)ef~nvd;4>68%SUvfq?`D5*SEeAc27d1`-%ZU?72k1O^fqNMIm=pMnJ3hU5Gk z>ApoA0PO##_W#S5JI>eP@8B`G1+IZN!}l+9oPUB3!5wfZeDPAp`8a$8J`9h*DR>9$ zhD+f4mpIO|@HBi9J_zrDQ_z6~JZL~2-U(O0cQ1CF=iokQK@(g!1iu3R{6@!l5*~;5 z!7^M34!m4)oR{Ex@K^9LxF2e86Z|S%2YbPR7cX+0zkol7kAeq>;g{gwFLa#e;VF0& z9)bs94OU?Vmf)>$E`0h8j`J>f02bjmEWi!$LmVnE!VB_>>aROBU{DDZ<7wMK$2u@cqrUfi2rRY?h6b9#QzAlI(c0@YI)U3Ubgin1u? zUwlrWL?m#^t0%?!>l$TZ(Xkh_xK*<1iTHH@{e4$j*(|su=%y)aai-}zrSXvwM${RH z+UqO-CO?x^Ki)N&go)d%1#0ww(hU|@=5QeD;P#_CmOS(uwa5)tytvZt#H+O>x4u@k z=?w34Y&3oIm&?A|pQS9Fs`%Ozfulw>o440$H#v`Fk~GhHQZ!L)+l-EOXNRngNw14- zj*J06Xm^qmJ|iT{nGRHzKJN2+_dag>)f~6cQ7Inc=b}S-Ntvw8Pk9t4jfshR*h$p> z{c33Lz>wVM9?DnTJcYL z!RCQr)Q3uAo|P~6TI?}h@; zYj}+VLt6$Al^AsbI!mfbD{9NoMt63l)zg#1M0r8jSy|OpNwq9(X4P(DY~t@I=V8Zh`DM_ zBWEfvQ?Xh4*{e(&>Q@OX2hr|mk%@`>`9wk}KmAjF$%Me#NB6}v(FrwEo*A|swHB0j z^WCZ${oRHur{Twx&a&+d%*|0h< zTsp~u!K?u|zN=R~ZqP7mCy^PdvCq0dt4j2zUIaLS+w$~LUv*?w;c8+TTeZb)ukJ7Vb=73i)Nxms zmO{FWZpgM5wUDDul`2Rww^xtfa&+;yx_xSXeroP`_27a!IyVn7 zyF=Ytouk8gK3PWSh&Ej*R;-PE)=<9J&_$9papW&`5_HC-{3WjNXXlg-K z7lzc$Qw!CFF)7eu(3GSj@9vH-%7^LKZIegcU1ExDy6cjwyTS-=$kar4UXOg4oX1og zE+^|O&q+p7l4~_d+7sjB>+9%`I;RB_p-ltzb2+3IxVrY}nkwT|qHsc4be z7A?waN|!Tj>oIkk7fFjL%BWcCd1}Z;8j`s}*L0dPMi@GYD%XeA-raj{7*lh%RcESG zwB-D;qw`bJmKA4w#1Z>{fzAGf^B9)c|LOkxSFrORheu!nZh&8gZ({3z3H}zIfzLu6 zZiGn~f*tTC_*ZQH@4%h;k(%P&%l%LAS}TG z+yXN&4I^+BTn_(&eg7r+7~Btc!eMwD`~ds@d3YKghBY_|H^VNt6uyN`{~7or_!PVs zLI_|M_JD#5;K$hX{{UZrr{K3?5st$(a3#D*`+X7~fd}DU=)m33f+qN|3M(M(E&hSQ z^}0)7e4Hh1Llq}I@;D>r93Lf?k9Z~Yhy6@%JjTMGsrkVPQJhSVBPv#-s(lx_jKQtn zmrAQlX*IEZ_fJo%^604kEz7+zrN5rSXduEWGg8fO#CS#Yg8RhfKF#M*}>+0B?p zSw&9l&R)qPX{TzFq_y!1(d1pso2=G_c(g}5Y^x=4)39=FVf~pBp5Z5YnuVCYY}V5g 
z6M@&EcuiS+n^(3fyY|=zBFjt?>VU}2CL{hKy_`F#SNQ6rNPM3|SVV{;>Gih6In0U}KpHdz|m&f!SU9#TZ($!N;!@bsyB6aI&v%892BNx7|p)QfG zwXT;m0&b0%Y6rS8_+F^Hv{ONFVj@XBjAzdZhOS8>ZnIhIC!a1Spo|q8wvVtRWoi5< zY;@|fCb17>(v(6lvA1KzaOtoPX6+!)b(z&Vs~WwWVzI&cfp~ch3yzy0bKDT(+IHlw zwD32y+Oqi4k&II`AC{DLsk6MdB8~~$yR?yEmc09Q+fV8ss%*q(x}b;+U62lGl1Uwk zqp6rBKNBo{rSsYnmo4lv(-&FgH5mHjqZ`gXCVzL zmT1?g5F^O6ZgJSBV23bT=3}}J;c~;bITYK92X{WZDCpW zbY5Lqo8sB@TD`BcieZ}PiAa-)@U?-Zj!4DCYl!2C6=xK-a7?krcblfmh><5{A%$Rt z<){fMZrh4>4(6^W@zxo~=a>p=_0>+WCKHdNi^oSEQl6NQxHqLg7DDNF;whBpNmOfe z`!L#-IbJlqU?o}YeuF7|%z99Jk~=ax9kyCwFfq|-GkCli?UXO;)W;-~QR}iP>q2WM z&hUtj$!t1p*MmQ-peI!ZlZ`pTJi#0t40k1MqFH&8dMK;xf|I~gHYO-tx$uKH9``Sk%q08)ENt#V}KVl{>>_4<$ z$6XG|FP~h{oHfzBwbd|A%DE;;wJ8gpE{-<64YPTQmtytUtvPbOER!8sr zuhWx{eA}-$|8I`L6B{QxQGtI&uRZMXCsrIfS0#H$>2k)=k=S{VOZKW*t590j7nrp1 zF#9?U_CfS1iXbqjDA?>Ufv3gB#5C2Jn2L7V#$F8r7B&%1nqWg6KDLO>-tah+)$#+g z4Ww%wg{Lr^*}CFNPaKs>oYNqvHFRrL@kw+)nqi%jJrgs$7)E}Z0QvH?%j{&)DF@e# z9WCDqr{RoUSt^|+>P!}uZ=0f0u2RV+orfe7T`Y6sSAGo2IFqp6N~NNUW53=2otT(D z;s?gFq?etOliKycxjpihC;H&Zn21AG&iKo+t9vCak2gP&r`?;XP= zOvWCVGAYOzv)-KLx2}dbw}KTu*_2jsyg6j%G|s&u#l-qcC`~;(!IcRqJ5_P^0~Ez9 z((l_jPmzZ0z4zF`$2@OsDw%cT%DJfAxCP(L-m{^eBILWNk#;YV<8hpf_y4 z-?~rti_hNgHm!oqNZ!88#KUH)wUrjSlTn*vWu5-uG4HT=rJ|y&PiccA5Jr(eIvW#vWqLfIIRt+zZdhqHF=(Z`zY8B$<5v3dqa@qgC3%hv_cD4The|F&`=l(we zABKCO0=vM04`Ay*0e6C&{~v?TVe3Bw55WEK>+l?Q{@=s5;T!NN_yoKQZi7Xrz>C=X zFTkI`{cr|G;c_?!UZ!j>!B60M_!c|^?}6Wf+h7r-4p+iO@Tb)2bMOev!mY3eeuVA+ zd3YFZf*V2h^S_NR;1h5fsxSoqhF|2X@HiZWIk*b0gy-0oe>b!t-M3%l`zSoketZMw z;bxeG1K_}4v*-RrcodGoQMeJtp#oRKMQ{O}4=-^x|7my-HsCrq2maFjg9hDWGMA5; z`(hpr){KKS<6zA=STm+GioFlkj2Jymk5Kyl9IP3OVi~L%|DUZH^TzMC?vLd!%(tG` z$LA;1W=o=Xdyt*K{7)6SH!--}m5Y7~rHZ@Da`t9?yi_{PXIj|e zds!;$11+h`BMr2bd!yerw8YtBD>^mvva<$OlMIhs-^6>9c_@a5H_Pe2VPU*R>*DpZ zEY4DOvV)xy5pNeaF z+lw)X>6e$$1m*C?mS&!gt{sQ9nO^Tq|JpHRynFtaRjskygTR1M){a4uiBm5UI4WwcK From a61aa4485b861353b2480c8f11d4940da32e94e6 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz Date: Tue, 14 May 2019 11:01:37 -0400 Subject: 
[PATCH 129/156] Improved the code structure. --- cuML/src/glm/preprocess.h | 7 +------ cuML/src/solver/cd.h | 11 +++++++---- cuML/test/sgd.cu | 1 + 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/cuML/src/glm/preprocess.h b/cuML/src/glm/preprocess.h index 99363afb3e..cf35c15de5 100644 --- a/cuML/src/glm/preprocess.h +++ b/cuML/src/glm/preprocess.h @@ -37,9 +37,6 @@ void preProcessData(const cumlHandle_impl& handle, math_t *input, int n_rows, math_t *mu_labels, math_t *norm2_input, bool fit_intercept, bool normalize, cudaStream_t stream) { - auto cublas_handle = handle.getCublasHandle(); - auto cusolver_handle = handle.getcusolverDnHandle(); - ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); ASSERT(n_rows > 1, @@ -71,14 +68,12 @@ void postProcessData(const cumlHandle_impl& handle, math_t *input, int n_rows, math_t *mu_input, math_t *mu_labels, math_t *norm2_input, bool fit_intercept, bool normalize, cudaStream_t stream) { - auto cublas_handle = handle.getCublasHandle(); - auto cusolver_handle = handle.getcusolverDnHandle(); - ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one"); ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two"); + cublasHandle_t cublas_handle = handle.getCublasHandle(); auto allocator = handle.getDeviceAllocator(); device_buffer d_intercept(allocator, stream, 1); diff --git a/cuML/src/solver/cd.h b/cuML/src/solver/cd.h index 82b85acc84..eca4055bf2 100644 --- a/cuML/src/solver/cd.h +++ b/cuML/src/solver/cd.h @@ -61,14 +61,16 @@ using namespace MLCommon; * boolean parameter to control if the intercept will be fitted or not * @param normalize * boolean parameter to control if the data will be normalized or not + * @param epochs + * Maximum number of iterations that solver will run * @param loss - * enum to use different loss functions. Only linear regression loss functions is supported right now. + * enum to use different loss functions. 
Only linear regression loss functions is supported right now * @param alpha * L1 parameter * @param l1_ratio - * ratio of alpha will be used for L1. (1 - l1_ratio) * alpha will be used for L2. + * ratio of alpha will be used for L1. (1 - l1_ratio) * alpha will be used for L2 * @param shuffle - * boolean parameter to control whether coordinates will be picked randomly or not. + * boolean parameter to control whether coordinates will be picked randomly or not * @param tol * tolerance to stop the solver * @param stream @@ -88,7 +90,6 @@ void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, "Parameter loss: Only SQRT_LOSS function is supported for now"); cublasHandle_t cublas_handle = handle.getCublasHandle(); - cusolverDnHandle_t cusolver_handle = handle.getcusolverDnHandle(); auto allocator = handle.getDeviceAllocator(); device_buffer pred(allocator, stream, n_rows); @@ -159,6 +160,8 @@ void cdFit(const cumlHandle_impl& handle, math_t *input, int n_rows, int n_cols, coef_prev = h_coef[ci]; updateHost(&(h_coef[ci]), coef_loc, 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + math_t diff = abs(coef_prev - h_coef[ci]); if (diff > d_coef_max) diff --git a/cuML/test/sgd.cu b/cuML/test/sgd.cu index 3ac3fd326d..824d29d6fa 100644 --- a/cuML/test/sgd.cu +++ b/cuML/test/sgd.cu @@ -205,6 +205,7 @@ protected: CUDA_CHECK(cudaFree(pred_svm_ref)); CUDA_CHECK(cudaFree(pred_log)); CUDA_CHECK(cudaFree(pred_log_ref)); + CUDA_CHECK(cudaStreamDestroy(stream)); } protected: From 554706b3e57ab7c24f07cbbb0e41af8c2bfe4751 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Tue, 14 May 2019 12:57:21 -0400 Subject: [PATCH 130/156] Bug fix for ridge.pyx --- python/cuml/linear_model/ridge.pyx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/cuml/linear_model/ridge.pyx b/python/cuml/linear_model/ridge.pyx index 8d5ec49070..69a472efa1 100644 --- a/python/cuml/linear_model/ridge.pyx 
+++ b/python/cuml/linear_model/ridge.pyx @@ -212,10 +212,9 @@ class Ridge(Base, RegressorMixin): self.intercept_value = 0.0 def _check_alpha(self, alpha): - for el in alpha: - if el <= 0.0: - msg = "alpha values have to be positive" - raise TypeError(msg.format(alpha)) + if alpha <= 0.0: + msg = "alpha value has to be positive" + raise TypeError(msg.format(alpha)) def _get_algorithm_int(self, algorithm): return { From 4ad67b1ffac2c7c9c68386ebf1e8601e980cc463 Mon Sep 17 00:00:00 2001 From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com> Date: Tue, 14 May 2019 12:59:37 -0400 Subject: [PATCH 131/156] stream sync added in glm.cu --- cuML/src/glm/glm.cu | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cuML/src/glm/glm.cu b/cuML/src/glm/glm.cu index f7ce075b0b..bd69245405 100644 --- a/cuML/src/glm/glm.cu +++ b/cuML/src/glm/glm.cu @@ -30,6 +30,7 @@ void olsFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, floa olsFit(handle.getImpl(), input, n_rows, n_cols, labels, coef, intercept, fit_intercept, normalize, handle.getStream(), algo); + CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } @@ -38,13 +39,15 @@ void olsFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, dou olsFit(handle.getImpl(), input, n_rows, n_cols, labels, coef, intercept, fit_intercept, normalize, handle.getStream(), algo); + CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } void olsPredict(const cumlHandle &handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds) { - olsPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, + olsPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, handle.getStream()); + CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } void olsPredict(const cumlHandle &handle, const double *input, int n_rows, int n_cols, const double *coef, @@ -52,6 +55,7 @@ void olsPredict(const cumlHandle &handle, const 
double *input, int n_rows, int n olsPredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, handle.getStream()); + CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } void ridgeFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, float *labels, float *alpha, @@ -60,6 +64,7 @@ void ridgeFit(const cumlHandle &handle, float *input, int n_rows, int n_cols, fl ridgeFit(handle.getImpl(), input, n_rows, n_cols, labels, alpha, n_alpha, coef, intercept, fit_intercept, normalize, handle.getStream(), algo); + CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } void ridgeFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, double *labels, @@ -68,18 +73,21 @@ void ridgeFit(const cumlHandle &handle, double *input, int n_rows, int n_cols, d ridgeFit(handle.getImpl(), input, n_rows, n_cols, labels, alpha, n_alpha, coef, intercept, fit_intercept, normalize, handle.getStream(), algo); + CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } void ridgePredict(const cumlHandle &handle, const float *input, int n_rows, int n_cols, const float *coef, float intercept, float *preds) { ridgePredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, handle.getStream()); + CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } void ridgePredict(const cumlHandle &handle, const double *input, int n_rows, int n_cols, const double *coef, double intercept, double *preds) { ridgePredict(handle.getImpl(), input, n_rows, n_cols, coef, intercept, preds, handle.getStream()); + CUDA_CHECK(cudaStreamSynchronize(handle.getStream())); } void qnFit(const cumlHandle &cuml_handle, float *X, float *y, int N, int D, From 25bbf9b88d3039e5a96713c1f0b73c142d05b648 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 14 May 2019 12:22:12 -0500 Subject: [PATCH 132/156] FEA Remove used of GLOB_RECURSE in cmake --- cpp/CMakeLists.txt | 95 +++--------------- cpp/test/CMakeLists.txt | 182 ++++++++++++++++++++++++++++++++++ 
cpp/test/prims/CMakeLists.txt | 85 ---------------- 3 files changed, 194 insertions(+), 168 deletions(-) create mode 100644 cpp/test/CMakeLists.txt delete mode 100644 cpp/test/prims/CMakeLists.txt diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 475c648789..72a5bfa23e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -16,7 +16,8 @@ set (CMAKE_FIND_NO_INSTALL_PREFIX TRUE FORCE) -cmake_minimum_required(VERSION 3.12 FATAL_ERROR) +cmake_minimum_required(VERSION 3.13 FATAL_ERROR) +cmake_policy(SET CMP0079 NEW) project(CUML VERSION 0.8.0 LANGUAGES CXX CUDA) @@ -44,7 +45,7 @@ option(BUILD_CUML_TESTS "Build cuML algorithm tests" ON) option(BUILD_CUML_MG_TESTS "Build cuML multigpu algorithm tests" ON) -option(BUILD_PRIM_TESTS "Build ml-prim tests" ON) +option(BUILD_PRIMS_TESTS "Build ml-prim tests" ON) option(BUILD_CUML_EXAMPLES "Build C++ API usage examples" ON) @@ -254,7 +255,9 @@ include_directories( add_subdirectory(${GTEST_DIR} ${PROJECT_BINARY_DIR}/googletest) -file(GLOB_RECURSE ml_prims_header "src_prims/*.h" "src_prims/*.hpp") +set(PRIMS_TEST_UTILS + src_prims/cuda_utils.h + src_prims/utils.h) ################################################################################################### # - build libcuml++ shared library ------------------------------------------------------------------ @@ -298,92 +301,18 @@ if(BUILD_CUML_CPP_LIBRARY) endif(BUILD_CUML_CPP_LIBRARY) ################################################################################################### -# - build ml_test executable ---------------------------------------------------------------------- +# - build test executables ------------------------------------------------------------------------ -if(BUILD_CUML_TESTS) - - file(GLOB_RECURSE cuml_test_cuda_sources "test/sg/*.cu") - - add_executable(ml_test - ${cuml_test_cuda_sources} - ${ml_prims_header}) - - target_link_libraries(ml_test - ${GTEST_LIBNAME} - ${CUDA_cublas_LIBRARY} - ${CUDA_curand_LIBRARY} - 
${CUDA_cusolver_LIBRARY} - ${CUDA_cusparse_LIBRARY} - ${CUDA_CUDART_LIBRARY} - gpufaisslib - ${CUDA_cusparse_LIBRARY} - ${CUDA_nvgraph_LIBRARY} - faisslib - ${CUML_CPP_TARGET} - pthread - ${ZLIB_LIBRARIES}) - -endif(BUILD_CUML_TESTS) - -################################################################################################### -# - build ml_mg_test executable ------------------------------------------------------------------- - -if(BUILD_CUML_MG_TESTS) - - file(GLOB_RECURSE cuml_mg_test_cuda_sources "test/mg/*.cu") - - add_executable(ml_mg_test - ${cuml_mg_test_cuda_sources} - ${ml_prims_header}) - - target_link_libraries(ml_mg_test - ${GTEST_LIBNAME} - ${CUDA_cublas_LIBRARY} - ${CUDA_curand_LIBRARY} - ${CUDA_cusolver_LIBRARY} - ${CUDA_cusparse_LIBRARY} - ${CUDA_CUDART_LIBRARY} - ${CUDA_cusparse_LIBRARY} - ${CUDA_nvgraph_LIBRARY} - gpufaisslib - faisslib - ${CUML_CPP_TARGET} - pthread - ${ZLIB_LIBRARIES}) - -endif(BUILD_CUML_MG_TESTS) - -################################################################################################### -# - build prims_test executable ---------------------------------------------------------------- - -if(BUILD_PRIM_TESTS) - - file(GLOB_RECURSE mlprims_test_cuda_sources "test/prims/*.cu") - - set(MLPRIMS_LINK_LIBRARIES - ${CUDA_cublas_LIBRARY} - ${CUDA_curand_LIBRARY} - ${CUDA_cusolver_LIBRARY} - ${CUDA_cusparse_LIBRARY} - pthread - ${ZLIB_LIBRARIES}) - - add_executable(prims_test - ${mlprims_test_cuda_sources} - ${ml_prims_header}) - - target_link_libraries(prims_test - ${GTEST_LIBNAME} - ${MLPRIMS_LINK_LIBRARIES}) - -endif(BUILD_PRIM_TESTS) +if(BUILD_CUML_TESTS OR BUILD_CUML_MG_TESTS OR BUILD_PRIMS_TESTS) + add_subdirectory(test ${PROJECT_BINARY_DIR}/test) +endif(BUILD_CUML_TESTS OR BUILD_CUML_MG_TESTS OR BUILD_PRIMS_TESTS) ################################################################################################### # - build examples ------------------------------------------------------------------------- -if 
(DISABLE_EXAMPLES OR ${BUILD_CUML_EXAMPLES}) +if (BUILD_CUML_EXAMPLES) add_subdirectory(examples) -endif(DISABLE_EXAMPLES OR ${BUILD_CUML_EXAMPLES}) +endif(BUILD_CUML_EXAMPLES) ################################################################################################### # - install targets ------------------------------------------------------------------------------- diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt new file mode 100644 index 0000000000..0b9835334c --- /dev/null +++ b/cpp/test/CMakeLists.txt @@ -0,0 +1,182 @@ +# Copyright (c) 2018-2019, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +cmake_minimum_required(VERSION 3.12 FATAL_ERROR) +cmake_policy(SET CMP0079 NEW) + +project(cuml_test LANGUAGES CXX CUDA) + +# Policy CMP0079 set as NEW as of 3.13 allows googletest be built outside this folder +include_directories(${GTEST_DIR}/googletest/include) + +################################################################################################### +# - build ml_test executable ---------------------------------------------------------------- + +if(BUILD_CUML_TESTS) + + set(ML_TEST_LINK_LIBRARIES + ${GTEST_LIBNAME} + ${CUDA_cublas_LIBRARY} + ${CUDA_curand_LIBRARY} + ${CUDA_cusolver_LIBRARY} + ${CUDA_cusparse_LIBRARY} + ${CUDA_CUDART_LIBRARY} + gpufaisslib + ${CUDA_cusparse_LIBRARY} + ${CUDA_nvgraph_LIBRARY} + faisslib + ${CUML_CPP_TARGET} + pthread + ${ZLIB_LIBRARIES} + ) + + # (please keep the filenames in alphabetical order) + add_executable(ml + sg/cd_test.cu + sg/dbscan_test.cu + sg/handle_test.cu + sg/kmeans_test.cu + sg/knn_test.cu + sg/lkf_test.cu + sg/ols.cu + sg/pca_test.cu + sg/quasi_newton.cu + sg/rf_test.cu + sg/ridge.cu + sg/sgd.cu + sg/spectral_test.cu + sg/tsvd_test.cu + sg/umap_test.cu + ) + + target_link_libraries(ml + ${GTEST_LIBNAME} + ${ML_TEST_LINK_LIBRARIES}) + +endif(BUILD_CUML_TESTS) + +################################################################################################### +# - build test_ml_mg executable ---------------------------------------------------------------- + +if(BUILD_CUML_MG_TESTS) + + set(ML_MG_TEST_LINK_LIBRARIES + ${GTEST_LIBNAME} + ${CUDA_cublas_LIBRARY} + ${CUDA_curand_LIBRARY} + ${CUDA_cusolver_LIBRARY} + ${CUDA_cusparse_LIBRARY} + ${CUDA_CUDART_LIBRARY} + ${CUDA_cusparse_LIBRARY} + ${CUDA_nvgraph_LIBRARY} + gpufaisslib + faisslib + ${CUML_CPP_TARGET} + pthread + ${ZLIB_LIBRARIES} + ) + + # (please keep the filenames in alphabetical order) + add_executable(ml_mg + mg/knn_test_mg.cu + ) + + target_link_libraries(ml_mg + ${GTEST_LIBNAME} + ${ML_MG_TEST_LINK_LIBRARIES}) + 
+endif(BUILD_CUML_MG_TESTS) + +################################################################################################### +# - build prims_test executable ---------------------------------------------------------------- + +if(BUILD_PRIMS_TESTS) + + set(PRIMS_LINK_LIBRARIES + ${CUDA_cublas_LIBRARY} + ${CUDA_curand_LIBRARY} + ${CUDA_cusolver_LIBRARY} + ${CUDA_cusparse_LIBRARY} + pthread + ${ZLIB_LIBRARIES} + ) + + # (please keep the filenames in alphabetical order) + add_executable(prims + prims/add.cu + prims/binary_op.cu + prims/ternary_op.cu + prims/coalesced_reduction.cu + prims/cuda_utils.cu + prims/columnSort.cu + prims/contingencyMatrix.cu + prims/coo.cu + prims/cov.cu + prims/csr.cu + prims/decoupled_lookback.cu + prims/dist_adj.cu + prims/dist_cos.cu + prims/dist_euc_exp.cu + prims/dist_euc_unexp.cu + prims/dist_l1.cu + prims/divide.cu + prims/eig.cu + prims/eltwise.cu + prims/eltwise2d.cu + prims/gather.cu + prims/gemm.cu + prims/grid_sync.cu + prims/hinge.cu + prims/kselection.cu + prims/linearReg.cu + prims/log.cu + prims/logisticReg.cu + prims/map_then_reduce.cu + prims/math.cu + prims/matrix.cu + prims/matrix_vector_op.cu + prims/mean.cu + prims/mean_center.cu + prims/minmax.cu + prims/mvg.cu + prims/multiply.cu + prims/norm.cu + prims/penalty.cu + prims/permute.cu + prims/power.cu + prims/reduce.cu + prims/reduce_rows_by_key.cu + prims/reverse.cu + prims/rng.cu + prims/rng_int.cu + prims/rsvd.cu + prims/score.cu + prims/sigmoid.cu + prims/sqrt.cu + prims/stddev.cu + prims/strided_reduction.cu + prims/subtract.cu + prims/sum.cu + prims/svd.cu + prims/transpose.cu + prims/unary_op.cu + prims/weighted_mean.cu + ) + + target_link_libraries(prims + ${GTEST_LIBNAME} + ${PRIMS_LINK_LIBRARIES}) + +endif(BUILD_PRIMS_TESTS) diff --git a/cpp/test/prims/CMakeLists.txt b/cpp/test/prims/CMakeLists.txt deleted file mode 100644 index 01e0b19472..0000000000 --- a/cpp/test/prims/CMakeLists.txt +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2018-2019, NVIDIA 
CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -cmake_minimum_required(VERSION 3.12 FATAL_ERROR) -project(prims_test LANGUAGES CXX CUDA) - -include_directories(${GTEST_DIR}/googletest/include) - -# (please keep the filenames in alphabetical order) -add_executable(prims_test - add.cu - binary_op.cu - ternary_op.cu - coalesced_reduction.cu - cuda_utils.cu - columnSort.cu - contingencyMatrix.cu - coo.cu - cov.cu - csr.cu - decoupled_lookback.cu - dist_adj.cu - dist_cos.cu - dist_euc_exp.cu - dist_euc_unexp.cu - dist_l1.cu - divide.cu - eig.cu - eltwise.cu - eltwise2d.cu - gather.cu - gemm.cu - grid_sync.cu - hinge.cu - kselection.cu - linearReg.cu - log.cu - logisticReg.cu - map_then_reduce.cu - math.cu - matrix.cu - matrix_vector_op.cu - mean.cu - mean_center.cu - minmax.cu - mvg.cu - multiply.cu - norm.cu - penalty.cu - permute.cu - power.cu - reduce.cu - reduce_rows_by_key.cu - reverse.cu - rng.cu - rng_int.cu - rsvd.cu - score.cu - sigmoid.cu - sqrt.cu - stddev.cu - strided_reduction.cu - subtract.cu - sum.cu - svd.cu - transpose.cu - unary_op.cu - weighted_mean.cu - ) - -target_link_libraries(prims_test - ${GTEST_LIBNAME} - ${MLPRIMS_LIBS}) From 2dcb126ebdb97b8fe8eba3058f9f6914f6f10a3e Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 14 May 2019 12:25:36 -0500 Subject: [PATCH 133/156] DOC Update readme for new test executables --- BUILD.md | 14 +++++++------- cpp/README.md | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) 
diff --git a/BUILD.md b/BUILD.md index d9ed320377..bdc9ec7c08 100644 --- a/BUILD.md +++ b/BUILD.md @@ -70,16 +70,16 @@ $ make install To run tests (optional): ```bash -$ ./ml_test # Single GPU algorithm tests -$ ./ml_mg_test # Multi GPU algorithm tests -$ ./prims_test # ML Primitive function tests +$ ./test/ml # Single GPU algorithm tests +$ ./test/ml_mg # Multi GPU algorithm tests +$ ./test/prims # ML Primitive function tests ``` If you want a list of the available tests: ```bash -$ ./ml_test --gtest_list_tests # Single GPU algorithm tests -$ ./ml_mg_test --gtest_list_tests # Multi GPU algorithm tests -$ ./prims_test --gtest_list_tests # ML Primitive function tests +$ ./test/ml --gtest_list_tests # Single GPU algorithm tests +$ ./test/ml_mg --gtest_list_tests # Multi GPU algorithm tests +$ ./test/prims --gtest_list_tests # ML Primitive function tests ``` 4. Build the `cuml` python package: @@ -116,7 +116,7 @@ cuML's cmake has the following configurable flags available: | BUILD_CUML_CPP_LIBRARY | [ON, OFF] | ON | Enable/disable building libcuml++ shared library. Setting this variable to `OFF` sets the variables BUILD_CUML_TESTS, BUILD_CUML_MG_TESTS and BUILD_CUML_EXAMPLES to `OFF` | | BUILD_CUML_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_test`. | | BUILD_CUML_MG_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_mg_test`. | -| BUILD_PRIM_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `prims_test`. | +| BUILD_PRIMS_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `prims_test`. | | BUILD_CUML_EXAMPLES | [ON, OFF] | ON | Enable/disable building cuML C++ API usage examples. 
| | CMAKE_CXX11_ABI | [ON, OFF] | ON | Enable/disable the GLIBCXX11 ABI | | DISABLE_OPENMP | [ON, OFF] | OFF | Set to `ON` to disable OpenMP | diff --git a/cpp/README.md b/cpp/README.md index 361dddaf32..b141336789 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -32,7 +32,7 @@ Current cmake offers the following configuration options: | BUILD_CUML_CPP_LIBRARY | [ON, OFF] | ON | Enable/disable building libcuml++ shared library. Setting this variable to `OFF` sets the variables BUILD_CUML_TESTS, BUILD_CUML_MG_TESTS and BUILD_CUML_EXAMPLES to `OFF` | | BUILD_CUML_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_test`. | | BUILD_CUML_MG_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `ml_mg_test`. | -| BUILD_PRIM_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `prims_test`. | +| BUILD_PRIMS_TESTS | [ON, OFF] | ON | Enable/disable building cuML algorithm test executable `prims_test`. | | BUILD_CUML_EXAMPLES | [ON, OFF] | ON | Enable/disable building cuML C++ API usage examples. 
| | CMAKE_CXX11_ABI | [ON, OFF] | ON | Enable/disable the GLIBCXX11 ABI | | DISABLE_OPENMP | [ON, OFF] | OFF | Set to `ON` to disable OpenMP | @@ -45,9 +45,9 @@ After running CMake in a `build` directory, if the `BUILD_*` options were not tu ```bash $ make -j # Build libcuml++ and all tests $ make -j cuml++ # Build libcuml++ -$ make -j ml_test # Build ml_test algorithm tests binary -$ make -j ml_mg_test # Build ml_mg_test multi GPU algorithms tests binary -$ make -j prims_test # Build prims_test ML primitive unit tests binary +$ make -j ml # Build ml_test algorithm tests binary +$ make -j ml_mg # Build ml_mg_test multi GPU algorithms tests binary +$ make -j prims # Build prims_test ML primitive unit tests binary ``` ### Third Party Modules From 16881ac5a377f47a95f6a62e1d65329147338c73 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 14 May 2019 12:27:53 -0500 Subject: [PATCH 134/156] FIX Update gpu ci scripts --- ci/gpu/build.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 15662761ff..bd4ea501a6 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -92,7 +92,11 @@ nvidia-smi logger "GoogleTest for libcuml..." cd $WORKSPACE/cpp/build -GTEST_OUTPUT="xml:${WORKSPACE}/test-results/libcuml_cpp/" ./ml_test +GTEST_OUTPUT="xml:${WORKSPACE}/test-results/libcuml_cpp/" ./test/ml + +logger "GoogleTest for libcuml mg..." +cd $WORKSPACE/cpp/build +GTEST_OUTPUT="xml:${WORKSPACE}/test-results/libcuml_cpp_mg/" ./test/ml_mg logger "Python pytest for cuml..." cd $WORKSPACE/python @@ -115,4 +119,4 @@ make -j${PARALLEL_LEVEL} prims_test logger "Run ml-prims test..." 
cd $WORKSPACE/cpp/build_prims -GTEST_OUTPUT="xml:${WORKSPACE}/test-results/ml-prims/" ./prims_test +GTEST_OUTPUT="xml:${WORKSPACE}/test-results/prims/" ./test/prims From e97fa7a34bf67979a5091bcab8962a8bc271ca7e Mon Sep 17 00:00:00 2001 From: wxbn Date: Fri, 10 May 2019 18:43:37 +0000 Subject: [PATCH 135/156] DistanceType as template parameter --- cuML/src/metrics/trustworthiness.cu | 25 ++----------- cuML/src/metrics/trustworthiness.h | 8 +++-- cuML/src/metrics/trustworthiness_c.h | 47 +++++++++++++++++++++++++ cuML/test/trustworthiness_test.cu | 2 +- python/cuml/metrics/trustworthiness.pyx | 29 +++++++-------- 5 files changed, 69 insertions(+), 42 deletions(-) create mode 100644 cuML/src/metrics/trustworthiness_c.h diff --git a/cuML/src/metrics/trustworthiness.cu b/cuML/src/metrics/trustworthiness.cu index 1d43dedcb6..d05064079d 100644 --- a/cuML/src/metrics/trustworthiness.cu +++ b/cuML/src/metrics/trustworthiness.cu @@ -50,7 +50,7 @@ namespace ML { knn.fit(¶ms, 1); knn.search(input, n, d_pred_I, d_pred_D, n_neighbors); - CUDA_CHECK(cudaFree(d_pred_D)); + d_alloc->deallocate(d_pred_D, n * n_neighbors * sizeof(math_t), stream); return d_pred_I; } @@ -176,27 +176,8 @@ namespace ML { return t; } - template - double trustworthiness_score(const cumlHandle& h, math_t* X, - math_t* X_embedded, int n, int m, int d, - int n_neighbors, int metric) - { - DistanceType distance_type = DistanceType(metric); - - if (distance_type == EucUnexpandedL2Sqrt) - { - return trustworthiness_score(h, - X, X_embedded, n, m, d, n_neighbors); - } - - std::ostringstream msg; - msg << "Unknown metric" << std::endl; - throw MLCommon::Exception(msg.str()); - } - - template double trustworthiness_score(const cumlHandle& h, - float* X, float* X_embedded, int n, int m, int d, - int n_neighbors, int metric); + template double trustworthiness_score(const cumlHandle& h, + float* X, float* X_embedded, int n, int m, int d, int n_neighbors); } } \ No newline at end of file diff --git 
a/cuML/src/metrics/trustworthiness.h b/cuML/src/metrics/trustworthiness.h index 51ce8c471d..3a2943fd29 100644 --- a/cuML/src/metrics/trustworthiness.h +++ b/cuML/src/metrics/trustworthiness.h @@ -16,18 +16,20 @@ #pragma once +#include #include #define MAX_BATCH_SIZE 512 #define N_THREADS 512 +using namespace MLCommon::Distance; + namespace ML { namespace Metrics { - template + template double trustworthiness_score(const cumlHandle& h, math_t* X, - math_t* X_embedded, int n, int m, int d, - int n_neighbors, int metric); + math_t* X_embedded, int n, int m, int d, int n_neighbors); } } \ No newline at end of file diff --git a/cuML/src/metrics/trustworthiness_c.h b/cuML/src/metrics/trustworthiness_c.h new file mode 100644 index 0000000000..4d0a7e7d67 --- /dev/null +++ b/cuML/src/metrics/trustworthiness_c.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2018-2019, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +namespace MLCommon { + namespace Distance { + enum DistanceType { + /** evaluate as dist_ij = sum(x_ik^2) + sum(y_ij)^2 - 2*sum(x_ik * y_jk) */ + EucExpandedL2 = 0, + /** same as above, but inside the epilogue, perform square root operation */ + EucExpandedL2Sqrt, + /** cosine distance */ + EucExpandedCosine, + /** L1 distance */ + EucUnexpandedL1, + /** evaluate as dist_ij += (x_ik - y-jk)^2 */ + EucUnexpandedL2, + /** same as above, but inside the epilogue, perform square root operation */ + EucUnexpandedL2Sqrt, + }; + } +}; + +using namespace MLCommon::Distance; + +namespace ML { + namespace Metrics { + + template + double trustworthiness_score(const cumlHandle& h, math_t* X, + math_t* X_embedded, int n, int m, int d, + int n_neighbors); + + } +} \ No newline at end of file diff --git a/cuML/test/trustworthiness_test.cu b/cuML/test/trustworthiness_test.cu index 5434f07c55..2c997a616b 100644 --- a/cuML/test/trustworthiness_test.cu +++ b/cuML/test/trustworthiness_test.cu @@ -409,7 +409,7 @@ protected: updateDevice(d_X_embedded, X_embedded.data(), X_embedded.size(), stream); // euclidean test - score = trustworthiness_score(h, d_X, d_X_embedded, 50, 30, 8, 5, 5); + score = trustworthiness_score(h, d_X, d_X_embedded, 50, 30, 8, 5); d_alloc->deallocate(d_X, X.size() * sizeof(float), stream); d_alloc->deallocate(d_X_embedded, X_embedded.size() * sizeof(float), stream); diff --git a/python/cuml/metrics/trustworthiness.pyx b/python/cuml/metrics/trustworthiness.pyx index 1efbf019ca..0aeab3698c 100644 --- a/python/cuml/metrics/trustworthiness.pyx +++ b/python/cuml/metrics/trustworthiness.pyx @@ -27,16 +27,16 @@ from numba import cuda from libc.stdint cimport uintptr_t from cuml.common.handle cimport cumlHandle -cdef extern from "metrics/trustworthiness.h" namespace "ML::Metrics": +cdef extern from "metrics/trustworthiness_c.h" namespace "MLCommon::Distance": - cdef double trustworthiness_score[T](const cumlHandle& h, T* X, - T* X_embedded, int n, int m, int d, - 
int n_neighbors, int metric) + ctypedef int DistanceType + ctypedef DistanceType euclidean "(MLCommon::Distance::DistanceType)5" +cdef extern from "metrics/trustworthiness_c.h" namespace "ML::Metrics": -metric_codes = { - 'euclidean': 5 -} + cdef double trustworthiness_score[T, DistanceType](const cumlHandle& h, T* X, + T* X_embedded, int n, int m, int d, + int n_neighbors) def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, metric='euclidean', should_downcast=True): @@ -60,12 +60,6 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, metric='euclidean trustworthiness score : double Trustworthiness of the low-dimensional embedding """ - - if metric in metric_codes: - metric_code = metric_codes[metric] - else: - raise Exception("Unknown metric") - if isinstance(X, cudf.DataFrame) and isinstance(X_embedded, cudf.DataFrame): datatype1 = np.dtype(X[X.columns[0]]._column.dtype) datatype2 = np.dtype(X_embedded[X_embedded.columns[0]]._column.dtype) @@ -104,9 +98,12 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, metric='euclidean else: handle_ = handle.getHandle() - res = trustworthiness_score[float](handle_[0], d_X_ptr, - d_X_embedded_ptr, n_samples, n_features, - n_components, n_neighbors, metric_code) + if metric == 'euclidean': + res = trustworthiness_score[float, euclidean](handle_[0], d_X_ptr, + d_X_embedded_ptr, n_samples, n_features, + n_components, n_neighbors) + else: + raise Exception("Unknown metric") if handle is None: del handle_ From e5f89187ce3e1b951979996ad76ef80d1a3b94e8 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 14 May 2019 12:43:26 -0500 Subject: [PATCH 136/156] FIX Remove unused prims location variable --- cpp/CMakeLists.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 72a5bfa23e..5fdbfc9610 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -111,9 +111,6 @@ set(GTEST_LIBNAME "gtest_main" CACHE STRING 
set(FAISS_DIR ${PROJECT_SOURCE_DIR}/external/faiss CACHE STRING "Path to FAISS source directory") -set(MLPRIMS_DIR ${PROJECT_SOURCE_DIR}/src_prims/ CACHE STRING - "Path to the ml-prims repo") - set(CUB_DIR ${PROJECT_SOURCE_DIR}/external/cub CACHE STRING "Path to cub repo") @@ -323,7 +320,7 @@ install(TARGETS ${CUML_CPP_TARGET} DESTINATION lib) # - doxygen targets ------------------------------------------------------------------------------- # include(cmake/doxygen.cmake) -# add_doxygen_target(IN_DOXYFILE ${MLPRIMS_DIR}/Doxyfile.in +# add_doxygen_target(IN_DOXYFILE src_prims/Doxyfile.in # OUT_DOXYFILE ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile # CWD ${CMAKE_CURRENT_BINARY_DIR}) From 482a57ddcae503315dfb68836b247825ffd1c6b5 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 14 May 2019 12:57:24 -0500 Subject: [PATCH 137/156] FIX Flag typo in gpu ci build.sh --- ci/gpu/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index bd4ea501a6..5c25dba935 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -67,7 +67,7 @@ logger "Build libcuml..." mkdir -p $WORKSPACE/cpp/build cd $WORKSPACE/cpp/build logger "Run cmake libcuml..." -cmake -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=ON -DBLAS_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a $GPU_ARCH -DBUILD_PRIM_TESTS=OFF .. +cmake -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=ON -DBLAS_LIBRARIES=$CONDA_PREFIX/lib/libopenblas.a $GPU_ARCH -DBUILD_PRIMS_TESTS=OFF .. logger "Clean up make..." make clean From a4722809425b8fc7af5a99c8f1622fb33a6d83cd Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 14 May 2019 13:05:36 -0500 Subject: [PATCH 138/156] FIX make command in gpu ci build script --- ci/gpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 5c25dba935..e4d224fce3 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -73,7 +73,7 @@ logger "Clean up make..." 
make clean logger "Make libcuml++ and algorithm tests..." -make -j${PARALLEL_LEVEL} cuml++ ml_test ml_mg_test +make -j${PARALLEL_LEVEL} cuml++ ml ml_mg logger "Install libcuml++..." make -j${PARALLEL_LEVEL} install @@ -115,7 +115,7 @@ cmake -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX -DCMAKE_CXX11_ABI=ON -DBLAS_LIBRARIES logger "Clean up make..." make clean logger "Make ml-prims test..." -make -j${PARALLEL_LEVEL} prims_test +make -j${PARALLEL_LEVEL} prims logger "Run ml-prims test..." cd $WORKSPACE/cpp/build_prims From 2031196e706397cdd9ba1050e6f7ae4ef2fd6934 Mon Sep 17 00:00:00 2001 From: wxbn Date: Tue, 14 May 2019 21:01:29 +0000 Subject: [PATCH 139/156] Pydoc (+fixes) --- cuML/src/random_projection/rproj.cu | 8 +++--- cuML/src/random_projection/rproj.hxx | 20 +++++++------ cuML/src/random_projection/rproj_c.h | 4 +-- ml-prims/src/random/rng.h | 1 + ml-prims/test/rng.cu | 38 +++++++++++++++++++++++++ python/cuml/random_projection/rproj.pyx | 32 +++++++++++++++++++-- 6 files changed, 85 insertions(+), 18 deletions(-) diff --git a/cuML/src/random_projection/rproj.cu b/cuML/src/random_projection/rproj.cu index 24665cf308..c3a84cd9e1 100644 --- a/cuML/src/random_projection/rproj.cu +++ b/cuML/src/random_projection/rproj.cu @@ -23,9 +23,9 @@ namespace ML { using namespace MLCommon; - template void RPROJfit(cumlHandle& handle, rand_mat *random_matrix, paramsRPROJ* params); - template void RPROJfit(cumlHandle& handle, rand_mat *random_matrix, paramsRPROJ* params); - template void RPROJtransform(cumlHandle& handle, float *input, rand_mat *random_matrix, float *output, paramsRPROJ* params); - template void RPROJtransform(cumlHandle& handle, double *input, rand_mat *random_matrix, double *output, paramsRPROJ* params); + template void RPROJfit(const cumlHandle& handle, rand_mat *random_matrix, paramsRPROJ* params); + template void RPROJfit(const cumlHandle& handle, rand_mat *random_matrix, paramsRPROJ* params); + template void RPROJtransform(const cumlHandle& handle, float *input, 
rand_mat *random_matrix, float *output, paramsRPROJ* params); + template void RPROJtransform(const cumlHandle& handle, double *input, rand_mat *random_matrix, double *output, paramsRPROJ* params); }; \ No newline at end of file diff --git a/cuML/src/random_projection/rproj.hxx b/cuML/src/random_projection/rproj.hxx index e5dcbe1601..12dffae3cc 100644 --- a/cuML/src/random_projection/rproj.hxx +++ b/cuML/src/random_projection/rproj.hxx @@ -39,12 +39,13 @@ namespace ML { * @input param params: data structure that includes all the parameters of the model */ template - void gaussian_random_matrix(cumlHandle& h, rand_mat *random_matrix, + void gaussian_random_matrix(const cumlHandle& h, rand_mat *random_matrix, paramsRPROJ& params) { cudaStream_t stream = h.getStream(); + auto d_alloc = h.getDeviceAllocator(); int len = params.n_components * params.n_features; - allocate(random_matrix->dense_data, len); + random_matrix->dense_data = (math_t*)d_alloc->allocate(len * sizeof(math_t), stream); auto rng = Random::Rng(params.random_state); math_t scale = 1.0 / sqrt(double(params.n_components)); rng.normal(random_matrix->dense_data, len, math_t(0), scale, stream); @@ -57,15 +58,16 @@ namespace ML { * @input param params: data structure that includes all the parameters of the model */ template - void sparse_random_matrix(cumlHandle& h, rand_mat *random_matrix, + void sparse_random_matrix(const cumlHandle& h, rand_mat *random_matrix, paramsRPROJ& params) { cudaStream_t stream = h.getStream(); + auto d_alloc = h.getDeviceAllocator(); if (params.density == 1.0f) { int len = params.n_components * params.n_features; - allocate(random_matrix->dense_data, len); + random_matrix->dense_data = (math_t*)d_alloc->allocate(len * sizeof(math_t), stream); auto rng = Random::Rng(params.random_state); math_t scale = 1.0 / sqrt(math_t(params.n_components)); rng.scaled_bernoulli(random_matrix->dense_data, len, math_t(0.5), scale, stream); @@ -95,17 +97,17 @@ namespace ML { indptr[indptr_idx] = 
offset; size_t len = offset; - allocate(random_matrix->indices, len); + random_matrix->indices = (int*)d_alloc->allocate(len * sizeof(int), stream); updateDevice(random_matrix->indices, indices, len, stream); alloc->deallocate(indices, indices_alloc, stream); len = indptr_idx+1; - allocate(random_matrix->indptr, len); + random_matrix->indptr = (int*)d_alloc->allocate(len * sizeof(int), stream); updateDevice(random_matrix->indptr, indptr, len, stream); alloc->deallocate(indptr, indptr_alloc, stream); len = offset; - allocate(random_matrix->sparse_data, len); + random_matrix->sparse_data = (math_t*)d_alloc->allocate(len * sizeof(math_t), stream); auto rng = Random::Rng(params.random_state); math_t scale = sqrt(1.0 / params.density) / sqrt(params.n_components); rng.scaled_bernoulli(random_matrix->sparse_data, len, math_t(0.5), scale, stream); @@ -121,7 +123,7 @@ namespace ML { * @input param params: data structure that includes all the parameters of the model */ template - void RPROJfit(cumlHandle& handle, rand_mat *random_matrix, paramsRPROJ* params) + void RPROJfit(const cumlHandle& handle, rand_mat *random_matrix, paramsRPROJ* params) { random_matrix->reset(); @@ -147,7 +149,7 @@ namespace ML { * @input param params: data structure that includes all the parameters of the model */ template - void RPROJtransform(cumlHandle& handle, math_t *input, rand_mat *random_matrix, + void RPROJtransform(const cumlHandle& handle, math_t *input, rand_mat *random_matrix, math_t *output, paramsRPROJ* params) { cudaStream_t stream = handle.getStream(); diff --git a/cuML/src/random_projection/rproj_c.h b/cuML/src/random_projection/rproj_c.h index af0715e0a2..f2d7352f16 100644 --- a/cuML/src/random_projection/rproj_c.h +++ b/cuML/src/random_projection/rproj_c.h @@ -69,11 +69,11 @@ namespace ML{ }; template - void RPROJfit(cumlHandle& handle, rand_mat *random_matrix, + void RPROJfit(const cumlHandle& handle, rand_mat *random_matrix, paramsRPROJ* params); template - void 
RPROJtransform(cumlHandle& handle, math_t *input, + void RPROJtransform(const cumlHandle& handle, math_t *input, rand_mat *random_matrix, math_t *output, paramsRPROJ* params); diff --git a/ml-prims/src/random/rng.h b/ml-prims/src/random/rng.h index 0d1b9610c7..2575126776 100644 --- a/ml-prims/src/random/rng.h +++ b/ml-prims/src/random/rng.h @@ -290,6 +290,7 @@ class Rng { * @param ptr the output array * @param len the number of elements in the output * @param prob coin-toss probability for heads + * @param scale scaling factor * @param stream stream where to launch the kernel */ template diff --git a/ml-prims/test/rng.cu b/ml-prims/test/rng.cu index 117a781d10..f4a23e9176 100644 --- a/ml-prims/test/rng.cu +++ b/ml-prims/test/rng.cu @@ -421,5 +421,43 @@ INSTANTIATE_TEST_CASE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd)); // std::cout << "mean_res:" << h_mean_result << "\n"; } +template +class ScaledBernoulliTest : public ::testing::Test { + protected: + void SetUp() override { + CUDA_CHECK(cudaStreamCreate(&stream)); + + Rng r(42); + + allocate(data, len * sizeof(T), stream); + r.scaled_bernoulli(data, len, T(0.5), T(scale), stream); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(data)); + } + + void rangeCheck() { + T* h_data = new T[len]; + updateHost(h_data, data, len, stream); + ASSERT_TRUE(std::none_of(h_data, h_data + len, [](const T& a) { return a < -scale || a > scale; })); + delete[] h_data; + } + + T* data; + cudaStream_t stream; +}; + +typedef ScaledBernoulliTest ScaledBernoulliTest1; +TEST_F(ScaledBernoulliTest1, RangeCheck) { + rangeCheck(); +} + +typedef ScaledBernoulliTest ScaledBernoulliTest2; +TEST_F(ScaledBernoulliTest2, RangeCheck) { + rangeCheck(); +} + + } // end namespace Random } // end namespace MLCommon diff --git a/python/cuml/random_projection/rproj.pyx b/python/cuml/random_projection/rproj.pyx index f7047a8d00..1162809f74 100644 --- a/python/cuml/random_projection/rproj.pyx +++ b/python/cuml/random_projection/rproj.pyx @@ 
-53,11 +53,11 @@ cdef extern from "random_projection/rproj_c.h" namespace "ML": size_t sparse_data_size # sparse CSC random matrix number of non-zero elements # Function used to fit the model - cdef void RPROJfit[T](cumlHandle& handle, rand_mat[T] *random_matrix, + cdef void RPROJfit[T](const cumlHandle& handle, rand_mat[T] *random_matrix, paramsRPROJ* params) # Function used to apply data transformation - cdef void RPROJtransform[T](cumlHandle& handle, T *input, + cdef void RPROJtransform[T](const cumlHandle& handle, T *input, rand_mat[T] *random_matrix, T *output, paramsRPROJ* params) @@ -66,6 +66,32 @@ cdef extern from "random_projection/rproj_c.h" namespace "ML": def johnson_lindenstrauss_min_dim(n_samples, eps=0.1): + """ + In mathematics, the Johnson–Lindenstrauss lemma states that high-dimensional data + can be embedded into lower dimension while preserving the distances. + + With p the random projection : + (1 - eps) ||u - v||^2 < ||p(u) - p(v)||^2 < (1 + eps) ||u - v||^2 + + This function finds the minimum number of components to guarantee that + the embedding is inside the eps error tolerance. + + Parameters + ---------- + + n_samples : int + Number of samples. + eps : float in (0,1) (default = 0.1) + Maximum distortion rate as defined by the Johnson-Lindenstrauss lemma. + + Returns + ------- + + n_components : int + The minimal number of components to guarantee with good probability + an eps-embedding with n_samples. + + """ return c_johnson_lindenstrauss_min_dim(n_samples, eps) cdef class BaseRandomProjection(): @@ -419,7 +445,7 @@ class SparseRandomProjection(Base, BaseRandomProjection): The Johnson–Lindenstrauss lemma can produce very conservative n_components parameter as it makes no assumption on dataset structure. - density : float in range ]0, 1] (default = 'auto') + density : float in range (0, 1] (default = 'auto') Ratio of non-zero component in the random projection matrix. 
If density = 'auto', the value is set to the minimum density From a0ab66d052969c0bd5645679728c121c89fefbb5 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 14 May 2019 16:14:36 -0500 Subject: [PATCH 140/156] FIX CI mg test fix --- ci/gpu/build.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index e4d224fce3..97592b1836 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -94,9 +94,10 @@ logger "GoogleTest for libcuml..." cd $WORKSPACE/cpp/build GTEST_OUTPUT="xml:${WORKSPACE}/test-results/libcuml_cpp/" ./test/ml -logger "GoogleTest for libcuml mg..." -cd $WORKSPACE/cpp/build -GTEST_OUTPUT="xml:${WORKSPACE}/test-results/libcuml_cpp_mg/" ./test/ml_mg +# Disabled while CI/the test become compatible +# logger "GoogleTest for libcuml mg..." +# cd $WORKSPACE/cpp/build +# GTEST_OUTPUT="xml:${WORKSPACE}/test-results/libcuml_cpp_mg/" ./test/ml_mg logger "Python pytest for cuml..." cd $WORKSPACE/python From 45ad13002e66c164b00f998362a3e3cdeb74e952 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 14 May 2019 17:31:22 -0500 Subject: [PATCH 141/156] FIX PEP8 style fixes --- .../cuml/linear_model/linear_regression.pyx | 3 +- python/cuml/linear_model/ridge.pyx | 62 +++++------ python/cuml/manifold/umap.pyx | 100 +++++++++--------- python/cuml/neighbors/nearest_neighbors.pyx | 45 ++++---- python/cuml/solvers/cd.pyx | 62 +++++------ python/cuml/solvers/sgd.pyx | 56 +++++----- 6 files changed, 163 insertions(+), 165 deletions(-) diff --git a/python/cuml/linear_model/linear_regression.pyx b/python/cuml/linear_model/linear_regression.pyx index ff9fccd7db..174f0d36b4 100644 --- a/python/cuml/linear_model/linear_regression.pyx +++ b/python/cuml/linear_model/linear_regression.pyx @@ -179,7 +179,8 @@ class LinearRegression(Base): """ - def __init__(self, algorithm='eig', fit_intercept=True, normalize=False, handle=None): + def __init__(self, algorithm='eig', fit_intercept=True, normalize=False, + 
handle=None): """ Initializes the linear regression class. diff --git a/python/cuml/linear_model/ridge.pyx b/python/cuml/linear_model/ridge.pyx index 36da6cccfd..0f0b22445e 100644 --- a/python/cuml/linear_model/ridge.pyx +++ b/python/cuml/linear_model/ridge.pyx @@ -311,16 +311,16 @@ class Ridge(Base, RegressorMixin): c_alpha1 = self.alpha ridgeFit(handle_[0], X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - &c_alpha1, - self.n_alpha, - coef_ptr, - &c_intercept1, - self.fit_intercept, - self.normalize, - self.algo) + self.n_rows, + self.n_cols, + y_ptr, + &c_alpha1, + self.n_alpha, + coef_ptr, + &c_intercept1, + self.fit_intercept, + self.normalize, + self.algo) self.intercept_ = c_intercept1 else: @@ -328,16 +328,16 @@ class Ridge(Base, RegressorMixin): ridgeFit(handle_[0], X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - &c_alpha2, - self.n_alpha, - coef_ptr, - &c_intercept2, - self.fit_intercept, - self.normalize, - self.algo) + self.n_rows, + self.n_cols, + y_ptr, + &c_alpha2, + self.n_alpha, + coef_ptr, + &c_intercept2, + self.fit_intercept, + self.normalize, + self.algo) self.intercept_ = c_intercept2 @@ -387,20 +387,20 @@ class Ridge(Base, RegressorMixin): if pred_datatype.type == np.float32: ridgePredict(handle_[0], - X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr) + X_ptr, + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr) else: ridgePredict(handle_[0], X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr) + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr) self.handle.sync() diff --git a/python/cuml/manifold/umap.pyx b/python/cuml/manifold/umap.pyx index 1ba3411466..17a631f8b8 100644 --- a/python/cuml/manifold/umap.pyx +++ b/python/cuml/manifold/umap.pyx @@ -19,27 +19,27 @@ # cython: embedsignature = True # cython: language_level = 3 -import numpy as np -import pandas as pd import cudf +import cuml import ctypes +import numpy as np +import pandas as pd -import cuml -from libcpp.memory cimport 
shared_ptr -cimport cuml.common.handle -cimport cuml.common.cuda +from cuml import numba_utils from cuml.common.base import Base from cuml.common.handle cimport cumlHandle - -from cuml import numba_utils - from numba import cuda from libcpp cimport bool from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free +from libcpp.memory cimport shared_ptr + +cimport cuml.common.handle +cimport cuml.common.cuda + cdef extern from "umap/umapparams.h" namespace "ML::UMAPParams": enum MetricType: @@ -123,14 +123,14 @@ cdef class UMAPImpl: negative_sample_rate=5, transform_queue_size=4.0, init="spectral", - verbose = False, - a = None, - b = None, - target_n_neighbors = -1, - target_weights = 0.5, - target_metric = "euclidean", - should_downcast = True, - handle = None): + verbose=False, + a=None, + b=None, + target_n_neighbors=-1, + target_weights=0.5, + target_metric="euclidean", + should_downcast=True, + handle=None): self.handle = handle @@ -231,7 +231,7 @@ cdef class UMAPImpl: return X - def fit(self, X, y = None): + def fit(self, X, y=None): assert len(X.shape) == 2, 'data should be two dimensional' assert X.shape[0] > 1, 'need more than 1 sample to build nearest neighbors graph' # noqa E501 @@ -250,7 +250,7 @@ cdef class UMAPImpl: order="C", dtype=np.float32)) self.embeddings = self.arr_embed.device_ctypes_pointer.value - cdef cumlHandle * handle_ = < cumlHandle * > < size_t > self.handle.getHandle() + cdef cumlHandle* handle_ = self.handle.getHandle() cdef uintptr_t y_raw if y is not None: @@ -277,7 +277,7 @@ cdef class UMAPImpl: del X_m - def fit_transform(self, X, y = None): + def fit_transform(self, X, y=None): self.fit(X, y) if isinstance(X, cudf.DataFrame): @@ -306,14 +306,13 @@ cdef class UMAPImpl: cdef cumlHandle* handle_ = self.handle.getHandle() - self.umap.transform( - handle_[0], - x_ptr, - X_m.shape[0], - X_m.shape[1], - self.embeddings, - self.arr_embed.shape[0], - embed_ptr) + self.umap.transform(handle_[0], + x_ptr, + X_m.shape[0], 
+ X_m.shape[1], + self.embeddings, + self.arr_embed.shape[0], + embed_ptr) if isinstance(X, cudf.DataFrame): ret = cudf.DataFrame() @@ -433,29 +432,27 @@ class UMAP(Base): """ - - def __init__(self, - n_neighbors=15, - n_components=2, - n_epochs=500, - learning_rate=1.0, - min_dist=0.1, - spread=1.0, - set_op_mix_ratio=1.0, - local_connectivity=1.0, - repulsion_strength=1.0, - negative_sample_rate=5, - transform_queue_size=4.0, - init="spectral", - verbose = False, - a = None, - b = None, - target_n_neighbors = -1, - target_weights = 0.5, - target_metric = "euclidean", - should_downcast = True, - handle = None): + n_neighbors=15, + n_components=2, + n_epochs=500, + learning_rate=1.0, + min_dist=0.1, + spread=1.0, + set_op_mix_ratio=1.0, + local_connectivity=1.0, + repulsion_strength=1.0, + negative_sample_rate=5, + transform_queue_size=4.0, + init="spectral", + verbose=False, + a=None, + b=None, + target_n_neighbors=-1, + target_weights=0.5, + target_metric="euclidean", + should_downcast=True, + handle=None): super(UMAP, self).__init__(handle, verbose) @@ -479,8 +476,7 @@ class UMAP(Base): should_downcast, self.handle) - - def fit(self, X, y = None): + def fit(self, X, y=None): """Fit X into an embedded space. 
Parameters ---------- diff --git a/python/cuml/neighbors/nearest_neighbors.pyx b/python/cuml/neighbors/nearest_neighbors.pyx index 82a93ee71a..e60e376369 100644 --- a/python/cuml/neighbors/nearest_neighbors.pyx +++ b/python/cuml/neighbors/nearest_neighbors.pyx @@ -24,22 +24,25 @@ import pandas as pd import cudf import ctypes import cuml -from libcpp.memory cimport shared_ptr -cimport cuml.common.handle -cimport cuml.common.cuda + +from cuml import numba_utils from cuml.common.base import Base +from cython.operator cimport dereference as deref + from libcpp cimport bool +from libcpp.memory cimport shared_ptr from librmm_cffi import librmm as rmm from libc.stdlib cimport malloc, free -from cython.operator cimport dereference as deref -from numba import cuda from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free -from cuml import numba_utils +from numba import cuda + +cimport cuml.common.handle +cimport cuml.common.cuda cdef extern from "cuML.hpp" namespace "ML" nogil: cdef cppclass deviceAllocator: @@ -91,9 +94,8 @@ cdef class NearestNeighborsImpl: cpdef kNNParams *input cpdef object handle - - def __cinit__(self, n_neighbors = 5, n_gpus = 1, devices = None, - verbose = False, should_downcast = True, handle = None): + def __cinit__(self, n_neighbors=5, n_gpus=1, devices=None, + verbose=False, should_downcast=True, handle=None): """ Construct the NearestNeighbors object for training and querying. 
@@ -186,7 +188,7 @@ cdef class NearestNeighborsImpl: n_dims = X.shape[1] cdef cumlHandle* handle_ = self.handle.getHandle() - self.k = new kNN(handle_[0], n_dims, verbose = self._verbose) + self.k = new kNN(handle_[0], n_dims, verbose=self._verbose) cdef uintptr_t X_ctype = -1 cdef uintptr_t dev_ptr = -1 @@ -251,7 +253,7 @@ cdef class NearestNeighborsImpl: del self.k cdef cumlHandle* handle_ = self.handle.getHandle() - self.k = new kNN(handle_[0], n_dims, verbose = self._verbose) + self.k = new kNN(handle_[0], n_dims, verbose=self._verbose) del self.input self.input = malloc(len(alloc_info) * sizeof(kNNParams)) @@ -269,7 +271,7 @@ cdef class NearestNeighborsImpl: self.k.fit( self.input, len(alloc_info)) - def kneighbors(self, X, k = None): + def kneighbors(self, X, k=None): if k is None: k = self.n_neighbors @@ -429,11 +431,12 @@ class NearestNeighbors(Base): `_. """ - def __init__(self, n_neighbors = 5, n_gpus = 1, devices = None, verbose = False, - should_downcast = True, handle = None): + def __init__(self, n_neighbors=5, n_gpus=1, devices=None, verbose=False, + should_downcast=True, handle=None): super(NearestNeighbors, self).__init__(handle, verbose) - self._impl = NearestNeighborsImpl(n_neighbors, n_gpus, devices, verbose, - should_downcast, self.handle) + self._impl = NearestNeighborsImpl(n_neighbors, n_gpus, devices, + verbose, should_downcast, + self.handle) def fit(self, X): """ @@ -446,8 +449,7 @@ class NearestNeighbors(Base): """ return self._impl.fit(X) - - def kneighbors(self, X, k = None): + def kneighbors(self, X, k=None): """ Query the GPU index for the k nearest neighbors of column vectors in X. @@ -480,12 +482,11 @@ class NearestNeighbors(Base): """ return self._impl.kneighbors(X, k) - def _fit_mg(self, n_dims, alloc_info): """ - Fits a model using multiple GPUs. This method takes in a list of dict objects - representing the distribution of the underlying device pointers. The device - information can be extracted from the pointers. 
+ Fits a model using multiple GPUs. This method takes in a list of dict + objects representing the distribution of the underlying device + pointers. The device information can be extracted from the pointers. :param n_dims the number of features for each vector diff --git a/python/cuml/solvers/cd.pyx b/python/cuml/solvers/cd.pyx index 513fdc9c4d..c3ee51cebd 100644 --- a/python/cuml/solvers/cd.pyx +++ b/python/cuml/solvers/cd.pyx @@ -30,37 +30,36 @@ from cuml.common.handle cimport cumlHandle cdef extern from "solver/solver.hpp" namespace "ML::Solver": cdef void cdFit(cumlHandle& handle, - float *input, - int n_rows, - int n_cols, - float *labels, - float *coef, - float *intercept, - bool fit_intercept, - bool normalize, - int epochs, - int loss, - float alpha, - float l1_ratio, - bool shuffle, - float tol) except + - + float *input, + int n_rows, + int n_cols, + float *labels, + float *coef, + float *intercept, + bool fit_intercept, + bool normalize, + int epochs, + int loss, + float alpha, + float l1_ratio, + bool shuffle, + float tol) except + cdef void cdFit(cumlHandle& handle, - double *input, - int n_rows, - int n_cols, - double *labels, - double *coef, - double *intercept, - bool fit_intercept, - bool normalize, - int epochs, - int loss, - double alpha, - double l1_ratio, - bool shuffle, - double tol) except + + double *input, + int n_rows, + int n_cols, + double *labels, + double *coef, + double *intercept, + bool fit_intercept, + bool normalize, + int epochs, + int loss, + double alpha, + double l1_ratio, + bool shuffle, + double tol) except + cdef void cdPredict(cumlHandle& handle, const float *input, @@ -80,6 +79,7 @@ cdef extern from "solver/solver.hpp" namespace "ML::Solver": double *preds, int loss) except + + class CD(Base): """ Coordinate Descent (CD) is a very common optimization algorithm that @@ -152,8 +152,8 @@ class CD(Base): """ def __init__(self, loss='squared_loss', alpha=0.0001, l1_ratio=0.15, - fit_intercept=True, normalize=False, 
max_iter=1000, tol=1e-3, - shuffle=True, handle=None): + fit_intercept=True, normalize=False, max_iter=1000, tol=1e-3, + shuffle=True, handle=None): if loss in ['squared_loss']: self.loss = self._get_loss_int(loss) diff --git a/python/cuml/solvers/sgd.pyx b/python/cuml/solvers/sgd.pyx index 4b71384b62..6488ad43da 100644 --- a/python/cuml/solvers/sgd.pyx +++ b/python/cuml/solvers/sgd.pyx @@ -90,22 +90,22 @@ cdef extern from "solver/solver.hpp" namespace "ML::Solver": int loss) except + cdef void sgdPredictBinaryClass(cumlHandle& handle, - const float *input, - int n_rows, - int n_cols, - const float *coef, - float intercept, - float *preds, - int loss) except + + const float *input, + int n_rows, + int n_cols, + const float *coef, + float intercept, + float *preds, + int loss) except + cdef void sgdPredictBinaryClass(cumlHandle& handle, - const double *input, - int n_rows, - int n_cols, - const double *coef, - double intercept, - double *preds, - int loss) except + + const double *input, + int n_rows, + int n_cols, + const double *coef, + double intercept, + double *preds, + int loss) except + class SGD(Base): @@ -491,22 +491,22 @@ class SGD(Base): if pred_datatype.type == np.float32: sgdPredictBinaryClass(handle_[0], - X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + X_ptr, + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) else: sgdPredictBinaryClass(handle_[0], - X_ptr, - n_rows, - n_cols, - coef_ptr, - self.intercept_, - preds_ptr, - self.loss) + X_ptr, + n_rows, + n_cols, + coef_ptr, + self.intercept_, + preds_ptr, + self.loss) self.handle.sync() From 33e430583d9e372bab23127754646b4a2e7e3168 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Tue, 14 May 2019 17:35:04 -0500 Subject: [PATCH 142/156] FIX Remove conflicted files --- ...outer.home's conflicted copy 2019-05-14).h | 905 ----------------- ...outer.home's conflicted copy 2019-05-14).h | 936 ------------------ ...ter.home's conflicted 
copy 2019-05-14).txt | 88 -- ...uter.home's conflicted copy 2019-05-14).cu | 305 ------ ...uter.home's conflicted copy 2019-05-14).cu | 366 ------- ...uter.home's conflicted copy 2019-05-14).cu | 244 ----- ...uter.home's conflicted copy 2019-05-14).cu | 239 ----- ...uter.home's conflicted copy 2019-05-14).cu | 239 ----- 8 files changed, 3322 deletions(-) delete mode 100644 ml-prims/src/sparse/coo (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).h delete mode 100644 ml-prims/src/sparse/csr (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).h delete mode 100644 ml-prims/test/CMakeLists (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).txt delete mode 100644 ml-prims/test/coo (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu delete mode 100644 ml-prims/test/csr (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu delete mode 100644 ml-prims/test/hinge (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu delete mode 100644 ml-prims/test/linearReg (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu delete mode 100644 ml-prims/test/logisticReg (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu diff --git a/ml-prims/src/sparse/coo (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).h b/ml-prims/src/sparse/coo (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).h deleted file mode 100644 index ad4f7eecdc..0000000000 --- a/ml-prims/src/sparse/coo (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).h +++ /dev/null @@ -1,905 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - -#include "csr.h" - -#include "cusparse_wrappers.h" - -#include - -#include -#include -#include - -#include "cuda_utils.h" -#include - -#include - -#pragma once - -namespace MLCommon { -namespace Sparse { - - -/** @brief A Container object for sparse coordinate - * format. - * - * @tparam T: the type of the value array. - * - */ -template -class COO { - - protected: - bool owner; - public: - int *rows; - int *cols; - T *vals; - int nnz; - int n_rows; - int n_cols; - bool device; - - - /** - * @param device: are the underlying arrays going to be on device? - */ - COO(bool device = true): rows(nullptr), cols(nullptr), vals(nullptr), - nnz(-1), n_rows(-1), n_cols(-1), - device(device), owner(true){} - - /** - * @param rows: coo rows array - * @param cols: coo cols array - * @param vals: coo vals array - * @param nnz: size of the rows/cols/vals arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - * @param device: are the underlying arrays on device? - */ - COO(int *rows, int *cols, T *vals, - int nnz, int n_rows = -1, int n_cols = -1, - bool device = true): - rows(rows), cols(cols), vals(vals), - nnz(nnz), n_rows(n_rows), n_cols(n_cols), - device(device), owner(false){} - - /** - * @param nnz: size of the rows/cols/vals arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - * @param device: are the underlying arrays on device? 
- */ - COO(int nnz, - int n_rows = -1, int n_cols = -1, - bool device = true, bool init = true): - rows(nullptr), cols(nullptr), vals(nullptr), nnz(nnz), - n_rows(n_rows), n_cols(n_cols), - device(device), owner(true){ - this->allocate(nnz, n_rows, n_cols, device, init); - } - - - ~COO() { - this->destroy(); - } - - /** - * @brief Size should be > 0, with the number of rows - * and cols in the dense matrix being > 0. - */ - bool validate_size() const { - if(this->nnz < 0 || n_rows < 0 || n_cols < 0) - return false; - return true; - } - - /** - * @brief If the underlying arrays have not been set, - * return false. Otherwise true. - */ - bool validate_mem() const { - if(this->rows == nullptr || - this->cols == nullptr || - this->vals == nullptr) { - return false; - } - - return true; - } - - /** - * @brief Send human-readable state information to output stream - */ - friend std::ostream & operator << (std::ostream &out, const COO &c) { - - if(c.validate_size() && c.validate_mem() ) { - - cudaStream_t stream; - cudaStreamCreate(&stream); - - out << arr2Str(c.rows, c.nnz, "rows", stream) << std::endl; - out << arr2Str(c.cols, c.nnz, "cols", stream) << std::endl; - out << arr2Str(c.vals, c.nnz, "vals", stream) << std::endl; - out << "nnz=" << c.nnz << std::endl; - out << "n_rows=" << c.n_rows << std::endl; - out << "n_cols=" << c.n_cols << std::endl; - out << "owner=" << c.owner << std::endl; - - cudaStreamDestroy(stream); - } else { - out << "Cannot print COO object: Uninitialized or invalid." 
<< std::endl; - } - - return out; - } - - /** - * @brief Set the number of rows and cols - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of columns in the dense matrix - */ - void setSize(int n_rows, int n_cols) { - this->n_rows = n_rows; - this->n_cols = n_cols; - } - - /** - * @brief Set the number of rows and cols for a square dense matrix - * @param n: number of rows and cols - */ - void setSize(int n) { - this->n_rows = n; - this->n_cols = n; - } - - /** - * @brief Allocate the underlying arrays - * @param nnz: size of underlying row/col/val arrays - * @param device: allocate on device or host? - * @param init: should values be initialized to 0? - */ - void allocate(int nnz, - bool device = true, - bool init = true) { - this->allocate(nnz, -1, device, init); - } - - /** - * @brief Allocate the underlying arrays - * @param nnz: size of the underlying row/col/val arrays - * @param size: the number of rows/cols in a square dense matrix - * @param device: allocate on device or host? - * @param init: should values be initialized to 0? - */ - void allocate(int nnz, int size, - bool device = true, - bool init = true) { - this->allocate(nnz, size, size, device, init); - } - - /** - * @brief Allocate the underlying arrays - * @param nnz: size of the underlying row/col/val arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of columns in the dense matrix - * @param device: allocate on device or host? - * @param init: should values be initialized to 0? 
- */ - void allocate(int nnz, int n_rows, int n_cols, - bool device = true, - bool init = true) { - this->n_rows = n_rows; - this->n_cols = n_cols; - this->nnz = nnz; - this->owner = true; - - if(device) { - MLCommon::allocate(this->rows, this->nnz, init); - MLCommon::allocate(this->cols, this->nnz, init); - MLCommon::allocate(this->vals, this->nnz, init); - } else { - this->rows = (int*)malloc(this->nnz*sizeof(int)); - this->cols = (int*)malloc(this->nnz*sizeof(int)); - this->vals = (T*)malloc(this->nnz*sizeof(T)); - } - } - - /** - * @brief Deallocate the underlying arrays if this object - * owns the underlying memory - */ - void destroy() { - - if(this->owner) { - try { - if(rows != nullptr) { - if(this->device) - CUDA_CHECK(cudaFree(rows)); - else - free(rows); - } - - if(cols != nullptr) { - if(this->device) - CUDA_CHECK(cudaFree(cols)); - else - free(cols); - } - - if(vals != nullptr) { - if(this->device) - CUDA_CHECK(cudaFree(vals)); - else - free(vals); - } - - rows = nullptr; - cols = nullptr; - vals = nullptr; - - } catch(Exception &e) { - std::cout << "An exception occurred freeing COO memory" << std::endl; - } - } - } -}; - - -template -cusparseStatus_t cusparse_gthr(cusparseHandle_t handle, - int nnz, - float *vals, - float *vals_sorted, - int *d_P) { - return cusparseSgthr( - handle, - nnz, - vals, - vals_sorted, - d_P, - CUSPARSE_INDEX_BASE_ZERO - ); -} - -template -cusparseStatus_t cusparse_gthr(cusparseHandle_t handle, - int nnz, - double *vals, - double *vals_sorted, - int *d_P) { - return cusparseDgthr( - handle, - nnz, - vals, - vals_sorted, - d_P, - CUSPARSE_INDEX_BASE_ZERO - ); -} - - - -/** - * @brief Sorts the arrays that comprise the coo matrix - * by row. 
- * - * @param m number of rows in coo matrix - * @param n number of cols in coo matrix - * @param rows rows array from coo matrix - * @param cols cols array from coo matrix - * @param vals vals array from coo matrix - * @param stream: cuda stream to use - */ -template -void coo_sort(int m, int n, int nnz, - int *rows, int *cols, T *vals, - cudaStream_t stream = 0) { - - cusparseHandle_t handle = NULL; - - size_t pBufferSizeInBytes = 0; - void *pBuffer = NULL; - int *d_P = NULL; - - CUSPARSE_CHECK(cusparseCreate(&handle)); - - CUSPARSE_CHECK(cusparseSetStream(handle, stream)); - - CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt( - handle, - m, - n, - nnz, - rows, - cols, - &pBufferSizeInBytes - )); - - allocate(d_P, nnz); - cudaMalloc(&pBuffer, pBufferSizeInBytes*sizeof(char)); - - CUSPARSE_CHECK(cusparseCreateIdentityPermutation( - handle, - nnz, - d_P)); - - CUSPARSE_CHECK(cusparseXcoosortByRow( - handle, - m, - n, - nnz, - rows, - cols, - d_P, - pBuffer - )); - - T* vals_sorted; - allocate(vals_sorted, nnz); - - CUSPARSE_CHECK(cusparse_gthr( - handle, - nnz, - vals, - vals_sorted, - d_P - )); - - cudaDeviceSynchronize(); - - - copy(vals, vals_sorted, nnz, stream); - - cudaFree(d_P); - cudaFree(vals_sorted); - cudaFree(pBuffer); - CUSPARSE_CHECK(cusparseDestroy(handle)); -} - - -/** - * @brief Sort the underlying COO arrays by row - * @tparam T: the type name of the underlying value array - * @param in: COO to sort by row - * @param stream: the cuda stream to use - */ -template - void coo_sort(COO* const in, cudaStream_t stream = 0) { - coo_sort(in->n_rows, in->n_cols, in->nnz, - in->rows, in->cols, in->vals, stream); - } - - -template -__global__ void coo_remove_zeros_kernel( - const int *rows, const int *cols, const T *vals, int nnz, - int *crows, int *ccols, T *cvals, - int *ex_scan, int *cur_ex_scan, int m) { - - int row = (blockIdx.x * TPB_X) + threadIdx.x; - - if (row < m) { - int start = cur_ex_scan[row]; - int stop = MLCommon::Sparse::get_stop_idx(row, m, 
nnz, cur_ex_scan); - int cur_out_idx = ex_scan[row]; - - for (int idx = start; idx < stop; idx++) { - if (vals[idx] != 0.0) { - crows[cur_out_idx] = rows[idx]; - ccols[cur_out_idx] = cols[idx]; - cvals[cur_out_idx] = vals[idx]; - ++cur_out_idx; - } - } - } -} - -template -__global__ void coo_remove_scalar_kernel( - const int *rows, const int *cols, const T *vals, int nnz, - int *crows, int *ccols, T *cvals, - int *ex_scan, int *cur_ex_scan, int m, T scalar) { - - int row = (blockIdx.x * TPB_X) + threadIdx.x; - - if (row < m) { - int start = cur_ex_scan[row]; - int stop = MLCommon::Sparse::get_stop_idx(row, m, nnz, cur_ex_scan); - int cur_out_idx = ex_scan[row]; - - for (int idx = start; idx < stop; idx++) { - if (vals[idx] != scalar) { - crows[cur_out_idx] = rows[idx]; - ccols[cur_out_idx] = cols[idx]; - cvals[cur_out_idx] = vals[idx]; - ++cur_out_idx; - } - } - } -} - - - -/** - * @brief Count all the rows in the coo row array and place them in the - * results matrix, indexed by row. - * - * @tparam TPB_X: number of threads to use per block - * @param rows the rows array of the coo matrix - * @param nnz the size of the rows array - * @param results array to place results - */ -template -__global__ void coo_row_count_kernel(int* const rows, int nnz, - int *results) { - int row = (blockIdx.x * TPB_X) + threadIdx.x; - if(row < nnz) { - atomicAdd(results+rows[row], 1); - } -} - -/** - * @brief Count the number of values for each row - * @tparam TPB_X: number of threads to use per block - * @param rows: rows array of the COO matrix - * @param nnz: size of the rows array - * @param results: output result array - * @param stream: cuda stream to use - */ -template -void coo_row_count(int* const rows, int nnz, int *results, - cudaStream_t stream) { - dim3 grid_rc(MLCommon::ceildiv(nnz, TPB_X), 1, 1); - dim3 blk_rc(TPB_X, 1, 1); - - coo_row_count_kernel<<>>( - rows, nnz, results); -} - -/** - * @brief Count the number of values for each row - * @tparam TPB_X: number of 
threads to use per block - * @tparam T: type name of underlying values array - * @param in: input COO object for counting rows - * @param results: output array with row counts (size=in->n_rows) - * @param stream: cuda stream to use - */ -template -void coo_row_count(COO* const in, int *results, cudaStream_t stream = 0) { - dim3 grid_rc(MLCommon::ceildiv(in->nnz, TPB_X), 1, 1); - dim3 blk_rc(TPB_X, 1, 1); - - coo_row_count_kernel<<>>( - in->rows, in->nnz, results); -} - -template -__global__ void coo_row_count_nz_kernel(int* const rows, T* const vals, int nnz, - int *results) { - int row = (blockIdx.x * TPB_X) + threadIdx.x; - if(row < nnz && vals[row] != 0.0) { - atomicAdd(results+rows[row], 1); - } -} - -template -__global__ void coo_row_count_scalar_kernel(int* const rows, T* const vals, int nnz, - T scalar, int *results) { - int row = (blockIdx.x * TPB_X) + threadIdx.x; - if(row < nnz && vals[row] != scalar) { - atomicAdd(results+rows[row], 1); - } -} - -/** - * @brief Count the number of values for each row matching a particular scalar - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param in: Input COO array - * @param scalar: scalar to match for counting rows - * @param results: output row counts - * @param stream: cuda stream to use - */ -template -void coo_row_count_scalar(COO* const in, T scalar, int *results, - cudaStream_t stream = 0) { - dim3 grid_rc(MLCommon::ceildiv(in->nnz, TPB_X), 1, 1); - dim3 blk_rc(TPB_X, 1, 1); - - coo_row_count_scalar_kernel<<>>( - in->rows, in->vals, in->nnz, scalar, results); -} - -/** - * @brief Count the number of values for each row matching a particular scalar - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param rows: Input COO row array - * @param cols: Input COO col array - * @param vals: Input COO val arrays - * @param nnz: size of input COO arrays - * @param scalar: scalar to 
match for counting rows - * @param results: output row counts - * @param stream: cuda stream to use - */ -template -void coo_row_count_scalar(int* const rows, T* const vals, int nnz, T scalar, - int *results, cudaStream_t stream = 0) { - dim3 grid_rc(MLCommon::ceildiv(nnz, TPB_X), 1, 1); - dim3 blk_rc(TPB_X, 1, 1); - - coo_row_count_scalar_kernel<<>>( - rows, vals, nnz, scalar, results); -} - -/** - * @brief Count the number of nonzeros for each row - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param rows: Input COO row array - * @param cols: Input COO col array - * @param vals: Input COO val arrays - * @param nnz: size of input COO arrays - * @param results: output row counts - * @param stream: cuda stream to use - */ -template -void coo_row_count_nz(int* const rows, T* const vals, int nnz, int *results, - cudaStream_t stream = 0) { - dim3 grid_rc(MLCommon::ceildiv(nnz, TPB_X), 1, 1); - dim3 blk_rc(TPB_X, 1, 1); - - coo_row_count_nz_kernel<<>>( - rows, vals, nnz, results); -} - -/** - * @brief Count the number of nonzero values for each row - * @tparam TPB_X: number of threads to use per block - * @tparam T: the type name of the underlying value arrays - * @param in: Input COO array - * @param results: output row counts - * @param stream: cuda stream to use - */ -template -void coo_row_count_nz(COO* const in, int *results, cudaStream_t stream = 0) { - dim3 grid_rc(MLCommon::ceildiv(in->nnz, TPB_X), 1, 1); - dim3 blk_rc(TPB_X, 1, 1); - - coo_row_count_nz_kernel<<>>( - in->rows, in->vals, in->nnz, results); -} - - - -/** - * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix. 
- * - * @param rows: input array of rows (size n) - * @param cols: input array of cols (size n) - * @param vals: input array of vals (size n) - * @param nnz: size of current rows/cols/vals arrays - * @param crows: compressed array of rows - * @param ccols: compressed array of cols - * @param cvals: compressed array of vals - * @param cnnz: array of non-zero counts per row - * @param cur_cnnz array of counts per row - * @param scalar: scalar to remove from arrays - * @param n: number of rows in dense matrix - * @param stream: cuda stream to use - */ -template -void coo_remove_scalar( - const int *rows, const int *cols, const T *vals, int nnz, - int *crows, int *ccols, T *cvals, - int *cnnz, int *cur_cnnz, T scalar, int n, - cudaStream_t stream) { - - int *ex_scan, *cur_ex_scan; - MLCommon::allocate(ex_scan, n, true); - MLCommon::allocate(cur_ex_scan, n, true); - - thrust::device_ptr dev_cnnz = thrust::device_pointer_cast( - cnnz); - thrust::device_ptr dev_ex_scan = - thrust::device_pointer_cast(ex_scan); - thrust::exclusive_scan(thrust::cuda::par.on(stream), - dev_cnnz, dev_cnnz + n, dev_ex_scan); - CUDA_CHECK(cudaPeekAtLastError()); - - thrust::device_ptr dev_cur_cnnz = thrust::device_pointer_cast( - cur_cnnz); - thrust::device_ptr dev_cur_ex_scan = - thrust::device_pointer_cast(cur_ex_scan); - thrust::exclusive_scan(thrust::cuda::par.on(stream), - dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan); - CUDA_CHECK(cudaPeekAtLastError()); - - - dim3 grid(ceildiv(n, TPB_X), 1, 1); - dim3 blk(TPB_X, 1, 1); - - coo_remove_scalar_kernel<<>>( - rows, cols, vals, nnz, - crows, ccols, cvals, - dev_ex_scan.get(), dev_cur_ex_scan.get(), n, scalar - ); - CUDA_CHECK(cudaPeekAtLastError()); - - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(cur_ex_scan)); -} - -/** - * @brief Removes the values matching a particular scalar from a COO formatted sparse matrix. 
- * - * @param in: input COO matrix - * @param out: output COO matrix - * @param scalar: scalar to remove from arrays - * @param stream: cuda stream to use - */ -template -void coo_remove_scalar(COO* const in, - COO *out, - T scalar, - cudaStream_t stream) { - - int *row_count_nz, *row_count; - - MLCommon::allocate(row_count, in->n_rows, true); - MLCommon::allocate(row_count_nz, in->n_rows, true); - - MLCommon::Sparse::coo_row_count( - in->rows, in->nnz, row_count, stream); - CUDA_CHECK(cudaPeekAtLastError()); - - MLCommon::Sparse::coo_row_count_scalar( - in->rows, in->vals, in->nnz, scalar, row_count_nz, - stream); - CUDA_CHECK(cudaPeekAtLastError()); - - CUDA_CHECK(cudaStreamSynchronize(stream)); - - thrust::device_ptr d_row_count_nz = - thrust::device_pointer_cast(row_count_nz); - int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), - d_row_count_nz, d_row_count_nz+in->n_rows); - - out->allocate(out_nnz, in->n_rows, in->n_cols); - - coo_remove_scalar( - in->rows, in->cols, in->vals, in->nnz, - out->rows, out->cols, out->vals, - row_count_nz, row_count, scalar, in->n_rows, - stream); - CUDA_CHECK(cudaPeekAtLastError()); - - CUDA_CHECK(cudaFree(row_count)); - CUDA_CHECK(cudaFree(row_count_nz)); -} - -/** - * @brief Removes zeros from a COO formatted sparse matrix. - * - * @param in: input COO matrix - * @param out: output COO matrix - * @param stream: cuda stream to use - */ -template -void coo_remove_zeros(COO* const in, - COO *out, - cudaStream_t stream) { - coo_remove_scalar(in, out, T(0.0), stream); -} - -template -__global__ void from_knn_graph_kernel(long* const knn_indices, T* const knn_dists, int m, int k, - int *rows, int *cols, T *vals) { - - int row = (blockIdx.x * TPB_X) + threadIdx.x; - if(row < m) { - - for(int i = 0; i < k; i++) { - rows[row*k+i] = row; - cols[row*k+i] = knn_indices[row*k+i]; - vals[row*k+i] = knn_dists[row*k+i]; - } - } -} - -/** - * @brief Converts a knn graph, defined by index and distance matrices, - * into COO format. 
- * - * @param knn_indices: knn index array - * @param knn_dists: knn distance array - * @param m: number of vertices in graph - * @param k: number of nearest neighbors - * @param rows: output COO row array - * @param cols: output COO col array - * @param vals: output COO val array - */ -template -void from_knn(long* const knn_indices, T* const knn_dists, int m, int k, - int *rows, int *cols, T *vals) { - - dim3 grid(ceildiv(m, 32), 1, 1); - dim3 blk(32, 1, 1); - from_knn_graph_kernel<32, T><<>>( - knn_indices, knn_dists, m, k, rows, cols, vals); -} - -/** - * Converts a knn graph, defined by index and distance matrices, - * into COO format. - */ -template -void from_knn(long* const knn_indices, T* const knn_dists, int m, int k, - COO *out) { - - out->allocate(m*k, m, m); - - from_knn(knn_indices, knn_dists, m, k, - out->rows, out->cols, out->vals); -} - -/** - * @brief Generate the row indices array for a sorted COO matrix - * - * @param rows: COO rows array - * @param nnz: size of COO rows array - * @param row_ind: output row indices array - * @param m: number of rows in dense matrix - * @param stream: cuda stream to use - */ -template -void sorted_coo_to_csr( - T* const rows, int nnz, T *row_ind, int m, - cudaStream_t stream = 0) { - - T *row_counts; - MLCommon::allocate(row_counts, m, true); - - dim3 grid(ceildiv(m, 32), 1, 1); - dim3 blk(32, 1, 1); - - coo_row_count<32>(rows, nnz, row_counts, stream); - - // create csr compressed row index from row counts - thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts); - thrust::device_ptr c_ind_d = thrust::device_pointer_cast(row_ind); - exclusive_scan(thrust::cuda::par.on(stream),row_counts_d, row_counts_d + m, c_ind_d); - - CUDA_CHECK(cudaFree(row_counts)); -} - -/** - * @brief Generate the row indices array for a sorted COO matrix - * - * @param coo: Input COO matrix - * @param row_ind: output row indices array - * @param stream: cuda stream to use - */ -template -void sorted_coo_to_csr(COO* 
const coo, int *row_ind, cudaStream_t stream = 0) { - sorted_coo_to_csr(coo->rows, coo->nnz, row_ind, coo->n_rows, stream); -} - -template< int TPB_X, typename T, typename Lambda> -__global__ void coo_symmetrize_kernel( - int *row_ind, int *rows, int *cols, T *vals, - int *orows, int *ocols, T *ovals, - int n, int cnnz, - Lambda reduction_op) { - - int row = (blockIdx.x * TPB_X) + threadIdx.x; - - if (row < n) { - - int start_idx = row_ind[row]; // each thread processes one row - int stop_idx = MLCommon::Sparse::get_stop_idx(row, n, cnnz, row_ind); - - int nnz = 0; - for (int idx = 0; idx < stop_idx-start_idx; idx++) { - - int out_idx = start_idx*2+nnz; - int row_lookup = cols[idx+start_idx]; - int t_start = row_ind[row_lookup]; // Start at - int t_stop = MLCommon::Sparse::get_stop_idx(row_lookup, n, cnnz, row_ind); - - T transpose = 0.0; - bool found_match = false; - for (int t_idx = t_start; t_idx < t_stop; t_idx++) { - - // If we find a match, let's get out of the loop - if (cols[t_idx] == rows[idx+start_idx] - && rows[t_idx] == cols[idx+start_idx] - && vals[t_idx] != 0.0) { - transpose = vals[t_idx]; - found_match = true; - break; - } - } - - - // if we didn't find an exact match, we need to add - // the transposed value into our current matrix. - if (!found_match && vals[idx] != 0.0) { - orows[out_idx + nnz] = cols[idx+start_idx]; - ocols[out_idx + nnz] = rows[idx+start_idx]; - ovals[out_idx + nnz] = vals[idx+start_idx]; - ++nnz; - } - - T val = vals[idx+start_idx]; - - // Custom reduction op on value and its transpose - T res = reduction_op(rows[idx+start_idx], cols[idx+start_idx], val, transpose); - - if (res != 0.0) { - orows[out_idx + nnz] = rows[idx+start_idx]; - ocols[out_idx + nnz] = cols[idx+start_idx]; - ovals[out_idx + nnz] = T(res); - ++nnz; - } - } - } -} - - -/** - * @brief takes a COO matrix which may not be symmetric and symmetrizes - * it, running a custom reduction function against the each value - * and its transposed value. 
- * - * @param in: Input COO matrix - * @param out: Output symmetrized COO matrix - * @param reduction_op: a custom reduction function - * @param stream: cuda stream to use - */ -template -void coo_symmetrize(COO* const in, - COO *out, - Lambda reduction_op, // two-argument reducer - cudaStream_t stream) { - - dim3 grid(ceildiv(in->n_rows, TPB_X), 1, 1); - dim3 blk(TPB_X, 1, 1); - - ASSERT(!out->validate_mem(), "Expecting unallocated COO for output"); - - int *in_row_ind; - MLCommon::allocate(in_row_ind, in->n_rows); - - sorted_coo_to_csr(in, in_row_ind, stream); - - out->allocate(in->nnz*2, in->n_rows, in->n_cols); - - coo_symmetrize_kernel<<>>( - in_row_ind, in->rows, in->cols, in->vals, - out->rows, out->cols, out->vals, - in->n_rows, in->nnz, reduction_op - ); - CUDA_CHECK(cudaPeekAtLastError()); - -} - -}; -}; diff --git a/ml-prims/src/sparse/csr (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).h b/ml-prims/src/sparse/csr (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).h deleted file mode 100644 index 2ab9a134bd..0000000000 --- a/ml-prims/src/sparse/csr (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).h +++ /dev/null @@ -1,936 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include "cuda_utils.h" - -#include "array/array.h" - -#include -#include - -#include -#include - -#include - - -namespace MLCommon { -namespace Sparse { - -static const float MIN_FLOAT = std::numeric_limits::min(); - -/** - * @brief a container object for sparse CSR formatted matrices - */ -template -class CSR { - -public: - int *row_ind; - int *row_ind_ptr; - T *vals; - int nnz; - int n_rows; - int n_cols; - - /** - * @brief default constructor - */ - CSR(): row_ind(nullptr), row_ind_ptr(nullptr), vals(nullptr), nnz(-1), n_rows(-1), n_cols(-1){} - - /* - * @brief construct a CSR object with arrays - * - * @param row_ind: the array of row_indices - * @param row_ind_ptr: array of row_index_ptr - * @param vals: array of data - * @param nnz: size of data and row_ind_ptr arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - */ - CSR(int* const row_ind, int* const row_ind_ptr, T* const vals, int nnz, int n_rows = -1, int n_cols = -1) { - this->row_ind = row_ind; - this->row_ind_ptr = row_ind_ptr; - this->vals = vals; - this->nnz = nnz; - this->n_rows = n_rows; - this->n_cols = n_cols; - } - /* - * @brief construct an empty allocated CSR given its size - * - * @param nnz: size of data and row_ind_ptr arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - * @param init: initialize arrays to zeros? - */ - - CSR(int nnz, int n_rows = -1, int n_cols = -1, bool init = true): - row_ind(nullptr), row_ind_ptr(nullptr), vals(nullptr), nnz(nnz), - n_rows(n_rows), n_cols(n_cols) { - this->allocate(nnz, n_rows, n_cols, init); - } - - ~CSR() { - this->free(); - } - - /** - * @brief validate size of CSR object is >0 and that - * number of rows/cols of dense matrix are also >0. 
- */ - bool validate_size() { - if(this->nnz < 0 || n_rows < 0 || n_cols < 0) - return false; - return true; - } - - /** - * @brief Return true if underlying arrays have been allocated, false otherwise. - */ - bool validate_mem() { - if(this->row_ind == nullptr || - this->row_ind_ptr == nullptr || - this->vals == nullptr) { - return false; - } - - return true; - } - - /** - * @brief Send human-readable object state to the given output stream - */ - friend std::ostream & operator << (std::ostream &out, const CSR &c) { - out << arr2Str(c->row_ind, c->nnz, "row_ind") << std::endl; - out << arr2Str(c->row_ind_ptr, c->nnz, "cols") << std::endl; - out << arr2Str(c->vals, c->nnz, "vals") << std::endl; - out << c->nnz << std::endl; - } - - /** - * @brief Sets the size of a non-square dense matrix - * @param n_rows: number of rows in dense matrix - * @param n_cols: number of cols in dense matrix - */ - void setSize(int n_rows, int n_cols) { - this->n_rows = n_rows; - this->n_cols = n_cols; - } - - /** - * @brief Sets the size of a square dense matrix - * @param n: number of rows & cols in dense matrix - */ - void setSize(int n) { - this->n_rows = n; - this->n_cols = n; - } - - /** - * @brief Allocate underlying arrays - * @param nnz: sets the size of the underlying arrays - * @param init: should arrays be initialized to zeros? - */ - void allocate(int nnz, bool init = true) { - this->allocate(nnz, -1, init); - } - - /** - * @brief Allocate underlying arrays and the size of the square dense matrix - * @param nnz: sets the size of the underlying arrays - * @param size: number of rows and cols in the square dense matrix - * @param init: should arrays be initialized to zeros? 
- */ - void allocate(int nnz, int size, bool init = true) { - this->allocate(nnz, size, size, init); - } - - /** - * @brief Allocate underlying arrays and the size of the non-square dense matrix - * @param nnz: sets the size of the underlying arrays - * @param n_rows: number of rows in the dense matrix - * @param n_cols: number of cols in the dense matrix - * @param init: should arrays be initialized to zeros? - */ - void allocate(int nnz, int n_rows, int n_cols, bool init = true) { - this->n_rows = n_rows; - this->n_cols = n_cols; - this->nnz = nnz; - MLCommon::allocate(this->row_ind, this->nnz, init); - MLCommon::allocate(this->row_ind_ptr, this->nnz, init); - MLCommon::allocate(this->vals, this->nnz, init); - } - - /** - * @brief Frees the memory from the underlying arrays - */ - void free() { - - try { - if(row_ind != nullptr) - CUDA_CHECK(cudaFree(row_ind)); - - if(row_ind_ptr != nullptr) - CUDA_CHECK(cudaFree(row_ind_ptr)); - - if(vals != nullptr) - CUDA_CHECK(cudaFree(vals)); - - row_ind = nullptr; - row_ind_ptr = nullptr; - vals = nullptr; - - } catch(Exception &e) { - std::cout << "An exception occurred freeing COO memory" << std::endl; - } - } - -}; - -template -__global__ void csr_row_normalize_l1_kernel( - int *ia, // csr row ex_scan (sorted by row) - T *vals, int nnz, // array of values and number of non-zeros - int m, // num rows in csr - T *result) { // output array - - // row-based matrix 1 thread per row - int row = (blockIdx.x * TPB_X) + threadIdx.x; - - // sum all vals for row and divide each val by sum - if(row < m) { - int start_idx = ia[row]; - int stop_idx = 0; - if(row < m-1) { - stop_idx = ia[row+1]; - } else - stop_idx = nnz; - - T sum = T(0.0); - for(int j = start_idx; j < stop_idx; j++) { - sum = sum + vals[j]; - } - - for(int j = start_idx; j < stop_idx; j++) { - if(sum != 0.0) { - T val = vals[j]; - result[j] = val / sum; - } - else { - result[j] = 0.0; - } - } - } -} - -/** - * @brief Perform L1 normalization on the rows of a given 
CSR-formatted sparse matrix - * - * @param ia: row_ind array - * @param vals: data array - * @param nnz: size of data array - * @param m: size of row_ind array - * @param result: l1 normalized data array - * @param stream: cuda stream to use - */ -template -void csr_row_normalize_l1( - int* const ia, // csr row ex_scan (sorted by row) - T* const vals, int nnz, // array of values and number of non-zeros - int m, // num rows in csr - T *result, - cudaStream_t stream) { // output array - - dim3 grid(MLCommon::ceildiv(m, TPB_X), 1, 1); - dim3 blk(TPB_X, 1, 1); - - csr_row_normalize_l1_kernel<<>>(ia, vals, nnz, - m, result); -} - -template -__global__ void csr_row_normalize_max_kernel( - int *ia, // csr row ind array (sorted by row) - T *vals, int nnz, // array of values and number of non-zeros - int m, // num total rows in csr - T *result) { // output array - - // row-based matrix 1 thread per row - int row = (blockIdx.x * TPB_X) + threadIdx.x; - - // sum all vals for row and divide each val by sum - if(row < m) { - int start_idx = ia[row]; - int stop_idx = 0; - if(row < m-1) { - stop_idx = ia[row+1]; - } else - stop_idx = nnz; - - T max = MIN_FLOAT; // todo: Make this min possible T - for(int j = start_idx; j < stop_idx; j++) { - if(vals[j] > max) - max = vals[j]; - } - - for(int j = start_idx; j < stop_idx; j++) { - if(max != 0.0) { - T val = vals[j]; - result[j] = val / max; - } - else { - result[j] = 0.0; - } - } - } -} - - -/** - * @brief Perform L_inf normalization on a given CSR-formatted sparse matrix - * - * @param ia: row_ind array - * @param vals: data array - * @param nnz: size of data array - * @param m: size of row_ind array - * @param result: l1 normalized data array - * @param stream: cuda stream to use - */ - -template -void csr_row_normalize_max( - int* const ia, // csr row ind array (sorted by row) - T* const vals, int nnz, // array of values and number of non-zeros - int m, // num total rows in csr - T *result, - cudaStream_t stream) { - - dim3 
grid(MLCommon::ceildiv(m, TPB_X), 1, 1); - dim3 blk(TPB_X, 1, 1); - - csr_row_normalize_max_kernel<<>>(ia, vals, nnz, - m, result); - -} - -template -__device__ int get_stop_idx(T row, int m, int nnz, T *ind) { - int stop_idx = 0; - if(row < (m-1)) - stop_idx = ind[row+1]; - else - stop_idx = nnz; - - return stop_idx; -} - -template -__global__ void csr_to_coo_kernel(int *row_ind, int m, int *coo_rows, int nnz) { - - // row-based matrix 1 thread per row - int row = (blockIdx.x * TPB_X) + threadIdx.x; - if(row < m) { - int start_idx = row_ind[row]; - int stop_idx = get_stop_idx(row, m, nnz, row_ind); - for(int i = start_idx; i < stop_idx; i++) - coo_rows[i] = row; - } -} - -/** - * @brief Convert a CSR row_ind array to a COO rows array - * @param row_ind: Input CSR row_ind array - * @param m: size of row_ind array - * @param coo_rows: Output COO row array - * @param nnz: size of output COO row array - * @param stream: cuda stream to use - */ -template -void csr_to_coo(int *row_ind, int m, int *coo_rows, int nnz, - cudaStream_t stream) { - dim3 grid(MLCommon::ceildiv(m, TPB_X), 1, 1); - dim3 blk(TPB_X, 1, 1); - - csr_to_coo_kernel<<>>(row_ind, m, coo_rows, nnz); -} - - -template -__global__ void csr_add_calc_row_counts_kernel( - int *a_ind, int *a_indptr, T *a_val, int nnz1, - int *b_ind, int *b_indptr, T *b_val, int nnz2, - int m, int *out_rowcounts) { - - // loop through columns in each set of rows and - // calculate number of unique cols across both rows - int row = (blockIdx.x * TPB_X) + threadIdx.x; - - if(row < m) { - int a_start_idx = a_ind[row]; - int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); - - int b_start_idx = b_ind[row]; - int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); - - /** - * Union of columns within each row of A and B so that we can scan through - * them, adding their values together. 
- */ - int max_size = (a_stop_idx - a_start_idx) + - (b_stop_idx - b_start_idx); - - int *arr = new int[max_size]; - int cur_arr_idx = 0; - for(int j = a_start_idx; j < a_stop_idx; j++) { - arr[cur_arr_idx] = a_indptr[j]; - cur_arr_idx++; - } - - int arr_size = cur_arr_idx; - int final_size = arr_size; - - for(int j = b_start_idx; j < b_stop_idx; j++) { - - int cur_col = b_indptr[j]; - bool found = false; - for(int k = 0; k < arr_size; k++) { - if(arr[k] == cur_col) { - found = true; - break; - } - } - - if(!found) { - final_size++; - } - } - - out_rowcounts[row] = final_size; - atomicAdd(out_rowcounts+m, final_size); - - delete arr; - } -} - - -template -__global__ void csr_add_kernel( - int *a_ind, int *a_indptr, T *a_val, int nnz1, - int *b_ind, int *b_indptr, T *b_val, int nnz2, - int m, - int *out_ind, int *out_indptr, T *out_val) { - - // 1 thread per row - int row = (blockIdx.x * TPB_X) + threadIdx.x; - - - if(row < m) { - int a_start_idx = a_ind[row]; - int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind); - - int b_start_idx = b_ind[row]; - int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind); - - int o_idx = out_ind[row]; - - int cur_o_idx = o_idx; - for(int j = a_start_idx; j < a_stop_idx; j++) { - out_indptr[cur_o_idx] = a_indptr[j]; - out_val[cur_o_idx] = a_val[j]; - cur_o_idx++; - } - - int arr_size = cur_o_idx-o_idx; - for(int j = b_start_idx; j < b_stop_idx; j++) { - int cur_col = b_indptr[j]; - bool found = false; - for(int k = o_idx; k < o_idx+arr_size; k++) { - // If we found a match, sum the two values - if(out_indptr[k] == cur_col) { - out_val[k] += b_val[j]; - found = true; - break; - } - } - - // if we didn't find a match, add the value for b - if(!found) { - out_indptr[o_idx+arr_size] = cur_col; - out_val[o_idx+arr_size] = b_val[j]; - arr_size++; - } - } - } -} - -/** - * @brief Calculate the CSR row_ind array that would result - * from summing together two CSR matrices - * @param a_ind: left hand row_ind array - * @param a_indptr: left hand 
index_ptr array - * @param a_val: left hand data array - * @param nnz1: size of left hand index_ptr and val arrays - * @param b_ind: right hand row_ind array - * @param b_indptr: right hand index_ptr array - * @param b_val: right hand data array - * @param nnz2: size of right hand index_ptr and val arrays - * @param m: size of output array (number of rows in final matrix) - * @param out_ind: output row_ind array - * @param stream: cuda stream to use - */ -template -size_t csr_add_calc_inds( - int* const a_ind, int* const a_indptr, T* const a_val, int nnz1, - int* const b_ind, int* const b_indptr, T* const b_val, int nnz2, - int m, int *out_ind, - cudaStream_t stream -) { - dim3 grid(ceildiv(m, TPB_X), 1, 1); - dim3 blk(TPB_X, 1, 1); - - int *row_counts; - MLCommon::allocate(row_counts, m+1, true); - - csr_add_calc_row_counts_kernel<<>>( - a_ind, a_indptr, a_val, nnz1, - b_ind, b_indptr, b_val, nnz2, - m, row_counts - ); - CUDA_CHECK(cudaPeekAtLastError()); - - int cnnz = 0; - MLCommon::updateHost(&cnnz, row_counts+m, 1, stream); - - // create csr compressed row index from row counts - thrust::device_ptr row_counts_d = thrust::device_pointer_cast(row_counts); - thrust::device_ptr c_ind_d = thrust::device_pointer_cast(out_ind); - exclusive_scan(thrust::cuda::par.on(stream),row_counts_d, row_counts_d + m, c_ind_d); - CUDA_CHECK(cudaFree(row_counts)); - - return cnnz; - -} - -/** - * @brief Calculate the CSR row_ind array that would result - * from summing together two CSR matrices - * @param a_ind: left hand row_ind array - * @param a_indptr: left hand index_ptr array - * @param a_val: left hand data array - * @param nnz1: size of left hand index_ptr and val arrays - * @param b_ind: right hand row_ind array - * @param b_indptr: right hand index_ptr array - * @param b_val: right hand data array - * @param nnz2: size of right hand index_ptr and val arrays - * @param m: size of output array (number of rows in final matrix) - * @param c_ind: output row_ind array - * 
@param c_indptr: output ind_ptr array - * @param c_val: output data array - * @param stream: cuda stream to use - */ -template -void csr_add_finalize( - int* const a_ind, int* const a_indptr, T* const a_val, int nnz1, - int* const b_ind, int* const b_indptr, T* const b_val, int nnz2, - int m, int* const c_ind, int *c_indptr, T *c_val, - cudaStream_t stream -) { - dim3 grid(MLCommon::ceildiv(m, TPB_X), 1, 1); - dim3 blk(TPB_X, 1, 1); - - csr_add_kernel<<>>( - a_ind, a_indptr, a_val, nnz1, - b_ind, b_indptr, b_val, nnz2, - m, c_ind, c_indptr, c_val - ); - CUDA_CHECK(cudaPeekAtLastError()); -} - -template void> -__global__ void csr_row_op_kernel(T* const row_ind, T n_rows, - T nnz, Lambda op) { - T row = blockIdx.x*TPB_X + threadIdx.x; - if(row < n_rows) { - T start_idx = row_ind[row]; - T stop_idx = row < n_rows-1 ? row_ind[row+1] : nnz; - op(row, start_idx, stop_idx); - } -} - -/** - * @brief Perform a custom row operation on a CSR matrix in batches. - * @tparam T numerical type of row_ind array - * @tparam TPB_X number of threads per block to use for underlying kernel - * @tparam Lambda type of custom operation function - * @param row_ind the CSR row_ind array to perform parallel operations over - * @param total_rows total number vertices in graph - * @param batchSize size of row_ind - * @param op custom row operation functor accepting the row and beginning index. - * @param stream cuda stream to use - */ -templatevoid> -void csr_row_op(T* const row_ind, T n_rows, T nnz, - Lambda op, cudaStream_t stream) { - - dim3 grid(MLCommon::ceildiv(n_rows, TPB_X), 1, 1); - dim3 blk(TPB_X, 1, 1); - csr_row_op_kernel<<>> - (row_ind, n_rows, nnz, op); - - CUDA_CHECK(cudaPeekAtLastError()); -} - -/** - * @brief Constructs an adjacency graph CSR row_ind_ptr array from - * a row_ind array and adjacency array. 
- * @tparam T the numeric type of the index arrays - * @tparam TPB_X the number of threads to use per block for kernels - * @tparam Lambda function for fused operation in the adj_graph construction - * @param row_ind the input CSR row_ind array - * @param total_rows number of vertices in graph - * @param batchSize number of vertices in current batch - * @param adj an adjacency array (size batchSize * total_rows) - * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph - * @param stream cuda stream to use - */ -templatevoid> -void csr_adj_graph_batched(T* const row_ind, T total_rows, T nnz, T batchSize, - bool* const adj, T *row_ind_ptr, cudaStream_t stream, Lambda fused_op) { - csr_row_op(row_ind, batchSize, nnz, - [fused_op, adj, total_rows, row_ind_ptr, batchSize] __device__ - (T row, T start_idx, T stop_idx) { - - fused_op(row, start_idx, stop_idx); - int k = 0; - for(T i=0; ivoid> -void csr_adj_graph_batched(T* const row_ind, T total_rows, T nnz, T batchSize, - bool* const adj, T *row_ind_ptr, cudaStream_t stream) { - csr_adj_graph_batched(row_ind, total_rows, nnz, batchSize, adj, - row_ind_ptr, stream, [] __device__ (T row, T start_idx, T stop_idx) {}); -} - -/** - * @brief Constructs an adjacency graph CSR row_ind_ptr array from a - * a row_ind array and adjacency array. 
- * @tparam T the numeric type of the index arrays - * @tparam TPB_X the number of threads to use per block for kernels - * @param row_ind the input CSR row_ind array - * @param n_rows number of total vertices in graph - * @param adj an adjacency array - * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph - * @param stream cuda stream to use - */ -templatevoid> -void csr_adj_graph(T* const row_ind, T total_rows, T nnz, - bool* const adj, T *row_ind_ptr, cudaStream_t stream, Lambda fused_op) { - - csr_adj_graph_batched(row_ind, total_rows, nnz, total_rows, adj, - row_ind_ptr, stream, fused_op); -} - -template -class WeakCCState { - public: - - bool *xa; - bool *fa; - bool *m; - bool owner; - - WeakCCState(T n): owner(true) { - MLCommon::allocate(xa, n, true); - MLCommon::allocate(fa, n, true); - MLCommon::allocate(m, 1, true); - } - - WeakCCState(bool *xa, bool *fa, bool *m): - owner(false), xa(xa), fa(fa), m(m) { - } - - ~WeakCCState() { - if(owner) { - try { - CUDA_CHECK(cudaFree(xa)); - CUDA_CHECK(cudaFree(fa)); - CUDA_CHECK(cudaFree(m)); - } catch(Exception &e) { - std::cout << "Exception freeing memory for WeakCCState: " << - e.what() << std::endl; - } - } - } -}; - -template -__global__ void weak_cc_label_device( - Type *labels, - Type *row_ind, Type *row_ind_ptr, Type nnz, - bool *fa, bool *xa, bool *m, - int startVertexId, int batchSize) { - int tid = threadIdx.x + blockIdx.x*TPB_X; - if(tidcj) { - ci = cj; - ci_mod = true; - } - } - if(ci_mod) { - atomicMin(labels + startVertexId + tid, ci); - xa[startVertexId + tid] = true; - m[0] = true; - } - } - } -} - - -template -__global__ void weak_cc_init_label_kernel(Type *labels, int startVertexId, int batchSize, - Type MAX_LABEL, Lambda filter_op) { - /** F1 and F2 in the paper correspond to fa and xa */ - /** Cd in paper corresponds to db_cluster */ - int tid = threadIdx.x + blockIdx.x*TPB_X; - if(tid -__global__ void weak_cc_init_all_kernel(Type *labels, bool *fa, bool *xa, - Type N, Type 
MAX_LABEL) { - int tid = threadIdx.x + blockIdx.x*TPB_X; - if(tid -void weak_cc_label_batched(Type *labels, - Type* const row_ind, Type* const row_ind_ptr, Type nnz, Type N, - WeakCCState *state, - Type startVertexId, Type batchSize, - cudaStream_t stream, Lambda filter_op) { - bool host_m; - bool *host_fa = (bool*)malloc(sizeof(bool)*N); - bool *host_xa = (bool*)malloc(sizeof(bool)*N); - - dim3 blocks(ceildiv(batchSize, TPB_X)); - dim3 threads(TPB_X); - Type MAX_LABEL = std::numeric_limits::max(); - - weak_cc_init_label_kernel<<>>(labels, - startVertexId, batchSize, MAX_LABEL, filter_op); - CUDA_CHECK(cudaPeekAtLastError()); - do { - CUDA_CHECK( cudaMemsetAsync(state->m, false, sizeof(bool), stream) ); - weak_cc_label_device<<>>( - labels, - row_ind, row_ind_ptr, nnz, - state->fa, state->xa, state->m, - startVertexId, batchSize); - CUDA_CHECK(cudaPeekAtLastError()); - - //** swapping F1 and F2 - MLCommon::updateHost(host_fa, state->fa, N, stream); - MLCommon::updateHost(host_xa, state->xa, N, stream); - MLCommon::updateDevice(state->fa, host_xa, N, stream); - MLCommon::updateDevice(state->xa, host_fa, N, stream); - - //** Updating m * - MLCommon::updateHost(&host_m, state->m, 1, stream); - CUDA_CHECK(cudaStreamSynchronize(stream)); - } while(host_m); -} - -/** - * @brief Compute weakly connected components. Note that the resulting labels - * may not be taken from a monotonically increasing set (eg. numbers may be - * skipped). The MLCommon::Array package contains a primitive `make_monotonic`, - * which will make a monotonically increasing set of labels. - * - * This implementation comes from [1] and solves component labeling problem in - * parallel on CSR-indexes based upon the vertex degree and adjacency graph. - * - * [1] Hawick, K.A et al, 2010. 
"Parallel graph component labelling with GPUs and CUDA" - * - * @tparam Type the numeric type of non-floating point elements - * @tparam TPB_X the threads to use per block when configuring the kernel - * @tparam Lambda the type of an optional filter function (int)->bool - * @param labels an array for the output labels - * @param row_ind the compressed row index of the CSR array - * @param row_ind_ptr the row index pointer of the CSR array - * @param nnz the size of row_ind_ptr array - * @param N number of vertices - * @param startVertexId the starting vertex index for the current batch - * @param batchSize number of vertices for current batch - * @param state instance of inter-batch state management - * @param stream the cuda stream to use - * @param filter_op an optional filtering function to determine which points - * should get considered for labeling. - */ -templatebool> -void weak_cc_batched(Type *labels, Type* const row_ind, Type* const row_ind_ptr, - Type nnz, Type N, Type startVertexId, Type batchSize, - WeakCCState *state, cudaStream_t stream, Lambda filter_op) { - - dim3 blocks(ceildiv(N, TPB_X)); - dim3 threads(TPB_X); - - Type MAX_LABEL = std::numeric_limits::max(); - if(startVertexId == 0) { - weak_cc_init_all_kernel<<>> - (labels, state->fa, state->xa, N, MAX_LABEL); - CUDA_CHECK(cudaPeekAtLastError()); - } - weak_cc_label_batched(labels, row_ind, row_ind_ptr, nnz, N, state, - startVertexId, batchSize, stream, filter_op); -} - -/** - * @brief Compute weakly connected components. Note that the resulting labels - * may not be taken from a monotonically increasing set (eg. numbers may be - * skipped). The MLCommon::Array package contains a primitive `make_monotonic`, - * which will make a monotonically increasing set of labels. - * - * This implementation comes from [1] and solves component labeling problem in - * parallel on CSR-indexes based upon the vertex degree and adjacency graph. - * - * [1] Hawick, K.A et al, 2010. 
"Parallel graph component labelling with GPUs and CUDA" - * - * @tparam Type the numeric type of non-floating point elements - * @tparam TPB_X the threads to use per block when configuring the kernel - * @tparam Lambda the type of an optional filter function (int)->bool - * @param labels an array for the output labels - * @param row_ind the compressed row index of the CSR array - * @param row_ind_ptr the row index pointer of the CSR array - * @param nnz the size of row_ind_ptr array - * @param N number of vertices - * @param startVertexId the starting vertex index for the current batch - * @param batchSize number of vertices for current batch - * @param state instance of inter-batch state management - * @param stream the cuda stream to use - */ -template -void weak_cc_batched(Type *labels, Type* const row_ind, Type* const row_ind_ptr, - Type nnz, Type N, Type startVertexId, Type batchSize, - WeakCCState *state, cudaStream_t stream) { - - weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, startVertexId, batchSize, - state, stream, [] __device__ (int tid) {return true;}); -} - -/** - * @brief Compute weakly connected components. Note that the resulting labels - * may not be taken from a monotonically increasing set (eg. numbers may be - * skipped). The MLCommon::Array package contains a primitive `make_monotonic`, - * which will make a monotonically increasing set of labels. - * - * This implementation comes from [1] and solves component labeling problem in - * parallel on CSR-indexes based upon the vertex degree and adjacency graph. - * - * [1] Hawick, K.A et al, 2010. 
"Parallel graph component labelling with GPUs and CUDA" - * - * @tparam Type the numeric type of non-floating point elements - * @tparam TPB_X the threads to use per block when configuring the kernel - * @tparam Lambda the type of an optional filter function (int)->bool - * @param labels an array for the output labels - * @param row_ind the compressed row index of the CSR array - * @param row_ind_ptr the row index pointer of the CSR array - * @param nnz the size of row_ind_ptr array - * @param N number of vertices - * @param stream the cuda stream to use - * @param filter_op an optional filtering function to determine which points - * should get considered for labeling. - */ -templatebool> -void weak_cc(Type *labels, Type* const row_ind, Type* const row_ind_ptr, - Type nnz, Type N, cudaStream_t stream, Lambda filter_op) { - - WeakCCState state(N); - weak_cc_batched( - labels, row_ind, row_ind_ptr, - nnz, N, 0, N, stream, - filter_op); -} - -/** - * @brief Compute weakly connected components. Note that the resulting labels - * may not be taken from a monotonically increasing set (eg. numbers may be - * skipped). The MLCommon::Array package contains a primitive `make_monotonic`, - * which will make a monotonically increasing set of labels. - * - * This implementation comes from [1] and solves component labeling problem in - * parallel on CSR-indexes based upon the vertex degree and adjacency graph. - * - * [1] Hawick, K.A et al, 2010. 
"Parallel graph component labelling with GPUs and CUDA" - * - * @tparam Type the numeric type of non-floating point elements - * @tparam TPB_X the threads to use per block when configuring the kernel - * @tparam Lambda the type of an optional filter function (int)->bool - * @param labels an array for the output labels - * @param row_ind the compressed row index of the CSR array - * @param row_ind_ptr the row index pointer of the CSR array - * @param nnz the size of row_ind_ptr array - * @param N number of vertices - * @param stream the cuda stream to use - * should get considered for labeling. - */ -template -void weak_cc(Type *labels, Type* const row_ind, Type* const row_ind_ptr, - Type nnz, Type N, cudaStream_t stream) { - - WeakCCState state(N); - weak_cc_batched( - labels, row_ind, row_ind_ptr, - nnz, N, 0, N, stream, - [](Type t){return true;}); -} - - - -}; -}; diff --git a/ml-prims/test/CMakeLists (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).txt b/ml-prims/test/CMakeLists (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).txt deleted file mode 100644 index d46cf06ff5..0000000000 --- a/ml-prims/test/CMakeLists (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).txt +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2018-2019, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -cmake_minimum_required(VERSION 3.8 FATAL_ERROR) -project(mlcommon_test LANGUAGES CXX CUDA) - -include_directories(${GTEST_DIR}/googletest/include) - -# single gpu unit-tests -# (please keep the filenames in alphabetical order) -add_executable(mlcommon_test - add.cu - array.cu - binary_op.cu - ternary_op.cu - coalesced_reduction.cu - cuda_utils.cu - columnSort.cu - contingencyMatrix.cu - coo.cu - cov.cu - csr.cu - decoupled_lookback.cu - dist_adj.cu - dist_cos.cu - dist_eps.cu - dist_euc_exp.cu - dist_euc_unexp.cu - dist_l1.cu - divide.cu - eig.cu - eltwise.cu - eltwise2d.cu - gather.cu - gemm.cu - grid_sync.cu - hinge.cu - kselection.cu - linearReg.cu - log.cu - logisticReg.cu - map_then_reduce.cu - math.cu - matrix.cu - matrix_vector_op.cu - mean.cu - mean_center.cu - minmax.cu - mvg.cu - multiply.cu - norm.cu - penalty.cu - permute.cu - power.cu - reduce.cu - reduce_rows_by_key.cu - reverse.cu - rng.cu - rng_int.cu - rsvd.cu - score.cu - sigmoid.cu - sqrt.cu - stddev.cu - strided_reduction.cu - subtract.cu - sum.cu - svd.cu - transpose.cu - unary_op.cu - weighted_mean.cu - ) - -target_link_libraries(mlcommon_test - ${GTEST_LIBNAME} - ${MLPRIMS_LIBS}) diff --git a/ml-prims/test/coo (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu b/ml-prims/test/coo (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu deleted file mode 100644 index a5757b6aa4..0000000000 --- a/ml-prims/test/coo (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu +++ /dev/null @@ -1,305 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "coo.h" -#include -#include "sparse/coo.h" -#include "random/rng.h" -#include "test_utils.h" - -#include - -namespace MLCommon { -namespace Sparse { - -template -class COOTest : public ::testing::TestWithParam> { -protected: - void SetUp() override {} - - void TearDown() override {} - -protected: - COOInputs params; -}; - -const std::vector> inputsf = { - {5, 10, 5, 1234ULL}}; - - -typedef COOTest SortedCOOToCSR; -TEST_P(SortedCOOToCSR, Result) { - - cudaStream_t stream; - cudaStreamCreate(&stream); - - int nnz = 8; - - int *in, *out, *exp; - - int *in_h = new int[nnz] { 0, 0, 1, 1, 2, 2, 3, 3 }; - int *exp_h = new int[4] {0, 2, 4, 6 }; - - allocate(in, nnz, true); - allocate(exp, 4, true); - allocate(out, 4, true); - - updateDevice(in, in_h, nnz, stream); - updateDevice(exp, exp_h, 4, stream); - - sorted_coo_to_csr(in, nnz, out, 4, stream); - - ASSERT_TRUE(devArrMatch(out, exp, 4, Compare())); - - cudaStreamDestroy(stream); - - delete in_h; - delete exp_h; - - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(exp)); - CUDA_CHECK(cudaFree(out)); - -} - - -typedef COOTest COOSymmetrize; -TEST_P(COOSymmetrize, Result) { - - cudaStream_t stream; - cudaStreamCreate(&stream); - - int nnz = 8; - - int *in_rows_h = new int[nnz] { 0, 0, 1, 1, 2, 2, 3, 3 }; - int *in_cols_h = new int[nnz] { 1, 3, 2, 3, 0, 1, 0, 2 }; - float *in_vals_h = new float[nnz]{ 0.5, 0.5, 0.5, 0.5, 0.5, 0, 0.5, 0.5 }; - - int *exp_rows_h = new int[nnz*2] {1, 0, 0, 0, 0, 1, 0, 0, 3, 1, 0, 0, 2, 0, 2, 3 }; - int *exp_cols_h = new int[nnz*2] {0, 1, 0, 0, 3, 2, 0, 0, 1, 3, 
0, 0, 1, 0, 3, 2 }; - float *exp_vals_h = new float[nnz*2] {0.5, 0.5, 0, 0, 1, 0.5, 0, 0, 0.5, 0.5, 0, 0, 0.5, 0, 0.5, 0.5 }; - - COO expected(exp_rows_h, exp_cols_h, exp_vals_h, nnz*2, 4, 4, false); - - COO in(nnz, 4, 4); - updateDevice(in.rows, *&in_rows_h, nnz, stream); - updateDevice(in.cols, *&in_cols_h, nnz, stream); - updateDevice(in.vals, *&in_vals_h, nnz, stream); - - COO out; - - coo_symmetrize<32, float>(&in, &out, - [] __device__ (int row, int col, float val, float trans) { return val+trans; }, - stream); - - ASSERT_TRUE(out.nnz == expected.nnz); - ASSERT_TRUE(devArrMatch(out.rows, expected.rows, out.nnz, Compare())); - ASSERT_TRUE(devArrMatch(out.cols, expected.cols, out.nnz, Compare())); - ASSERT_TRUE(devArrMatch(out.vals, expected.vals, out.nnz, Compare())); - - cudaStreamDestroy(stream); - - delete in_rows_h; - delete in_cols_h; - delete in_vals_h; - - delete exp_rows_h; - delete exp_cols_h; - delete exp_vals_h; -} - - - -typedef COOTest COOSort; -TEST_P(COOSort, Result) { - - int *in_rows, *in_cols, *verify; - float *in_vals; - - params = ::testing::TestWithParam>::GetParam(); - Random::Rng r(params.seed); - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - allocate(in_vals, params.nnz); - r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream); - - int *in_rows_h = (int*)malloc(params.nnz * sizeof(int)); - int *in_cols_h = (int*)malloc(params.nnz * sizeof(int)); - int *verify_h = (int*)malloc(params.nnz * sizeof(int)); - - for(int i = 0; i < params.nnz; i++) { - in_rows_h[i] = params.nnz-i-1; - verify_h[i] = i; - in_cols_h[i] = i; - } - - allocate(in_rows, params.nnz); - allocate(in_cols, params.nnz); - allocate(verify, params.nnz); - - updateDevice(in_rows, in_rows_h, params.nnz, stream); - - updateDevice(in_cols, in_cols_h, params.nnz, stream); - updateDevice(verify, verify_h, params.nnz, stream); - - coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals); - - ASSERT_TRUE(devArrMatch(verify, in_rows, 
params.nnz, Compare())); - - free(in_rows_h); - free(in_cols_h); - free(verify_h); - - CUDA_CHECK(cudaFree(in_rows)); - CUDA_CHECK(cudaFree(in_cols)); - CUDA_CHECK(cudaFree(in_vals)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaStreamDestroy(stream)); -} - -typedef COOTest COORemoveZeros; -TEST_P(COORemoveZeros, Result) { - - cudaStream_t stream; - cudaStreamCreate(&stream); - - COO in_h(params.nnz, 5, 5, false); - COO in(params.nnz, 5, 5); - - params = ::testing::TestWithParam>::GetParam(); - - Random::Rng r(params.seed); - r.uniform(in.vals, params.nnz, float(-1.0), float(1.0), stream); - - updateHost(in_h.vals, in.vals, params.nnz, stream); - - in_h.vals[0] = 0; - in_h.vals[2] = 0; - in_h.vals[3] = 0; - - for(int i = 0; i < params.nnz; i++) { - in_h.rows[i] = params.nnz-i-1; - in_h.cols[i] = i; - } - - updateDevice(in.rows, in_h.rows, params.nnz, stream); - updateDevice(in.cols, in_h.cols, params.nnz, stream); - updateDevice(in.vals, in_h.vals, params.nnz, stream); - - coo_sort(&in); - - int out_rows_ref_h[2] = { 0, 3 }; - int out_cols_ref_h[2] = { 4, 1 }; - - float *out_vals_ref_h = (float*)malloc(2*sizeof(float)); - out_vals_ref_h[0] = in_h.vals[4]; - out_vals_ref_h[1] = in_h.vals[1]; - - COO out_ref(2, 5, 5); - COO out; - - updateDevice(out_ref.rows, *&out_rows_ref_h, 2, stream); - updateDevice(out_ref.cols, *&out_cols_ref_h, 2, stream); - updateDevice(out_ref.vals, out_vals_ref_h, 2, stream); - - coo_remove_zeros<32, float>(&in, &out, stream); - - ASSERT_TRUE(devArrMatch(out_ref.rows, out.rows, 2, Compare())); - ASSERT_TRUE(devArrMatch(out_ref.cols, out.cols, 2, Compare())); - ASSERT_TRUE(devArrMatch(out_ref.vals, out.vals, 2, Compare())); - - CUDA_CHECK(cudaStreamDestroy(stream)); - free(out_vals_ref_h); -} - - -typedef COOTest COORowCount; -TEST_P(COORowCount, Result) { - - int *in_rows,*verify, *results; - - int in_rows_h[5] = { 0, 0, 1, 2, 2 }; - int verify_h[5] = {2, 1, 2, 0, 0}; - - allocate(in_rows, 5); - allocate(verify, 5, true); - 
allocate(results, 5, true); - - updateDevice(in_rows, *&in_rows_h, 5, 0); - updateDevice(verify, *&verify_h, 5, 0); - - dim3 grid(ceildiv(5, 32), 1, 1); - dim3 blk(32, 1, 1); - coo_row_count<32>(in_rows, 5, results, 0); - cudaDeviceSynchronize(); - - ASSERT_TRUE(devArrMatch(verify, results, 5, Compare())); - - CUDA_CHECK(cudaFree(in_rows)); - CUDA_CHECK(cudaFree(verify)); -} - -typedef COOTest COORowCountNonzero; -TEST_P(COORowCountNonzero, Result) { - - int *in_rows,*verify, *results; - float *in_vals; - - int in_rows_h[5] = { 0, 0, 1, 2, 2 }; - float in_vals_h[5] = { 0.0, 5.0, 0.0, 1.0, 1.0 }; - int verify_h[5] = {1, 0, 2, 0, 0}; - - allocate(in_rows, 5); - allocate(verify, 5, true); - allocate(results, 5, true); - allocate(in_vals, 5, true); - - updateDevice(in_rows, *&in_rows_h, 5, 0); - updateDevice(verify, *&verify_h, 5, 0); - updateDevice(in_vals, *&in_vals_h, 5, 0); - - dim3 grid(ceildiv(5, 32), 1, 1); - dim3 blk(32, 1, 1); - coo_row_count_nz<32, float>(in_rows, in_vals, 5, results); - cudaDeviceSynchronize(); - - ASSERT_TRUE(devArrMatch(verify, results, 5, Compare())); - - CUDA_CHECK(cudaFree(in_rows)); - CUDA_CHECK(cudaFree(verify)); -} - -INSTANTIATE_TEST_CASE_P(COOTests, SortedCOOToCSR, - ::testing::ValuesIn(inputsf)); - - -INSTANTIATE_TEST_CASE_P(COOTests, COOSort, - ::testing::ValuesIn(inputsf)); - -INSTANTIATE_TEST_CASE_P(COOTests, COORemoveZeros, - ::testing::ValuesIn(inputsf)); - -INSTANTIATE_TEST_CASE_P(COOTests, COORowCount, - ::testing::ValuesIn(inputsf)); - -INSTANTIATE_TEST_CASE_P(COOTests, COORowCountNonzero, - ::testing::ValuesIn(inputsf)); - -INSTANTIATE_TEST_CASE_P(COOTests, COOSymmetrize, - ::testing::ValuesIn(inputsf)); - -} -} diff --git a/ml-prims/test/csr (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu b/ml-prims/test/csr (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu deleted file mode 100644 index 72f24f29e6..0000000000 --- a/ml-prims/test/csr (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu 
+++ /dev/null @@ -1,366 +0,0 @@ -/* - * Copyright (c) 2019, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "csr.h" -#include -#include "sparse/csr.h" - -#include "random/rng.h" -#include "test_utils.h" - -#include -#include - -namespace MLCommon { -namespace Sparse { - -template -class CSRTest : public ::testing::TestWithParam> { -protected: - void SetUp() override {} - - void TearDown() override {} - -protected: - CSRInputs params; -}; - -const std::vector> inputsf = { - {5, 10, 5, 1234ULL}}; - -typedef CSRTest CSRToCOO; -TEST_P(CSRToCOO, Result) { - - cudaStream_t stream; - cudaStreamCreate(&stream); - - int *ex_scan; - int *result, *verify; - - int *ex_scan_h = new int[4]{0, 4, 8, 9 }; - int *verify_h = new int[10]{ 0, 0, 0, 0, 1, 1, 1, 1, 2, 3 }; - - allocate(verify, 10); - allocate(ex_scan, 4); - allocate(result, 10, true); - - updateDevice(ex_scan, ex_scan_h, 4, stream); - updateDevice(verify, verify_h, 10, stream); - - csr_to_coo<32>(ex_scan, 4, result, 10, stream); - - ASSERT_TRUE(devArrMatch(verify, result, 10, Compare(), stream)); - - delete ex_scan_h; - delete verify_h; - - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); - - cudaStreamDestroy(stream); - -} - - -typedef CSRTest CSRRowNormalizeMax; -TEST_P(CSRRowNormalizeMax, Result) { - - cudaStream_t stream; - cudaStreamCreate(&stream); - - int *ex_scan; - float *in_vals, *result, *verify; - - int ex_scan_h[4] = 
{0, 4, 8, 9 }; - float in_vals_h[10] = { 5.0, 1.0, 0.0, 0.0, 10.0, 1.0, 0.0, 0.0, 1.0, 0.0 }; - - float verify_h[10] = { 1.0, 0.2, 0.0, 0.0, 1.0, 0.1, 0.0, 0.0, 1, 0.0 }; - - allocate(in_vals, 10); - allocate(verify, 10); - allocate(ex_scan, 4); - allocate(result, 10, true); - - updateDevice(ex_scan, *&ex_scan_h, 4, stream); - updateDevice(in_vals, *&in_vals_h, 10, stream); - updateDevice(verify, *&verify_h, 10, stream); - - csr_row_normalize_max<32, float>(ex_scan, in_vals, 10, 4, result, stream); - - ASSERT_TRUE(devArrMatch(verify, result, 10, Compare())); - - cudaStreamDestroy(stream); - - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(in_vals)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); -} - -typedef CSRTest CSRRowNormalizeL1; -TEST_P(CSRRowNormalizeL1, Result) { - - int *ex_scan; - float *in_vals, *result, *verify; - - int ex_scan_h[4] = {0, 4, 8, 9 }; - float in_vals_h[10] = { 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0 }; - - float verify_h[10] = { 0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0 }; - - allocate(in_vals, 10); - allocate(verify, 10); - allocate(ex_scan, 4); - allocate(result, 10, true); - - updateDevice(ex_scan, *&ex_scan_h, 4, 0); - updateDevice(in_vals, *&in_vals_h, 10, 0); - updateDevice(verify, *&verify_h, 10, 0); - - csr_row_normalize_l1<32, float>(ex_scan, in_vals, 10, 4, result, 0); - cudaDeviceSynchronize(); - - ASSERT_TRUE(devArrMatch(verify, result, 10, Compare())); - - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(in_vals)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); -} - -typedef CSRTest CSRSum; -TEST_P(CSRSum, Result) { - - cudaStream_t stream; - cudaStreamCreate(&stream); - - int *ex_scan, *ind_ptr_a, *ind_ptr_b, *verify_indptr; - float *in_vals_a, *in_vals_b, *verify; - - int ex_scan_h[4] = {0, 4, 8, 9 }; - - int indptr_a_h[10] = { 1, 2, 3, 4, 1, 2, 3, 5, 0, 1 }; - int indptr_b_h[10] = { 1, 2, 5, 4, 0, 2, 3, 5, 1, 0 }; - - float in_vals_h[10] = { 1.0, 1.0, 0.5, 0.5, 
1.0, 1.0, 0.5, 0.5, 1.0, 1.0 }; - - float verify_h[14] = { 2.0, 2.0, 0.5, 1.0, 0.5, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 }; - int verify_indptr_h[14] = { 1, 2, 3, 4, 5, 1, 2, 3, 5, 0, 0, 1, 1, 0 }; - - allocate(in_vals_a, 10); - allocate(in_vals_b, 10); - allocate(verify, 14); - allocate(ex_scan, 4); - allocate(verify_indptr, 14); - - allocate(ind_ptr_a, 10); - allocate(ind_ptr_b, 10); - - updateDevice(ex_scan, *&ex_scan_h, 4, stream); - updateDevice(in_vals_a, *&in_vals_h, 10, stream); - updateDevice(in_vals_b, *&in_vals_h, 10, stream); - updateDevice(verify, *&verify_h, 14, stream); - updateDevice(verify_indptr, *&verify_indptr_h, 14, stream); - updateDevice(ind_ptr_a, *&indptr_a_h, 10, stream); - updateDevice(ind_ptr_b, *&indptr_b_h, 10, stream); - - int *result_ind; - allocate(result_ind, 4); - - int nnz = csr_add_calc_inds( - ex_scan, ind_ptr_a, in_vals_a, 10, - ex_scan, ind_ptr_b, in_vals_b, 10, - 4, result_ind, - 0 - ); - - int *result_indptr; - float *result_val; - allocate(result_indptr, nnz); - allocate(result_val, nnz); - - csr_add_finalize( - ex_scan, ind_ptr_a, in_vals_a, 10, - ex_scan, ind_ptr_b, in_vals_b, 10, - 4, result_ind, result_indptr, result_val, - 0 - ); - - ASSERT_TRUE(nnz==14); - - ASSERT_TRUE(devArrMatch(verify, result_val, nnz, Compare())); - ASSERT_TRUE(devArrMatch(verify_indptr, result_indptr, nnz, Compare())); - - cudaStreamDestroy(stream); - - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(in_vals_a)); - CUDA_CHECK(cudaFree(in_vals_b)); - CUDA_CHECK(cudaFree(ind_ptr_a)); - CUDA_CHECK(cudaFree(ind_ptr_b)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result_indptr)); - CUDA_CHECK(cudaFree(result_val)); -} - -typedef CSRTest CSRRowOpTest; -TEST_P(CSRRowOpTest, Result) { - - cudaStream_t stream; - cudaStreamCreate(&stream); - - int *ex_scan; - float *result, *verify; - - int ex_scan_h[4] = {0, 4, 8, 9 }; - - float verify_h[10] = { 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0 }; - - allocate(verify, 10); - 
allocate(ex_scan, 4); - allocate(result, 10, true); - - updateDevice(ex_scan, *&ex_scan_h, 4, stream); - updateDevice(verify, *&verify_h, 10, stream); - - csr_row_op(ex_scan, 4, 10, - [result] __device__ (int row, int start_idx, int stop_idx) { - for(int i = start_idx; i < stop_idx; i++ ) - result[i] = row; - }, stream); - - ASSERT_TRUE(devArrMatch(verify, result, 10, Compare())); - - cudaStreamDestroy(stream); - - CUDA_CHECK(cudaFree(ex_scan)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); -} - -typedef CSRTest AdjGraphTest; -TEST_P(AdjGraphTest, Result) { - - cudaStream_t stream; - cudaStreamCreate(&stream); - - int *row_ind, *result, *verify; - bool *adj; - - int row_ind_h[3] = {0, 3, 6 }; - bool adj_h[18] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - - int verify_h[9] = { 0, 1, 2, 0, 1, 2, 0, 1, 2 }; - - allocate(row_ind, 3); - allocate(adj, 18); - allocate(result, 9, true); - allocate(verify, 9); - - updateDevice(row_ind, *&row_ind_h, 3, stream); - updateDevice(adj, *&adj_h, 18, stream); - updateDevice(verify, *&verify_h, 9, stream); - - csr_adj_graph_batched(row_ind, 6, 9, 3, adj, result, stream); - - ASSERT_TRUE(devArrMatch(verify, result, 9, Compare())); - - cudaStreamDestroy(stream); - - CUDA_CHECK(cudaFree(row_ind)); - CUDA_CHECK(cudaFree(adj)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); -} - -typedef CSRTest WeakCCTest; -TEST_P(WeakCCTest, Result) { - - cudaStream_t stream; - cudaStreamCreate(&stream); - - int *row_ind, *row_ind_ptr, *result, *verify; - - int row_ind_h1[3] = {0, 3, 6 }; - int row_ind_ptr_h1[9] = { 0, 1, 2, 0, 1, 2, 0, 1, 2 }; - int verify_h1[6] = { 1, 1, 1, 2147483647, 2147483647, 2147483647 }; - - int row_ind_h2[3] = {0, 2, 4 }; - int row_ind_ptr_h2[5] = { 3, 4, 3, 4, 5 }; - int verify_h2[6] = { 1, 1, 1, 5, 5, 5 }; - - allocate(row_ind, 3); - allocate(row_ind_ptr, 9); - allocate(result, 9, true); - allocate(verify, 9); - - WeakCCState state(6); - - /** - * Run batch #1 - */ - 
updateDevice(row_ind, *&row_ind_h1, 3, stream); - updateDevice(row_ind_ptr, *&row_ind_ptr_h1, 9, stream); - updateDevice(verify, *&verify_h1, 6, stream); - - weak_cc_batched(result, row_ind, row_ind_ptr, 9, 6, 0, 3, &state, stream); - - ASSERT_TRUE(devArrMatch(verify, result, 6, Compare())); - - /** - * Run batch #2 - */ - updateDevice(row_ind, *&row_ind_h2, 3, stream); - updateDevice(row_ind_ptr, *&row_ind_ptr_h2, 5, stream); - updateDevice(verify, *&verify_h2, 6, stream); - - weak_cc_batched(result, row_ind, row_ind_ptr, 5, 6, 4, 3, &state, stream); - - ASSERT_TRUE(devArrMatch(verify, result, 6, Compare())); - - cudaStreamDestroy(stream); - - CUDA_CHECK(cudaFree(row_ind)); - CUDA_CHECK(cudaFree(row_ind_ptr)); - CUDA_CHECK(cudaFree(verify)); - CUDA_CHECK(cudaFree(result)); -} - -INSTANTIATE_TEST_CASE_P(CSRTests, WeakCCTest, - ::testing::ValuesIn(inputsf)); - - -INSTANTIATE_TEST_CASE_P(CSRTests, AdjGraphTest, - ::testing::ValuesIn(inputsf)); - -INSTANTIATE_TEST_CASE_P(CSRTests, CSRRowOpTest, - ::testing::ValuesIn(inputsf)); - - -INSTANTIATE_TEST_CASE_P(CSRTests, CSRToCOO, - ::testing::ValuesIn(inputsf)); - -INSTANTIATE_TEST_CASE_P(CSRTests, CSRRowNormalizeMax, - ::testing::ValuesIn(inputsf)); - -INSTANTIATE_TEST_CASE_P(CSRTests, CSRRowNormalizeL1, - ::testing::ValuesIn(inputsf)); - -INSTANTIATE_TEST_CASE_P(CSRTests, CSRSum, - ::testing::ValuesIn(inputsf)); -}} - diff --git a/ml-prims/test/hinge (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu b/ml-prims/test/hinge (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu deleted file mode 100644 index 384bb0ed76..0000000000 --- a/ml-prims/test/hinge (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu +++ /dev/null @@ -1,244 +0,0 @@ -#include -#include "functions/hinge.h" -#include "random/rng.h" -#include "test_utils.h" - - -namespace MLCommon { -namespace Functions { - -template -struct HingeLossInputs { - T tolerance; - T n_rows; - T n_cols; - int len; -}; - -template -class 
HingeLossTest: public ::testing::TestWithParam > { -protected: - void SetUp() override { - params = ::testing::TestWithParam>::GetParam(); - int len = params.len; - int n_rows = params.n_rows; - int n_cols = params.n_cols; - - T *labels, *coef; - - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - allocator.reset(new defaultDeviceAllocator); - - allocate(in, len); - allocate(out, 1); - allocate(out_lasso, 1); - allocate(out_ridge, 1); - allocate(out_elasticnet, 1); - allocate(out_grad, n_cols); - allocate(out_lasso_grad, n_cols); - allocate(out_ridge_grad, n_cols); - allocate(out_elasticnet_grad, n_cols); - allocate(out_ref, 1); - allocate(out_lasso_ref, 1); - allocate(out_ridge_ref, 1); - allocate(out_elasticnet_ref, 1); - allocate(out_grad_ref, n_cols); - allocate(out_lasso_grad_ref, n_cols); - allocate(out_ridge_grad_ref, n_cols); - allocate(out_elasticnet_grad_ref, n_cols); - - allocate(labels, params.n_rows); - allocate(coef, params.n_cols); - - T h_in[len] = {0.1, 0.35, -0.9, -1.4, 2.0, 3.1}; - updateDevice(in, h_in, len, stream); - - T h_labels[n_rows] = {0.3, 2.0, -1.1}; - updateDevice(labels, h_labels, n_rows, stream); - - T h_coef[n_cols] = {0.35, -0.24}; - updateDevice(coef, h_coef, n_cols, stream); - - T h_out_ref[1] = {2.6037}; - updateDevice(out_ref, h_out_ref, 1, stream); - - T h_out_lasso_ref[1] = {2.9577}; - updateDevice(out_lasso_ref, h_out_lasso_ref, 1, stream); - - T h_out_ridge_ref[1] = {2.71176}; - updateDevice(out_ridge_ref, h_out_ridge_ref, 1, stream); - - T h_out_elasticnet_ref[1] = {2.83473}; - updateDevice(out_elasticnet_ref, h_out_elasticnet_ref, 1, stream); - - T h_out_grad_ref[n_cols] = {-0.24333, -1.1933}; - updateDevice(out_grad_ref, h_out_grad_ref, n_cols, stream); - - T h_out_lasso_grad_ref[n_cols] = {0.3566, -1.7933}; - updateDevice(out_lasso_grad_ref, h_out_lasso_grad_ref, n_cols, stream); - - T h_out_ridge_grad_ref[n_cols] = {0.1766, 
-1.4813}; - updateDevice(out_ridge_grad_ref, h_out_ridge_grad_ref, n_cols, stream); - - T h_out_elasticnet_grad_ref[n_cols] = {0.2666, -1.63733}; - updateDevice(out_elasticnet_grad_ref, h_out_elasticnet_grad_ref, n_cols, stream); - - T alpha = 0.6; - T l1_ratio = 0.5; - - hingeLoss(in, params.n_rows, params.n_cols, labels, coef, out, penalty::NONE, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - hingeLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_grad, penalty::NONE, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - hingeLoss(in, params.n_rows, params.n_cols, labels, coef, out_lasso, penalty::L1, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - hingeLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_lasso_grad, penalty::L1, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - hingeLoss(in, params.n_rows, params.n_cols, labels, coef, out_ridge, penalty::L2, - alpha, l1_ratio, cublas_handle, allocator, stream); - - hingeLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_ridge_grad, penalty::L2, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - hingeLoss(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, allocator, stream); - - hingeLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet_grad, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(labels)); - CUDA_CHECK(cudaFree(coef)); - - } - - void TearDown() override { - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_lasso)); - 
CUDA_CHECK(cudaFree(out_ridge)); - CUDA_CHECK(cudaFree(out_elasticnet)); - CUDA_CHECK(cudaFree(out_grad)); - CUDA_CHECK(cudaFree(out_lasso_grad)); - CUDA_CHECK(cudaFree(out_ridge_grad)); - CUDA_CHECK(cudaFree(out_elasticnet_grad)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out_lasso_ref)); - CUDA_CHECK(cudaFree(out_ridge_ref)); - CUDA_CHECK(cudaFree(out_elasticnet_ref)); - CUDA_CHECK(cudaFree(out_grad_ref)); - CUDA_CHECK(cudaFree(out_lasso_grad_ref)); - CUDA_CHECK(cudaFree(out_ridge_grad_ref)); - CUDA_CHECK(cudaFree(out_elasticnet_grad_ref)); - } - -protected: - HingeLossInputs params; - T *in; - T *out, *out_lasso, *out_ridge, *out_elasticnet; - T *out_ref, *out_lasso_ref, *out_ridge_ref, *out_elasticnet_ref; - T *out_grad, *out_lasso_grad, *out_ridge_grad, *out_elasticnet_grad; - T *out_grad_ref, *out_lasso_grad_ref, *out_ridge_grad_ref, *out_elasticnet_grad_ref; - std::shared_ptr allocator; -}; - -const std::vector > inputsf = { - {0.01f, 3, 2, 6} -}; - -const std::vector > inputsd = { - {0.01, 3, 2, 6} -}; - -typedef HingeLossTest HingeLossTestF; -TEST_P(HingeLossTestF, Result) { - - - ASSERT_TRUE(devArrMatch(out_ref, out, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_lasso_ref, out_lasso, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_ridge_ref, out_ridge, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_elasticnet_ref, out_elasticnet, 1, - CompareApprox(params.tolerance))); - - - ASSERT_TRUE(devArrMatch(out_grad_ref, out_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_lasso_grad_ref, out_lasso_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_ridge_grad_ref, out_ridge_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_elasticnet_grad_ref, out_elasticnet_grad, params.n_cols, - CompareApprox(params.tolerance))); - - -} - -typedef HingeLossTest 
HingeLossTestD; -TEST_P(HingeLossTestD, Result){ - - - ASSERT_TRUE(devArrMatch(out_ref, out, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_lasso_ref, out_lasso, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_ridge_ref, out_ridge, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_elasticnet_ref, out_elasticnet, 1, - CompareApprox(params.tolerance))); - - - ASSERT_TRUE(devArrMatch(out_grad_ref, out_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_lasso_grad_ref, out_lasso_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_ridge_grad_ref, out_ridge_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_elasticnet_grad_ref, out_elasticnet_grad, params.n_cols, - CompareApprox(params.tolerance))); - - -} - -INSTANTIATE_TEST_CASE_P(HingeLossTests, HingeLossTestF, ::testing::ValuesIn(inputsf)); - -INSTANTIATE_TEST_CASE_P(HingeLossTests, HingeLossTestD, ::testing::ValuesIn(inputsd)); - -} // end namespace Functions -} // end namespace MLCommon diff --git a/ml-prims/test/linearReg (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu b/ml-prims/test/linearReg (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu deleted file mode 100644 index 0254ba17f1..0000000000 --- a/ml-prims/test/linearReg (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu +++ /dev/null @@ -1,239 +0,0 @@ -#include -#include "functions/linearReg.h" -#include "random/rng.h" -#include "test_utils.h" - - -namespace MLCommon { -namespace Functions { - -template -struct LinRegLossInputs { - T tolerance; - T n_rows; - T n_cols; - int len; -}; - -template -class LinRegLossTest: public ::testing::TestWithParam > { -protected: - void SetUp() override { - params = ::testing::TestWithParam>::GetParam(); - int len = params.len; - int n_rows = params.n_rows; - int n_cols = params.n_cols; - - T *labels, 
*coef; - - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - allocator.reset(new defaultDeviceAllocator); - - allocate(in, len); - allocate(out, 1); - allocate(out_lasso, 1); - allocate(out_ridge, 1); - allocate(out_elasticnet, 1); - allocate(out_grad, n_cols); - allocate(out_lasso_grad, n_cols); - allocate(out_ridge_grad, n_cols); - allocate(out_elasticnet_grad, n_cols); - allocate(out_ref, 1); - allocate(out_lasso_ref, 1); - allocate(out_ridge_ref, 1); - allocate(out_elasticnet_ref, 1); - allocate(out_grad_ref, n_cols); - allocate(out_lasso_grad_ref, n_cols); - allocate(out_ridge_grad_ref, n_cols); - allocate(out_elasticnet_grad_ref, n_cols); - - allocate(labels, params.n_rows); - allocate(coef, params.n_cols); - - T h_in[len] = {0.1, 0.35, -0.9, -1.4, 2.0, 3.1}; - updateDevice(in, h_in, len, stream); - - T h_labels[n_rows] = {0.3, 2.0, -1.1}; - updateDevice(labels, h_labels, n_rows, stream); - - T h_coef[n_cols] = {0.35, -0.24}; - updateDevice(coef, h_coef, n_cols, stream); - - T h_out_ref[1] = {1.854842}; - updateDevice(out_ref, h_out_ref, 1, stream); - - T h_out_lasso_ref[1] = {2.2088}; - updateDevice(out_lasso_ref, h_out_lasso_ref, 1, stream); - - T h_out_ridge_ref[1] = {1.9629}; - updateDevice(out_ridge_ref, h_out_ridge_ref, 1, stream); - - T h_out_elasticnet_ref[1] = {2.0858}; - updateDevice(out_elasticnet_ref, h_out_elasticnet_ref, 1, stream); - - T h_out_grad_ref[n_cols] = {-0.56995, -3.12486}; - updateDevice(out_grad_ref, h_out_grad_ref, n_cols, stream); - - T h_out_lasso_grad_ref[n_cols] = {0.03005, -3.724866}; - updateDevice(out_lasso_grad_ref, h_out_lasso_grad_ref, n_cols, stream); - - T h_out_ridge_grad_ref[n_cols] = {-0.14995, -3.412866}; - updateDevice(out_ridge_grad_ref, h_out_ridge_grad_ref, n_cols, stream); - - T h_out_elasticnet_grad_ref[n_cols] = {-0.05995, -3.568866}; - updateDevice(out_elasticnet_grad_ref, h_out_elasticnet_grad_ref, n_cols, 
stream); - - T alpha = 0.6; - T l1_ratio = 0.5; - - linearRegLoss(in, params.n_rows, params.n_cols, labels, coef, out, penalty::NONE, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - linearRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_grad, penalty::NONE, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - linearRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_lasso, penalty::L1, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - linearRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_lasso_grad, penalty::L1, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - linearRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_ridge, penalty::L2, - alpha, l1_ratio, cublas_handle, allocator, stream); - - linearRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_ridge_grad, penalty::L2, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - linearRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, allocator, stream); - - linearRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet_grad, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(labels)); - CUDA_CHECK(cudaFree(coef)); - - } - - void TearDown() override { - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_lasso)); - CUDA_CHECK(cudaFree(out_ridge)); - CUDA_CHECK(cudaFree(out_elasticnet)); - CUDA_CHECK(cudaFree(out_grad)); - CUDA_CHECK(cudaFree(out_lasso_grad)); - CUDA_CHECK(cudaFree(out_ridge_grad)); - 
CUDA_CHECK(cudaFree(out_elasticnet_grad)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out_lasso_ref)); - CUDA_CHECK(cudaFree(out_ridge_ref)); - CUDA_CHECK(cudaFree(out_elasticnet_ref)); - CUDA_CHECK(cudaFree(out_grad_ref)); - CUDA_CHECK(cudaFree(out_lasso_grad_ref)); - CUDA_CHECK(cudaFree(out_ridge_grad_ref)); - CUDA_CHECK(cudaFree(out_elasticnet_grad_ref)); - } - -protected: - LinRegLossInputs params; - T *in; - T *out, *out_lasso, *out_ridge, *out_elasticnet; - T *out_ref, *out_lasso_ref, *out_ridge_ref, *out_elasticnet_ref; - T *out_grad, *out_lasso_grad, *out_ridge_grad, *out_elasticnet_grad; - T *out_grad_ref, *out_lasso_grad_ref, *out_ridge_grad_ref, *out_elasticnet_grad_ref; - std::shared_ptr allocator; -}; - -const std::vector > inputsf = { - {0.01f, 3, 2, 6} -}; - -const std::vector > inputsd = { - {0.01, 3, 2, 6} -}; - -typedef LinRegLossTest LinRegLossTestF; -TEST_P(LinRegLossTestF, Result) { - - ASSERT_TRUE(devArrMatch(out_ref, out, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_lasso_ref, out_lasso, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_ridge_ref, out_ridge, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_elasticnet_ref, out_elasticnet, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_grad_ref, out_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_lasso_grad_ref, out_lasso_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_ridge_grad_ref, out_ridge_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_elasticnet_grad_ref, out_elasticnet_grad, params.n_cols, - CompareApprox(params.tolerance))); - -} - -typedef LinRegLossTest LinRegLossTestD; -TEST_P(LinRegLossTestD, Result){ - - ASSERT_TRUE(devArrMatch(out_ref, out, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_lasso_ref, out_lasso, 1, - 
CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_ridge_ref, out_ridge, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_elasticnet_ref, out_elasticnet, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_grad_ref, out_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_lasso_grad_ref, out_lasso_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_ridge_grad_ref, out_ridge_grad, params.n_cols, - CompareApprox(params.tolerance))); - - - ASSERT_TRUE(devArrMatch(out_elasticnet_grad_ref, out_elasticnet_grad, params.n_cols, - CompareApprox(params.tolerance))); - -} - -INSTANTIATE_TEST_CASE_P(LinRegLossTests, LinRegLossTestF, ::testing::ValuesIn(inputsf)); - -INSTANTIATE_TEST_CASE_P(LinRegLossTests, LinRegLossTestD, ::testing::ValuesIn(inputsd)); - -} // end namespace Functions -} // end namespace MLCommon diff --git a/ml-prims/test/logisticReg (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu b/ml-prims/test/logisticReg (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu deleted file mode 100644 index 2a61cc9d1e..0000000000 --- a/ml-prims/test/logisticReg (Dantes-MBP.fios-router.home's conflicted copy 2019-05-14).cu +++ /dev/null @@ -1,239 +0,0 @@ -#include -#include "functions/logisticReg.h" -#include "random/rng.h" -#include "test_utils.h" - - -namespace MLCommon { -namespace Functions { - -template -struct LogRegLossInputs { - T tolerance; - T n_rows; - T n_cols; - int len; -}; - -template -class LogRegLossTest: public ::testing::TestWithParam > { -protected: - void SetUp() override { - params = ::testing::TestWithParam>::GetParam(); - int len = params.len; - int n_rows = params.n_rows; - int n_cols = params.n_cols; - - T *labels, *coef; - - cublasHandle_t cublas_handle; - CUBLAS_CHECK(cublasCreate(&cublas_handle)); - cudaStream_t stream; - CUDA_CHECK(cudaStreamCreate(&stream)); - - allocator.reset(new 
defaultDeviceAllocator); - - allocate(in, len); - allocate(out, 1); - allocate(out_lasso, 1); - allocate(out_ridge, 1); - allocate(out_elasticnet, 1); - allocate(out_grad, n_cols); - allocate(out_lasso_grad, n_cols); - allocate(out_ridge_grad, n_cols); - allocate(out_elasticnet_grad, n_cols); - allocate(out_ref, 1); - allocate(out_lasso_ref, 1); - allocate(out_ridge_ref, 1); - allocate(out_elasticnet_ref, 1); - allocate(out_grad_ref, n_cols); - allocate(out_lasso_grad_ref, n_cols); - allocate(out_ridge_grad_ref, n_cols); - allocate(out_elasticnet_grad_ref, n_cols); - - allocate(labels, params.n_rows); - allocate(coef, params.n_cols); - - T h_in[len] = {0.1, 0.35, -0.9, -1.4, 2.0, 3.1}; - updateDevice(in, h_in, len, stream); - - T h_labels[n_rows] = {0.3, 2.0, -1.1}; - updateDevice(labels, h_labels, n_rows, stream); - - T h_coef[n_cols] = {0.35, -0.24}; - updateDevice(coef, h_coef, n_cols, stream); - - T h_out_ref[1] = {0.38752545}; - updateDevice(out_ref, h_out_ref, 1, stream); - - T h_out_lasso_ref[1] = {0.74152}; - updateDevice(out_lasso_ref, h_out_lasso_ref, 1, stream); - - T h_out_ridge_ref[1] = {0.4955854}; - updateDevice(out_ridge_ref, h_out_ridge_ref, 1, stream); - - T h_out_elasticnet_ref[1] = {0.618555}; - updateDevice(out_elasticnet_ref, h_out_elasticnet_ref, 1, stream); - - T h_out_grad_ref[n_cols] = {-0.58284, 0.207666}; - updateDevice(out_grad_ref, h_out_grad_ref, n_cols, stream); - - T h_out_lasso_grad_ref[n_cols] = {0.0171, -0.39233}; - updateDevice(out_lasso_grad_ref, h_out_lasso_grad_ref, n_cols, stream); - - T h_out_ridge_grad_ref[n_cols] = {-0.16284, -0.080333}; - updateDevice(out_ridge_grad_ref, h_out_ridge_grad_ref, n_cols, stream); - - T h_out_elasticnet_grad_ref[n_cols] = {-0.07284, -0.23633}; - updateDevice(out_elasticnet_grad_ref, h_out_elasticnet_grad_ref, n_cols, stream); - - T alpha = 0.6; - T l1_ratio = 0.5; - - logisticRegLoss(in, params.n_rows, params.n_cols, labels, coef, out, penalty::NONE, - alpha, l1_ratio, cublas_handle, 
allocator, stream); - - updateDevice(in, h_in, len, stream); - - logisticRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_grad, penalty::NONE, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - logisticRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_lasso, penalty::L1, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - logisticRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_lasso_grad, penalty::L1, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - logisticRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_ridge, penalty::L2, - alpha, l1_ratio, cublas_handle, allocator, stream); - - logisticRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_ridge_grad, penalty::L2, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - logisticRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, allocator, stream); - - logisticRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet_grad, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, allocator, stream); - - updateDevice(in, h_in, len, stream); - - CUBLAS_CHECK(cublasDestroy(cublas_handle)); - CUDA_CHECK(cudaStreamDestroy(stream)); - CUDA_CHECK(cudaFree(labels)); - CUDA_CHECK(cudaFree(coef)); - - } - - void TearDown() override { - CUDA_CHECK(cudaFree(in)); - CUDA_CHECK(cudaFree(out)); - CUDA_CHECK(cudaFree(out_lasso)); - CUDA_CHECK(cudaFree(out_ridge)); - CUDA_CHECK(cudaFree(out_elasticnet)); - CUDA_CHECK(cudaFree(out_grad)); - CUDA_CHECK(cudaFree(out_lasso_grad)); - CUDA_CHECK(cudaFree(out_ridge_grad)); - CUDA_CHECK(cudaFree(out_elasticnet_grad)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out_lasso_ref)); - CUDA_CHECK(cudaFree(out_ridge_ref)); - 
CUDA_CHECK(cudaFree(out_elasticnet_ref)); - CUDA_CHECK(cudaFree(out_grad_ref)); - CUDA_CHECK(cudaFree(out_lasso_grad_ref)); - CUDA_CHECK(cudaFree(out_ridge_grad_ref)); - CUDA_CHECK(cudaFree(out_elasticnet_grad_ref)); - } - -protected: - LogRegLossInputs params; - T *in; - T *out, *out_lasso, *out_ridge, *out_elasticnet; - T *out_ref, *out_lasso_ref, *out_ridge_ref, *out_elasticnet_ref; - T *out_grad, *out_lasso_grad, *out_ridge_grad, *out_elasticnet_grad; - T *out_grad_ref, *out_lasso_grad_ref, *out_ridge_grad_ref, *out_elasticnet_grad_ref; - std::shared_ptr allocator; -}; - -const std::vector > inputsf = { - {0.01f, 3, 2, 6} -}; - -const std::vector > inputsd = { - {0.01, 3, 2, 6} -}; - -typedef LogRegLossTest LogRegLossTestF; -TEST_P(LogRegLossTestF, Result) { - - - ASSERT_TRUE(devArrMatch(out_ref, out, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_lasso_ref, out_lasso, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_ridge_ref, out_ridge, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_elasticnet_ref, out_elasticnet, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_grad_ref, out_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_lasso_grad_ref, out_lasso_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_ridge_grad_ref, out_ridge_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_elasticnet_grad_ref, out_elasticnet_grad, params.n_cols, - CompareApprox(params.tolerance))); - -} - -typedef LogRegLossTest LogRegLossTestD; -TEST_P(LogRegLossTestD, Result){ - - - ASSERT_TRUE(devArrMatch(out_ref, out, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_lasso_ref, out_lasso, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_ridge_ref, out_ridge, 1, - CompareApprox(params.tolerance))); - - 
ASSERT_TRUE(devArrMatch(out_elasticnet_ref, out_elasticnet, 1, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_grad_ref, out_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_lasso_grad_ref, out_lasso_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_ridge_grad_ref, out_ridge_grad, params.n_cols, - CompareApprox(params.tolerance))); - - ASSERT_TRUE(devArrMatch(out_elasticnet_grad_ref, out_elasticnet_grad, params.n_cols, - CompareApprox(params.tolerance))); - -} - -INSTANTIATE_TEST_CASE_P(LogRegLossTests, LogRegLossTestF, ::testing::ValuesIn(inputsf)); - -INSTANTIATE_TEST_CASE_P(LogRegLossTests, LogRegLossTestD, ::testing::ValuesIn(inputsd)); - -} // end namespace Functions -} // end namespace MLCommon From 486816a9b217f6a5e81ac28514c3c25211791fe5 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Wed, 15 May 2019 21:51:47 -0500 Subject: [PATCH 143/156] FIX trust and rand proj pep8 fixes --- python/cuml/metrics/trustworthiness.pyx | 51 +++--- .../{rproj.pyx => random_projextion.pyx} | 152 +++++++++--------- 2 files changed, 113 insertions(+), 90 deletions(-) rename python/cuml/random_projection/{rproj.pyx => random_projextion.pyx} (79%) diff --git a/python/cuml/metrics/trustworthiness.pyx b/python/cuml/metrics/trustworthiness.pyx index 0aeab3698c..0ed0114e78 100644 --- a/python/cuml/metrics/trustworthiness.pyx +++ b/python/cuml/metrics/trustworthiness.pyx @@ -34,12 +34,16 @@ cdef extern from "metrics/trustworthiness_c.h" namespace "MLCommon::Distance": cdef extern from "metrics/trustworthiness_c.h" namespace "ML::Metrics": - cdef double trustworthiness_score[T, DistanceType](const cumlHandle& h, T* X, - T* X_embedded, int n, int m, int d, - int n_neighbors) + cdef double trustworthiness_score[T, DistanceType](const cumlHandle& h, + T* X, + T* X_embedded, + int n, int m, + int d, + int n_neighbors) -def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, 
metric='euclidean', should_downcast=True): +def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, + metric='euclidean', should_downcast=True): """ Expresses to what extent the local structure is retained in embedding. The score is defined in the range [0, 1]. @@ -60,7 +64,8 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, metric='euclidean trustworthiness score : double Trustworthiness of the low-dimensional embedding """ - if isinstance(X, cudf.DataFrame) and isinstance(X_embedded, cudf.DataFrame): + if (isinstance(X, cudf.DataFrame) and + isinstance(X_embedded, cudf.DataFrame)): datatype1 = np.dtype(X[X.columns[0]]._column.dtype) datatype2 = np.dtype(X_embedded[X_embedded.columns[0]]._column.dtype) n_samples = len(X) @@ -72,15 +77,18 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, metric='euclidean n_samples, n_features = X.shape n_components = X_embedded.shape[1] else: - raise TypeError("X and X_embedded parameters must both be cuDF Dataframes or Numpy ndarray") + raise TypeError("X and X_embedded parameters must both be cuDF" + " Dataframes or Numpy ndarray") if datatype1 != np.float32 or datatype2 != np.float32: if should_downcast: X = to_single_precision(X) X_embedded = to_single_precision(X_embedded) else: - raise Exception("Input is double precision. Use 'should_downcast=True' " - "if you'd like it to be automatically casted to single precision.") + raise Exception("Input is double precision. 
Use " + "'should_downcast=True' " + "if you'd like it to be automatically " + "casted to single precision.") if isinstance(X, cudf.DataFrame): d_X = X.as_gpu_matrix(order='C') @@ -88,7 +96,7 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, metric='euclidean elif isinstance(X, np.ndarray): d_X = cuda.to_device(X) d_X_embedded = cuda.to_device(X_embedded) - + cdef uintptr_t d_X_ptr = get_ctype_ptr(d_X) cdef uintptr_t d_X_embedded_ptr = get_ctype_ptr(d_X_embedded) @@ -99,9 +107,13 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, metric='euclidean handle_ = handle.getHandle() if metric == 'euclidean': - res = trustworthiness_score[float, euclidean](handle_[0], d_X_ptr, - d_X_embedded_ptr, n_samples, n_features, - n_components, n_neighbors) + res = trustworthiness_score[float, euclidean](handle_[0], + d_X_ptr, + d_X_embedded_ptr, + n_samples, + n_features, + n_components, + n_neighbors) else: raise Exception("Unknown metric") @@ -119,18 +131,21 @@ def get_ctype_ptr(obj): def to_single_precision(X): if isinstance(X, cudf.DataFrame): - new_cols = [(col,X._cols[col].astype(np.float32)) for col in X._cols] - overflowed = sum([len(colval[colval >= np.inf]) for colname, colval in new_cols]) + new_cols = [(col, X._cols[col].astype(np.float32)) for col in X._cols] + overflowed = sum([len(colval[colval >= np.inf]) for colname, colval + in new_cols]) if overflowed > 0: - raise Exception("Downcast to single-precision resulted in data loss.") - + raise Exception("Downcast to single-precision resulted in data" + " loss.") + X = cudf.DataFrame(new_cols) else: X = X.astype(np.float32) overflowed = len(X[X >= np.inf]) if overflowed > 0: - raise Exception("Downcast to single-precision resulted in data loss.") + raise Exception("Downcast to single-precision resulted in data" + " loss.") - return X \ No newline at end of file + return X diff --git a/python/cuml/random_projection/rproj.pyx b/python/cuml/random_projection/random_projextion.pyx similarity 
index 79% rename from python/cuml/random_projection/rproj.pyx rename to python/cuml/random_projection/random_projextion.pyx index 1162809f74..2668c0813f 100644 --- a/python/cuml/random_projection/rproj.pyx +++ b/python/cuml/random_projection/random_projextion.pyx @@ -37,38 +37,40 @@ cdef extern from "random_projection/rproj_c.h" namespace "ML": int n_samples # number of samples int n_features # number of features (original dimension) int n_components # number of components (target dimension) - double eps # error tolerance according to Johnson-Lindenstrauss lemma - bool gaussian_method # toggle Gaussian or Sparse random projection methods - double density # ratio of non-zero component in the random projection matrix (used for sparse random projection) - bool dense_output # toggle random projection's transformation as a dense or sparse matrix + double eps # error tolerance according to Johnson-Lindenstrauss lemma # noqa E501 + bool gaussian_method # toggle Gaussian or Sparse random projection methods # noqa E501 + double density # ratio of non-zero component in the random projection matrix (used for sparse random projection) # noqa E501 + bool dense_output # toggle random projection's transformation as a dense or sparse matrix # noqa E501 int random_state # seed used by random generator # Structure describing random matrix cdef cppclass rand_mat[T]: - rand_mat() except + # random matrix structure constructor (set all to nullptr) + rand_mat() except + # random matrix structure constructor (set all to nullptr) # noqa E501 T *dense_data # dense random matrix data int *indices # sparse CSC random matrix indices int *indptr # sparse CSC random matrix indptr T *sparse_data # sparse CSC random matrix data - size_t sparse_data_size # sparse CSC random matrix number of non-zero elements + size_t sparse_data_size # sparse CSC random matrix number of non-zero elements # noqa E501 # Function used to fit the model cdef void RPROJfit[T](const cumlHandle& handle, rand_mat[T] 
*random_matrix, - paramsRPROJ* params) - + paramsRPROJ* params) + # Function used to apply data transformation cdef void RPROJtransform[T](const cumlHandle& handle, T *input, rand_mat[T] *random_matrix, T *output, paramsRPROJ* params) # Function used to compute the Johnson Lindenstrauss minimal distance - cdef size_t c_johnson_lindenstrauss_min_dim "ML::johnson_lindenstrauss_min_dim" (size_t n_samples, double eps) + cdef size_t c_johnson_lindenstrauss_min_dim \ + "ML::johnson_lindenstrauss_min_dim" (size_t n_samples, double eps) def johnson_lindenstrauss_min_dim(n_samples, eps=0.1): """ - In mathematics, the Johnson–Lindenstrauss lemma states that high-dimensional data - can be embedded into lower dimension while preserving the distances. + In mathematics, the Johnson–Lindenstrauss lemma states that + high-dimensional data can be embedded into lower dimension while preserving + the distances. With p the random projection : (1 - eps) ||u - v||^2 < ||p(u) - p(v)||^2 < (1 + eps) ||u - v||^2 @@ -83,7 +85,7 @@ def johnson_lindenstrauss_min_dim(n_samples, eps=0.1): Number of samples. eps : float in (0,1) (default = 0.1) Maximum distortion rate as defined by the Johnson-Lindenstrauss lemma. - + Returns ------- @@ -94,15 +96,17 @@ def johnson_lindenstrauss_min_dim(n_samples, eps=0.1): """ return c_johnson_lindenstrauss_min_dim(n_samples, eps) + cdef class BaseRandomProjection(): """ Base class for random projections. This class is not intended to be used directly. - - Random projection is a dimensionality reduction technique. Random projection methods - are powerful methods known for their simplicity, computational efficiency and restricted model size. - This algorithm also has the advantage to preserve distances well between any two samples - and is thus suitable for methods having this requirement. + + Random projection is a dimensionality reduction technique. 
Random + projection methods are powerful methods known for their simplicity, + computational efficiency and restricted model size. + This algorithm also has the advantage to preserve distances well between + any two samples and is thus suitable for methods having this requirement. Parameters ---------- @@ -137,7 +141,8 @@ cdef class BaseRandomProjection(): Notes ------ - Inspired from sklearn's implementation : https://scikit-learn.org/stable/modules/random_projection.html + Inspired from sklearn's implementation : + https://scikit-learn.org/stable/modules/random_projection.html """ @@ -154,8 +159,9 @@ cdef class BaseRandomProjection(): del self.rand_matD def __init__(self, n_components='auto', eps=0.1, - dense_output=True, random_state=None): - self.params.n_components = n_components if n_components != 'auto' else -1 + dense_output=True, random_state=None): + self.params.n_components = n_components if n_components != 'auto'\ + else -1 self.params.eps = eps self.params.dense_output = dense_output if random_state is not None: @@ -164,17 +170,6 @@ cdef class BaseRandomProjection(): self.params.gaussian_method = self.gaussian_method self.params.density = self.density - # Gets device pointer from Numba's Cuda array - def _get_ctype_ptr(self, obj): - # The manner to access the pointers in the gdf's might change, so - # encapsulating access in the following 3 methods. They might also be - # part of future gdf versions. - return obj.device_ctypes_pointer.value - - # Gets device pointer from cuDF dataframe's column - def _get_column_ptr(self, obj): - return self._get_ctype_ptr(obj._column._data.to_gpu_array()) - def fit(self, X, y=None): """ Fit the model. This function generates the random matrix on GPU. 
@@ -182,7 +177,8 @@ cdef class BaseRandomProjection(): Parameters ---------- X : cuDF DataFrame or Numpy array - Dense matrix (floats or doubles) of shape (n_samples, n_features) + Dense matrix (floats or doubles) of shape + (n_samples, n_features) Used to provide shape information Returns @@ -220,12 +216,14 @@ cdef class BaseRandomProjection(): def transform(self, X): """ Apply transformation on provided data. This function outputs - a multiplication between the input matrix and the generated random matrix + a multiplication between the input matrix and the generated random + matrix Parameters ---------- X : cuDF DataFrame or Numpy array - Dense matrix (floats or doubles) of shape (n_samples, n_features) + Dense matrix (floats or doubles) of shape + (n_samples, n_features) Used as input matrix Returns @@ -251,30 +249,30 @@ cdef class BaseRandomProjection(): raise TypeError(msg) X_new = cuda.device_array((n_samples, self.params.n_components), - dtype=self.gdf_datatype, - order='F') + dtype=self.gdf_datatype, + order='F') - cdef uintptr_t input_ptr = self._get_ctype_ptr(X_m) - cdef uintptr_t output_ptr = self._get_ctype_ptr(X_new) + cdef uintptr_t input_ptr = self._get_dev_array_ptr(X_m) + cdef uintptr_t output_ptr = self._get_dev_array_ptr(X_new) if self.params.n_features != n_features: raise ValueError("n_features must be same as on fitting: %d" % - self.params.n_features) + self.params.n_features) cdef cumlHandle* handle_ = self.handle.getHandle() if self.gdf_datatype.type == np.float32: RPROJtransform[float](handle_[0], - input_ptr, - self.rand_matS, - output_ptr, - &self.params) + input_ptr, + self.rand_matS, + output_ptr, + &self.params) else: RPROJtransform[double](handle_[0], - input_ptr, - self.rand_matD, - output_ptr, - &self.params) + input_ptr, + self.rand_matD, + output_ptr, + &self.params) self.handle.sync() @@ -284,7 +282,7 @@ cdef class BaseRandomProjection(): del(X_new) gdf_X_new = cudf.DataFrame() for i in range(0, h_X_new.shape[1]): - 
gdf_X_new[str(i)] = h_X_new[:,i] + gdf_X_new[str(i)] = h_X_new[:, i] return gdf_X_new else: @@ -293,12 +291,14 @@ cdef class BaseRandomProjection(): class GaussianRandomProjection(Base, BaseRandomProjection): """ - Gaussian Random Projection method derivated from BaseRandomProjection class. + Gaussian Random Projection method derivated from BaseRandomProjection + class. - Random projection is a dimensionality reduction technique. Random projection methods - are powerful methods known for their simplicity, computational efficiency and restricted model size. - This algorithm also has the advantage to preserve distances well between any two samples - and is thus suitable for methods having this requirement. + Random projection is a dimensionality reduction technique. Random + projection methods are powerful methods known for their simplicity, + computational efficiency and restricted model size. + This algorithm also has the advantage to preserve distances well between + any two samples and is thus suitable for methods having this requirement. The components of the random matrix are drawn from N(0, 1 / n_components). 
@@ -311,10 +311,12 @@ class GaussianRandomProjection(Base, BaseRandomProjection): from sklearn.svm import SVC # dataset generation - data, target = make_blobs(n_samples=800, centers=400, n_features=3000, random_state=42) + data, target = make_blobs(n_samples=800, centers=400, n_features=3000, + random_state=42) # model fitting - model = GaussianRandomProjection(n_components=5, random_state=42).fit(data) + model = GaussianRandomProjection(n_components=5, + random_state=42).fit(data) # dataset transformation transformed_data = model.transform(data) @@ -363,16 +365,17 @@ class GaussianRandomProjection(Base, BaseRandomProjection): Notes ------ - Inspired from sklearn's implementation : https://scikit-learn.org/stable/modules/random_projection.html + Inspired from sklearn's implementation : + https://scikit-learn.org/stable/modules/random_projection.html """ def __init__(self, handle=None, n_components='auto', eps=0.1, - random_state=None, verbose=False): + random_state=None, verbose=False): Base.__init__(self, handle, verbose) self.gaussian_method = True - self.density = -1.0 # not used - + self.density = -1.0 # not used + BaseRandomProjection.__init__( self, n_components=n_components, @@ -385,14 +388,16 @@ class SparseRandomProjection(Base, BaseRandomProjection): """ Sparse Random Projection method derivated from BaseRandomProjection class. - Random projection is a dimensionality reduction technique. Random projection methods - are powerful methods known for their simplicity, computational efficiency and restricted model size. - This algorithm also has the advantage to preserve distances well between any two samples - and is thus suitable for methods having this requirement. + Random projection is a dimensionality reduction technique. Random + projection methods are powerful methods known for their simplicity, + computational efficiency and restricted model size. 
+ This algorithm also has the advantage to preserve distances well between + any two samples and is thus suitable for methods having this requirement. - Sparse random matrix is an alternative to dense random projection matrix (e.g. Gaussian) - that guarantees similar embedding quality while being much more memory efficient - and allowing faster computation of the projected data (with sparse enough matrices). + Sparse random matrix is an alternative to dense random projection matrix + (e.g. Gaussian) that guarantees similar embedding quality while being much + more memory efficient and allowing faster computation of the projected data + (with sparse enough matrices). If we note 's = 1 / density' the components of the random matrix are drawn from: - -sqrt(s) / sqrt(n_components) with probability 1 / 2s @@ -408,10 +413,12 @@ class SparseRandomProjection(Base, BaseRandomProjection): from sklearn.svm import SVC # dataset generation - data, target = make_blobs(n_samples=800, centers=400, n_features=3000, random_state=42) + data, target = make_blobs(n_samples=800, centers=400, n_features=3000, + random_state=42) # model fitting - model = SparseRandomProjection(n_components=5, random_state=42).fit(data) + model = SparseRandomProjection(n_components=5, + random_state=42).fit(data) # dataset transformation transformed_data = model.transform(data) @@ -449,7 +456,7 @@ class SparseRandomProjection(Base, BaseRandomProjection): Ratio of non-zero component in the random projection matrix. If density = 'auto', the value is set to the minimum density - as recommended by Ping Li et al.: 1 / sqrt(n_features). + as recommended by Ping Li et al.: 1 / sqrt(n_features). eps : float (default = 0.1) Error tolerance during projection. 
Used by Johnson–Lindenstrauss @@ -469,12 +476,13 @@ class SparseRandomProjection(Base, BaseRandomProjection): Notes ------ - Inspired from sklearn's implementation : https://scikit-learn.org/stable/modules/random_projection.html + Inspired from sklearn's implementation : + https://scikit-learn.org/stable/modules/random_projection.html """ def __init__(self, handle=None, n_components='auto', density='auto', - eps=0.1, dense_output=True, random_state=None, verbose=False): + eps=0.1, dense_output=True, random_state=None, verbose=False): Base.__init__(self, handle, verbose) self.gaussian_method = False self.density = density if density != 'auto' else -1.0 @@ -484,4 +492,4 @@ class SparseRandomProjection(Base, BaseRandomProjection): n_components=n_components, eps=eps, dense_output=dense_output, - random_state=random_state) \ No newline at end of file + random_state=random_state) From 92964282816eb61963e653071ced878d5f9c6b07 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Wed, 15 May 2019 21:58:51 -0500 Subject: [PATCH 144/156] FIX Fix files product of a bad merge --- cpp/src_prims/sparse/coo.h | 36 +-- cpp/src_prims/sparse/csr.h | 412 ++++++++++++++++++++++++++++++++-- cpp/test/prims/coo.cu | 6 - cpp/test/prims/csr.cu | 142 +++++++++++- cpp/test/prims/hinge.cu | 19 +- cpp/test/prims/linearReg.cu | 19 +- cpp/test/prims/logisticReg.cu | 19 +- 7 files changed, 579 insertions(+), 74 deletions(-) diff --git a/cpp/src_prims/sparse/coo.h b/cpp/src_prims/sparse/coo.h index eed47016e4..ad4f7eecdc 100644 --- a/cpp/src_prims/sparse/coo.h +++ b/cpp/src_prims/sparse/coo.h @@ -381,7 +381,7 @@ void coo_sort(int m, int n, int nnz, * @param stream: the cuda stream to use */ template - void coo_sort(COO *in, cudaStream_t stream = 0) { + void coo_sort(COO* const in, cudaStream_t stream = 0) { coo_sort(in->n_rows, in->n_cols, in->nnz, in->rows, in->cols, in->vals, stream); } @@ -447,7 +447,7 @@ __global__ void coo_remove_scalar_kernel( * @param results array to place results */ 
template -__global__ void coo_row_count_kernel(int *rows, int nnz, +__global__ void coo_row_count_kernel(int* const rows, int nnz, int *results) { int row = (blockIdx.x * TPB_X) + threadIdx.x; if(row < nnz) { @@ -464,7 +464,7 @@ __global__ void coo_row_count_kernel(int *rows, int nnz, * @param stream: cuda stream to use */ template -void coo_row_count(int *rows, int nnz, int *results, +void coo_row_count(int* const rows, int nnz, int *results, cudaStream_t stream) { dim3 grid_rc(MLCommon::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -482,7 +482,7 @@ void coo_row_count(int *rows, int nnz, int *results, * @param stream: cuda stream to use */ template -void coo_row_count(COO *in, int *results, cudaStream_t stream = 0) { +void coo_row_count(COO* const in, int *results, cudaStream_t stream = 0) { dim3 grid_rc(MLCommon::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -491,7 +491,7 @@ void coo_row_count(COO *in, int *results, cudaStream_t stream = 0) { } template -__global__ void coo_row_count_nz_kernel(int *rows, T *vals, int nnz, +__global__ void coo_row_count_nz_kernel(int* const rows, T* const vals, int nnz, int *results) { int row = (blockIdx.x * TPB_X) + threadIdx.x; if(row < nnz && vals[row] != 0.0) { @@ -500,7 +500,7 @@ __global__ void coo_row_count_nz_kernel(int *rows, T *vals, int nnz, } template -__global__ void coo_row_count_scalar_kernel(int *rows, T *vals, int nnz, +__global__ void coo_row_count_scalar_kernel(int* const rows, T* const vals, int nnz, T scalar, int *results) { int row = (blockIdx.x * TPB_X) + threadIdx.x; if(row < nnz && vals[row] != scalar) { @@ -518,7 +518,7 @@ __global__ void coo_row_count_scalar_kernel(int *rows, T *vals, int nnz, * @param stream: cuda stream to use */ template -void coo_row_count_scalar(COO *in, T scalar, int *results, +void coo_row_count_scalar(COO* const in, T scalar, int *results, cudaStream_t stream = 0) { dim3 grid_rc(MLCommon::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ 
-540,7 +540,7 @@ void coo_row_count_scalar(COO *in, T scalar, int *results, * @param stream: cuda stream to use */ template -void coo_row_count_scalar(int *rows, T *vals, int nnz, T scalar, +void coo_row_count_scalar(int* const rows, T* const vals, int nnz, T scalar, int *results, cudaStream_t stream = 0) { dim3 grid_rc(MLCommon::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -561,7 +561,7 @@ void coo_row_count_scalar(int *rows, T *vals, int nnz, T scalar, * @param stream: cuda stream to use */ template -void coo_row_count_nz(int *rows, T *vals, int nnz, int *results, +void coo_row_count_nz(int* const rows, T* const vals, int nnz, int *results, cudaStream_t stream = 0) { dim3 grid_rc(MLCommon::ceildiv(nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -579,7 +579,7 @@ void coo_row_count_nz(int *rows, T *vals, int nnz, int *results, * @param stream: cuda stream to use */ template -void coo_row_count_nz(COO *in, int *results, cudaStream_t stream = 0) { +void coo_row_count_nz(COO* const in, int *results, cudaStream_t stream = 0) { dim3 grid_rc(MLCommon::ceildiv(in->nnz, TPB_X), 1, 1); dim3 blk_rc(TPB_X, 1, 1); @@ -656,7 +656,7 @@ void coo_remove_scalar( * @param stream: cuda stream to use */ template -void coo_remove_scalar(COO *in, +void coo_remove_scalar(COO* const in, COO *out, T scalar, cudaStream_t stream) { @@ -703,14 +703,14 @@ void coo_remove_scalar(COO *in, * @param stream: cuda stream to use */ template -void coo_remove_zeros(COO *in, +void coo_remove_zeros(COO* const in, COO *out, cudaStream_t stream) { coo_remove_scalar(in, out, T(0.0), stream); } template -__global__ void from_knn_graph_kernel(long *knn_indices, T *knn_dists, int m, int k, +__global__ void from_knn_graph_kernel(long* const knn_indices, T* const knn_dists, int m, int k, int *rows, int *cols, T *vals) { int row = (blockIdx.x * TPB_X) + threadIdx.x; @@ -737,7 +737,7 @@ __global__ void from_knn_graph_kernel(long *knn_indices, T *knn_dists, int m, in * @param vals: output COO val array 
*/ template -void from_knn(long *knn_indices, T *knn_dists, int m, int k, +void from_knn(long* const knn_indices, T* const knn_dists, int m, int k, int *rows, int *cols, T *vals) { dim3 grid(ceildiv(m, 32), 1, 1); @@ -751,7 +751,7 @@ void from_knn(long *knn_indices, T *knn_dists, int m, int k, * into COO format. */ template -void from_knn(long *knn_indices, T *knn_dists, int m, int k, +void from_knn(long* const knn_indices, T* const knn_dists, int m, int k, COO *out) { out->allocate(m*k, m, m); @@ -771,7 +771,7 @@ void from_knn(long *knn_indices, T *knn_dists, int m, int k, */ template void sorted_coo_to_csr( - T *rows, int nnz, T *row_ind, int m, + T* const rows, int nnz, T *row_ind, int m, cudaStream_t stream = 0) { T *row_counts; @@ -798,7 +798,7 @@ void sorted_coo_to_csr( * @param stream: cuda stream to use */ template -void sorted_coo_to_csr(COO *coo, int *row_ind, cudaStream_t stream = 0) { +void sorted_coo_to_csr(COO* const coo, int *row_ind, cudaStream_t stream = 0) { sorted_coo_to_csr(coo->rows, coo->nnz, row_ind, coo->n_rows, stream); } @@ -875,7 +875,7 @@ __global__ void coo_symmetrize_kernel( * @param stream: cuda stream to use */ template -void coo_symmetrize(COO *in, +void coo_symmetrize(COO* const in, COO *out, Lambda reduction_op, // two-argument reducer cudaStream_t stream) { diff --git a/cpp/src_prims/sparse/csr.h b/cpp/src_prims/sparse/csr.h index c8057d94f2..2ab9a134bd 100644 --- a/cpp/src_prims/sparse/csr.h +++ b/cpp/src_prims/sparse/csr.h @@ -17,6 +17,8 @@ #include "cuda_utils.h" +#include "array/array.h" + #include #include @@ -60,7 +62,7 @@ class CSR { * @param n_rows: number of rows in the dense matrix * @param n_cols: number of cols in the dense matrix */ - CSR(int *row_ind, int *row_ind_ptr, T *vals, int nnz, int n_rows = -1, int n_cols = -1) { + CSR(int* const row_ind, int* const row_ind_ptr, T* const vals, int nnz, int n_rows = -1, int n_cols = -1) { this->row_ind = row_ind; this->row_ind_ptr = row_ind_ptr; this->vals = vals; @@ -246,10 
+248,10 @@ __global__ void csr_row_normalize_l1_kernel( * @param result: l1 normalized data array * @param stream: cuda stream to use */ -template +template void csr_row_normalize_l1( - int *ia, // csr row ex_scan (sorted by row) - T *vals, int nnz, // array of values and number of non-zeros + int* const ia, // csr row ex_scan (sorted by row) + T* const vals, int nnz, // array of values and number of non-zeros int m, // num rows in csr T *result, cudaStream_t stream) { // output array @@ -261,7 +263,7 @@ void csr_row_normalize_l1( m, result); } -template +template __global__ void csr_row_normalize_max_kernel( int *ia, // csr row ind array (sorted by row) T *vals, int nnz, // array of values and number of non-zeros @@ -310,10 +312,10 @@ __global__ void csr_row_normalize_max_kernel( * @param stream: cuda stream to use */ -template +template void csr_row_normalize_max( - int *ia, // csr row ind array (sorted by row) - T *vals, int nnz, // array of values and number of non-zeros + int* const ia, // csr row ind array (sorted by row) + T* const vals, int nnz, // array of values and number of non-zeros int m, // num total rows in csr T *result, cudaStream_t stream) { @@ -337,7 +339,7 @@ __device__ int get_stop_idx(T row, int m, int nnz, T *ind) { return stop_idx; } -template +template __global__ void csr_to_coo_kernel(int *row_ind, int m, int *coo_rows, int nnz) { // row-based matrix 1 thread per row @@ -368,7 +370,7 @@ void csr_to_coo(int *row_ind, int m, int *coo_rows, int nnz, } -template +template __global__ void csr_add_calc_row_counts_kernel( int *a_ind, int *a_indptr, T *a_val, int nnz1, int *b_ind, int *b_indptr, T *b_val, int nnz2, @@ -426,7 +428,7 @@ __global__ void csr_add_calc_row_counts_kernel( } -template +template __global__ void csr_add_kernel( int *a_ind, int *a_indptr, T *a_val, int nnz1, int *b_ind, int *b_indptr, T *b_val, int nnz2, @@ -491,10 +493,10 @@ __global__ void csr_add_kernel( * @param out_ind: output row_ind array * @param stream: cuda stream 
to use */ -template +template size_t csr_add_calc_inds( - int *a_ind, int *a_indptr, T *a_val, int nnz1, - int *b_ind, int *b_indptr, T *b_val, int nnz2, + int* const a_ind, int* const a_indptr, T* const a_val, int nnz1, + int* const b_ind, int* const b_indptr, T* const b_val, int nnz2, int m, int *out_ind, cudaStream_t stream ) { @@ -536,16 +538,16 @@ size_t csr_add_calc_inds( * @param b_val: right hand data array * @param nnz2: size of right hand index_ptr and val arrays * @param m: size of output array (number of rows in final matrix) - * @param c_ind: output row_ind arra + * @param c_ind: output row_ind array * @param c_indptr: output ind_ptr array * @param c_val: output data array * @param stream: cuda stream to use */ -template +template void csr_add_finalize( - int *a_ind, int *a_indptr, T *a_val, int nnz1, - int *b_ind, int *b_indptr, T *b_val, int nnz2, - int m, int *c_ind, int *c_indptr, T *c_val, + int* const a_ind, int* const a_indptr, T* const a_val, int nnz1, + int* const b_ind, int* const b_indptr, T* const b_val, int nnz2, + int m, int* const c_ind, int *c_indptr, T *c_val, cudaStream_t stream ) { dim3 grid(MLCommon::ceildiv(m, TPB_X), 1, 1); @@ -558,5 +560,377 @@ void csr_add_finalize( ); CUDA_CHECK(cudaPeekAtLastError()); } + +template void> +__global__ void csr_row_op_kernel(T* const row_ind, T n_rows, + T nnz, Lambda op) { + T row = blockIdx.x*TPB_X + threadIdx.x; + if(row < n_rows) { + T start_idx = row_ind[row]; + T stop_idx = row < n_rows-1 ? row_ind[row+1] : nnz; + op(row, start_idx, stop_idx); + } +} + +/** + * @brief Perform a custom row operation on a CSR matrix in batches. 
+ * @tparam T numerical type of row_ind array + * @tparam TPB_X number of threads per block to use for underlying kernel + * @tparam Lambda type of custom operation function + * @param row_ind the CSR row_ind array to perform parallel operations over + * @param total_rows total number vertices in graph + * @param batchSize size of row_ind + * @param op custom row operation functor accepting the row and beginning index. + * @param stream cuda stream to use + */ +templatevoid> +void csr_row_op(T* const row_ind, T n_rows, T nnz, + Lambda op, cudaStream_t stream) { + + dim3 grid(MLCommon::ceildiv(n_rows, TPB_X), 1, 1); + dim3 blk(TPB_X, 1, 1); + csr_row_op_kernel<<>> + (row_ind, n_rows, nnz, op); + + CUDA_CHECK(cudaPeekAtLastError()); +} + +/** + * @brief Constructs an adjacency graph CSR row_ind_ptr array from + * a row_ind array and adjacency array. + * @tparam T the numeric type of the index arrays + * @tparam TPB_X the number of threads to use per block for kernels + * @tparam Lambda function for fused operation in the adj_graph construction + * @param row_ind the input CSR row_ind array + * @param total_rows number of vertices in graph + * @param batchSize number of vertices in current batch + * @param adj an adjacency array (size batchSize * total_rows) + * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph + * @param stream cuda stream to use + */ +templatevoid> +void csr_adj_graph_batched(T* const row_ind, T total_rows, T nnz, T batchSize, + bool* const adj, T *row_ind_ptr, cudaStream_t stream, Lambda fused_op) { + csr_row_op(row_ind, batchSize, nnz, + [fused_op, adj, total_rows, row_ind_ptr, batchSize] __device__ + (T row, T start_idx, T stop_idx) { + + fused_op(row, start_idx, stop_idx); + int k = 0; + for(T i=0; ivoid> +void csr_adj_graph_batched(T* const row_ind, T total_rows, T nnz, T batchSize, + bool* const adj, T *row_ind_ptr, cudaStream_t stream) { + csr_adj_graph_batched(row_ind, total_rows, nnz, batchSize, adj, + row_ind_ptr, stream, [] 
__device__ (T row, T start_idx, T stop_idx) {}); +} + +/** + * @brief Constructs an adjacency graph CSR row_ind_ptr array from a + * a row_ind array and adjacency array. + * @tparam T the numeric type of the index arrays + * @tparam TPB_X the number of threads to use per block for kernels + * @param row_ind the input CSR row_ind array + * @param n_rows number of total vertices in graph + * @param adj an adjacency array + * @param row_ind_ptr output CSR row_ind_ptr for adjacency graph + * @param stream cuda stream to use + */ +templatevoid> +void csr_adj_graph(T* const row_ind, T total_rows, T nnz, + bool* const adj, T *row_ind_ptr, cudaStream_t stream, Lambda fused_op) { + + csr_adj_graph_batched(row_ind, total_rows, nnz, total_rows, adj, + row_ind_ptr, stream, fused_op); +} + +template +class WeakCCState { + public: + + bool *xa; + bool *fa; + bool *m; + bool owner; + + WeakCCState(T n): owner(true) { + MLCommon::allocate(xa, n, true); + MLCommon::allocate(fa, n, true); + MLCommon::allocate(m, 1, true); + } + + WeakCCState(bool *xa, bool *fa, bool *m): + owner(false), xa(xa), fa(fa), m(m) { + } + + ~WeakCCState() { + if(owner) { + try { + CUDA_CHECK(cudaFree(xa)); + CUDA_CHECK(cudaFree(fa)); + CUDA_CHECK(cudaFree(m)); + } catch(Exception &e) { + std::cout << "Exception freeing memory for WeakCCState: " << + e.what() << std::endl; + } + } + } +}; + +template +__global__ void weak_cc_label_device( + Type *labels, + Type *row_ind, Type *row_ind_ptr, Type nnz, + bool *fa, bool *xa, bool *m, + int startVertexId, int batchSize) { + int tid = threadIdx.x + blockIdx.x*TPB_X; + if(tidcj) { + ci = cj; + ci_mod = true; + } + } + if(ci_mod) { + atomicMin(labels + startVertexId + tid, ci); + xa[startVertexId + tid] = true; + m[0] = true; + } + } + } +} + + +template +__global__ void weak_cc_init_label_kernel(Type *labels, int startVertexId, int batchSize, + Type MAX_LABEL, Lambda filter_op) { + /** F1 and F2 in the paper correspond to fa and xa */ + /** Cd in paper corresponds 
to db_cluster */ + int tid = threadIdx.x + blockIdx.x*TPB_X; + if(tid +__global__ void weak_cc_init_all_kernel(Type *labels, bool *fa, bool *xa, + Type N, Type MAX_LABEL) { + int tid = threadIdx.x + blockIdx.x*TPB_X; + if(tid +void weak_cc_label_batched(Type *labels, + Type* const row_ind, Type* const row_ind_ptr, Type nnz, Type N, + WeakCCState *state, + Type startVertexId, Type batchSize, + cudaStream_t stream, Lambda filter_op) { + bool host_m; + bool *host_fa = (bool*)malloc(sizeof(bool)*N); + bool *host_xa = (bool*)malloc(sizeof(bool)*N); + + dim3 blocks(ceildiv(batchSize, TPB_X)); + dim3 threads(TPB_X); + Type MAX_LABEL = std::numeric_limits::max(); + + weak_cc_init_label_kernel<<>>(labels, + startVertexId, batchSize, MAX_LABEL, filter_op); + CUDA_CHECK(cudaPeekAtLastError()); + do { + CUDA_CHECK( cudaMemsetAsync(state->m, false, sizeof(bool), stream) ); + weak_cc_label_device<<>>( + labels, + row_ind, row_ind_ptr, nnz, + state->fa, state->xa, state->m, + startVertexId, batchSize); + CUDA_CHECK(cudaPeekAtLastError()); + + //** swapping F1 and F2 + MLCommon::updateHost(host_fa, state->fa, N, stream); + MLCommon::updateHost(host_xa, state->xa, N, stream); + MLCommon::updateDevice(state->fa, host_xa, N, stream); + MLCommon::updateDevice(state->xa, host_fa, N, stream); + + //** Updating m * + MLCommon::updateHost(&host_m, state->m, 1, stream); + CUDA_CHECK(cudaStreamSynchronize(stream)); + } while(host_m); +} + +/** + * @brief Compute weakly connected components. Note that the resulting labels + * may not be taken from a monotonically increasing set (eg. numbers may be + * skipped). The MLCommon::Array package contains a primitive `make_monotonic`, + * which will make a monotonically increasing set of labels. + * + * This implementation comes from [1] and solves component labeling problem in + * parallel on CSR-indexes based upon the vertex degree and adjacency graph. + * + * [1] Hawick, K.A et al, 2010. 
"Parallel graph component labelling with GPUs and CUDA" + * + * @tparam Type the numeric type of non-floating point elements + * @tparam TPB_X the threads to use per block when configuring the kernel + * @tparam Lambda the type of an optional filter function (int)->bool + * @param labels an array for the output labels + * @param row_ind the compressed row index of the CSR array + * @param row_ind_ptr the row index pointer of the CSR array + * @param nnz the size of row_ind_ptr array + * @param N number of vertices + * @param startVertexId the starting vertex index for the current batch + * @param batchSize number of vertices for current batch + * @param state instance of inter-batch state management + * @param stream the cuda stream to use + * @param filter_op an optional filtering function to determine which points + * should get considered for labeling. + */ +templatebool> +void weak_cc_batched(Type *labels, Type* const row_ind, Type* const row_ind_ptr, + Type nnz, Type N, Type startVertexId, Type batchSize, + WeakCCState *state, cudaStream_t stream, Lambda filter_op) { + + dim3 blocks(ceildiv(N, TPB_X)); + dim3 threads(TPB_X); + + Type MAX_LABEL = std::numeric_limits::max(); + if(startVertexId == 0) { + weak_cc_init_all_kernel<<>> + (labels, state->fa, state->xa, N, MAX_LABEL); + CUDA_CHECK(cudaPeekAtLastError()); + } + weak_cc_label_batched(labels, row_ind, row_ind_ptr, nnz, N, state, + startVertexId, batchSize, stream, filter_op); +} + +/** + * @brief Compute weakly connected components. Note that the resulting labels + * may not be taken from a monotonically increasing set (eg. numbers may be + * skipped). The MLCommon::Array package contains a primitive `make_monotonic`, + * which will make a monotonically increasing set of labels. + * + * This implementation comes from [1] and solves component labeling problem in + * parallel on CSR-indexes based upon the vertex degree and adjacency graph. + * + * [1] Hawick, K.A et al, 2010. 
"Parallel graph component labelling with GPUs and CUDA" + * + * @tparam Type the numeric type of non-floating point elements + * @tparam TPB_X the threads to use per block when configuring the kernel + * @tparam Lambda the type of an optional filter function (int)->bool + * @param labels an array for the output labels + * @param row_ind the compressed row index of the CSR array + * @param row_ind_ptr the row index pointer of the CSR array + * @param nnz the size of row_ind_ptr array + * @param N number of vertices + * @param startVertexId the starting vertex index for the current batch + * @param batchSize number of vertices for current batch + * @param state instance of inter-batch state management + * @param stream the cuda stream to use + */ +template +void weak_cc_batched(Type *labels, Type* const row_ind, Type* const row_ind_ptr, + Type nnz, Type N, Type startVertexId, Type batchSize, + WeakCCState *state, cudaStream_t stream) { + + weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, startVertexId, batchSize, + state, stream, [] __device__ (int tid) {return true;}); +} + +/** + * @brief Compute weakly connected components. Note that the resulting labels + * may not be taken from a monotonically increasing set (eg. numbers may be + * skipped). The MLCommon::Array package contains a primitive `make_monotonic`, + * which will make a monotonically increasing set of labels. + * + * This implementation comes from [1] and solves component labeling problem in + * parallel on CSR-indexes based upon the vertex degree and adjacency graph. + * + * [1] Hawick, K.A et al, 2010. 
"Parallel graph component labelling with GPUs and CUDA" + * + * @tparam Type the numeric type of non-floating point elements + * @tparam TPB_X the threads to use per block when configuring the kernel + * @tparam Lambda the type of an optional filter function (int)->bool + * @param labels an array for the output labels + * @param row_ind the compressed row index of the CSR array + * @param row_ind_ptr the row index pointer of the CSR array + * @param nnz the size of row_ind_ptr array + * @param N number of vertices + * @param stream the cuda stream to use + * @param filter_op an optional filtering function to determine which points + * should get considered for labeling. + */ +templatebool> +void weak_cc(Type *labels, Type* const row_ind, Type* const row_ind_ptr, + Type nnz, Type N, cudaStream_t stream, Lambda filter_op) { + + WeakCCState state(N); + weak_cc_batched( + labels, row_ind, row_ind_ptr, + nnz, N, 0, N, stream, + filter_op); +} + +/** + * @brief Compute weakly connected components. Note that the resulting labels + * may not be taken from a monotonically increasing set (eg. numbers may be + * skipped). The MLCommon::Array package contains a primitive `make_monotonic`, + * which will make a monotonically increasing set of labels. + * + * This implementation comes from [1] and solves component labeling problem in + * parallel on CSR-indexes based upon the vertex degree and adjacency graph. + * + * [1] Hawick, K.A et al, 2010. 
"Parallel graph component labelling with GPUs and CUDA" + * + * @tparam Type the numeric type of non-floating point elements + * @tparam TPB_X the threads to use per block when configuring the kernel + * @tparam Lambda the type of an optional filter function (int)->bool + * @param labels an array for the output labels + * @param row_ind the compressed row index of the CSR array + * @param row_ind_ptr the row index pointer of the CSR array + * @param nnz the size of row_ind_ptr array + * @param N number of vertices + * @param stream the cuda stream to use + * should get considered for labeling. + */ +template +void weak_cc(Type *labels, Type* const row_ind, Type* const row_ind_ptr, + Type nnz, Type N, cudaStream_t stream) { + + WeakCCState state(N); + weak_cc_batched( + labels, row_ind, row_ind_ptr, + nnz, N, 0, N, stream, + [](Type t){return true;}); +} + + + }; }; diff --git a/cpp/test/prims/coo.cu b/cpp/test/prims/coo.cu index b21dfb7dc2..a5757b6aa4 100644 --- a/cpp/test/prims/coo.cu +++ b/cpp/test/prims/coo.cu @@ -105,8 +105,6 @@ TEST_P(COOSymmetrize, Result) { [] __device__ (int row, int col, float val, float trans) { return val+trans; }, stream); - std::cout << out << std::endl; - ASSERT_TRUE(out.nnz == expected.nnz); ASSERT_TRUE(devArrMatch(out.rows, expected.rows, out.nnz, Compare())); ASSERT_TRUE(devArrMatch(out.cols, expected.cols, out.nnz, Compare())); @@ -201,8 +199,6 @@ TEST_P(COORemoveZeros, Result) { updateDevice(in.cols, in_h.cols, params.nnz, stream); updateDevice(in.vals, in_h.vals, params.nnz, stream); - std::cout << in << std::endl; - coo_sort(&in); int out_rows_ref_h[2] = { 0, 3 }; @@ -215,8 +211,6 @@ TEST_P(COORemoveZeros, Result) { COO out_ref(2, 5, 5); COO out; - std::cout << in << std::endl; - updateDevice(out_ref.rows, *&out_rows_ref_h, 2, stream); updateDevice(out_ref.cols, *&out_cols_ref_h, 2, stream); updateDevice(out_ref.vals, out_vals_ref_h, 2, stream); diff --git a/cpp/test/prims/csr.cu b/cpp/test/prims/csr.cu index 
c3230d3709..72f24f29e6 100644 --- a/cpp/test/prims/csr.cu +++ b/cpp/test/prims/csr.cu @@ -21,6 +21,7 @@ #include "random/rng.h" #include "test_utils.h" +#include #include namespace MLCommon { @@ -61,13 +62,8 @@ TEST_P(CSRToCOO, Result) { csr_to_coo<32>(ex_scan, 4, result, 10, stream); - std::cout << MLCommon::arr2Str(result, 10, "result", stream) << std::endl; - ASSERT_TRUE(devArrMatch(verify, result, 10, Compare(), stream)); - - std::cout << "Verified!" << std::endl; - delete ex_scan_h; delete verify_h; @@ -105,8 +101,6 @@ TEST_P(CSRRowNormalizeMax, Result) { csr_row_normalize_max<32, float>(ex_scan, in_vals, 10, 4, result, stream); - std::cout << MLCommon::arr2Str(result, 10, "result", stream) << std::endl; - ASSERT_TRUE(devArrMatch(verify, result, 10, Compare())); cudaStreamDestroy(stream); @@ -223,6 +217,140 @@ TEST_P(CSRSum, Result) { CUDA_CHECK(cudaFree(result_val)); } +typedef CSRTest CSRRowOpTest; +TEST_P(CSRRowOpTest, Result) { + + cudaStream_t stream; + cudaStreamCreate(&stream); + + int *ex_scan; + float *result, *verify; + + int ex_scan_h[4] = {0, 4, 8, 9 }; + + float verify_h[10] = { 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0 }; + + allocate(verify, 10); + allocate(ex_scan, 4); + allocate(result, 10, true); + + updateDevice(ex_scan, *&ex_scan_h, 4, stream); + updateDevice(verify, *&verify_h, 10, stream); + + csr_row_op(ex_scan, 4, 10, + [result] __device__ (int row, int start_idx, int stop_idx) { + for(int i = start_idx; i < stop_idx; i++ ) + result[i] = row; + }, stream); + + ASSERT_TRUE(devArrMatch(verify, result, 10, Compare())); + + cudaStreamDestroy(stream); + + CUDA_CHECK(cudaFree(ex_scan)); + CUDA_CHECK(cudaFree(verify)); + CUDA_CHECK(cudaFree(result)); +} + +typedef CSRTest AdjGraphTest; +TEST_P(AdjGraphTest, Result) { + + cudaStream_t stream; + cudaStreamCreate(&stream); + + int *row_ind, *result, *verify; + bool *adj; + + int row_ind_h[3] = {0, 3, 6 }; + bool adj_h[18] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + int 
verify_h[9] = { 0, 1, 2, 0, 1, 2, 0, 1, 2 }; + + allocate(row_ind, 3); + allocate(adj, 18); + allocate(result, 9, true); + allocate(verify, 9); + + updateDevice(row_ind, *&row_ind_h, 3, stream); + updateDevice(adj, *&adj_h, 18, stream); + updateDevice(verify, *&verify_h, 9, stream); + + csr_adj_graph_batched(row_ind, 6, 9, 3, adj, result, stream); + + ASSERT_TRUE(devArrMatch(verify, result, 9, Compare())); + + cudaStreamDestroy(stream); + + CUDA_CHECK(cudaFree(row_ind)); + CUDA_CHECK(cudaFree(adj)); + CUDA_CHECK(cudaFree(verify)); + CUDA_CHECK(cudaFree(result)); +} + +typedef CSRTest WeakCCTest; +TEST_P(WeakCCTest, Result) { + + cudaStream_t stream; + cudaStreamCreate(&stream); + + int *row_ind, *row_ind_ptr, *result, *verify; + + int row_ind_h1[3] = {0, 3, 6 }; + int row_ind_ptr_h1[9] = { 0, 1, 2, 0, 1, 2, 0, 1, 2 }; + int verify_h1[6] = { 1, 1, 1, 2147483647, 2147483647, 2147483647 }; + + int row_ind_h2[3] = {0, 2, 4 }; + int row_ind_ptr_h2[5] = { 3, 4, 3, 4, 5 }; + int verify_h2[6] = { 1, 1, 1, 5, 5, 5 }; + + allocate(row_ind, 3); + allocate(row_ind_ptr, 9); + allocate(result, 9, true); + allocate(verify, 9); + + WeakCCState state(6); + + /** + * Run batch #1 + */ + updateDevice(row_ind, *&row_ind_h1, 3, stream); + updateDevice(row_ind_ptr, *&row_ind_ptr_h1, 9, stream); + updateDevice(verify, *&verify_h1, 6, stream); + + weak_cc_batched(result, row_ind, row_ind_ptr, 9, 6, 0, 3, &state, stream); + + ASSERT_TRUE(devArrMatch(verify, result, 6, Compare())); + + /** + * Run batch #2 + */ + updateDevice(row_ind, *&row_ind_h2, 3, stream); + updateDevice(row_ind_ptr, *&row_ind_ptr_h2, 5, stream); + updateDevice(verify, *&verify_h2, 6, stream); + + weak_cc_batched(result, row_ind, row_ind_ptr, 5, 6, 4, 3, &state, stream); + + ASSERT_TRUE(devArrMatch(verify, result, 6, Compare())); + + cudaStreamDestroy(stream); + + CUDA_CHECK(cudaFree(row_ind)); + CUDA_CHECK(cudaFree(row_ind_ptr)); + CUDA_CHECK(cudaFree(verify)); + CUDA_CHECK(cudaFree(result)); +} + 
+INSTANTIATE_TEST_CASE_P(CSRTests, WeakCCTest, + ::testing::ValuesIn(inputsf)); + + +INSTANTIATE_TEST_CASE_P(CSRTests, AdjGraphTest, + ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(CSRTests, CSRRowOpTest, + ::testing::ValuesIn(inputsf)); + + INSTANTIATE_TEST_CASE_P(CSRTests, CSRToCOO, ::testing::ValuesIn(inputsf)); diff --git a/cpp/test/prims/hinge.cu b/cpp/test/prims/hinge.cu index 3ebedcbc0f..384bb0ed76 100644 --- a/cpp/test/prims/hinge.cu +++ b/cpp/test/prims/hinge.cu @@ -32,6 +32,8 @@ protected: cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); + allocator.reset(new defaultDeviceAllocator); + allocate(in, len); allocate(out, 1); allocate(out_lasso, 1); @@ -90,38 +92,38 @@ protected: T l1_ratio = 0.5; hingeLoss(in, params.n_rows, params.n_cols, labels, coef, out, penalty::NONE, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); hingeLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_grad, penalty::NONE, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); hingeLoss(in, params.n_rows, params.n_cols, labels, coef, out_lasso, penalty::L1, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); hingeLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_lasso_grad, penalty::L1, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); hingeLoss(in, params.n_rows, params.n_cols, labels, coef, out_ridge, penalty::L2, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); hingeLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_ridge_grad, penalty::L2, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, 
h_in, len, stream); hingeLoss(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); hingeLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet_grad, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); @@ -159,6 +161,7 @@ protected: T *out_ref, *out_lasso_ref, *out_ridge_ref, *out_elasticnet_ref; T *out_grad, *out_lasso_grad, *out_ridge_grad, *out_elasticnet_grad; T *out_grad_ref, *out_lasso_grad_ref, *out_ridge_grad_ref, *out_elasticnet_grad_ref; + std::shared_ptr allocator; }; const std::vector > inputsf = { diff --git a/cpp/test/prims/linearReg.cu b/cpp/test/prims/linearReg.cu index 558a8b1fee..0254ba17f1 100644 --- a/cpp/test/prims/linearReg.cu +++ b/cpp/test/prims/linearReg.cu @@ -32,6 +32,8 @@ protected: cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); + allocator.reset(new defaultDeviceAllocator); + allocate(in, len); allocate(out, 1); allocate(out_lasso, 1); @@ -90,38 +92,38 @@ protected: T l1_ratio = 0.5; linearRegLoss(in, params.n_rows, params.n_cols, labels, coef, out, penalty::NONE, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); linearRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_grad, penalty::NONE, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); linearRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_lasso, penalty::L1, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); linearRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_lasso_grad, penalty::L1, - alpha, l1_ratio, cublas_handle, stream); + alpha, 
l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); linearRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_ridge, penalty::L2, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); linearRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_ridge_grad, penalty::L2, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); linearRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); linearRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet_grad, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); @@ -159,6 +161,7 @@ protected: T *out_ref, *out_lasso_ref, *out_ridge_ref, *out_elasticnet_ref; T *out_grad, *out_lasso_grad, *out_ridge_grad, *out_elasticnet_grad; T *out_grad_ref, *out_lasso_grad_ref, *out_ridge_grad_ref, *out_elasticnet_grad_ref; + std::shared_ptr allocator; }; const std::vector > inputsf = { diff --git a/cpp/test/prims/logisticReg.cu b/cpp/test/prims/logisticReg.cu index d8c31b9ab7..2a61cc9d1e 100644 --- a/cpp/test/prims/logisticReg.cu +++ b/cpp/test/prims/logisticReg.cu @@ -31,6 +31,8 @@ protected: cudaStream_t stream; CUDA_CHECK(cudaStreamCreate(&stream)); + allocator.reset(new defaultDeviceAllocator); + allocate(in, len); allocate(out, 1); allocate(out_lasso, 1); @@ -89,38 +91,38 @@ protected: T l1_ratio = 0.5; logisticRegLoss(in, params.n_rows, params.n_cols, labels, coef, out, penalty::NONE, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); logisticRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_grad, penalty::NONE, - alpha, 
l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); logisticRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_lasso, penalty::L1, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); logisticRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_lasso_grad, penalty::L1, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); logisticRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_ridge, penalty::L2, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); logisticRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_ridge_grad, penalty::L2, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); logisticRegLoss(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); logisticRegLossGrads(in, params.n_rows, params.n_cols, labels, coef, out_elasticnet_grad, penalty::ELASTICNET, - alpha, l1_ratio, cublas_handle, stream); + alpha, l1_ratio, cublas_handle, allocator, stream); updateDevice(in, h_in, len, stream); @@ -158,6 +160,7 @@ protected: T *out_ref, *out_lasso_ref, *out_ridge_ref, *out_elasticnet_ref; T *out_grad, *out_lasso_grad, *out_ridge_grad, *out_elasticnet_grad; T *out_grad_ref, *out_lasso_grad_ref, *out_ridge_grad_ref, *out_elasticnet_grad_ref; + std::shared_ptr allocator; }; const std::vector > inputsf = { From 6801d92e6d14ca499267396e5e96c3df91a04522 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Wed, 15 May 2019 22:03:36 -0500 Subject: [PATCH 145/156] FIX Trustworthines pep8 fixes --- python/cuml/metrics/trustworthiness.pyx | 11 
++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/python/cuml/metrics/trustworthiness.pyx b/python/cuml/metrics/trustworthiness.pyx index 0ed0114e78..cd864fa9f7 100644 --- a/python/cuml/metrics/trustworthiness.pyx +++ b/python/cuml/metrics/trustworthiness.pyx @@ -97,8 +97,8 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, d_X = cuda.to_device(X) d_X_embedded = cuda.to_device(X_embedded) - cdef uintptr_t d_X_ptr = get_ctype_ptr(d_X) - cdef uintptr_t d_X_embedded_ptr = get_ctype_ptr(d_X_embedded) + cdef uintptr_t d_X_ptr = _get_dev_array_ptr(d_X) + cdef uintptr_t d_X_embedded_ptr = _get_dev_array_ptr(d_X_embedded) cdef cumlHandle* handle_ = 0 if handle is None: @@ -122,13 +122,6 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, return res -def get_ctype_ptr(obj): - # The manner to access the pointers in the gdf's might change, so - # encapsulating access in the following 3 methods. They might also be - # part of future gdf versions. - return obj.device_ctypes_pointer.value - - def to_single_precision(X): if isinstance(X, cudf.DataFrame): new_cols = [(col, X._cols[col].astype(np.float32)) for col in X._cols] From d31dc7fe27694e8188ee76b8d605770e769caa4d Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 16 May 2019 07:45:12 -0500 Subject: [PATCH 146/156] FI trust method name fix --- python/cuml/metrics/trustworthiness.pyx | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/cuml/metrics/trustworthiness.pyx b/python/cuml/metrics/trustworthiness.pyx index cd864fa9f7..bfb9a99fbb 100644 --- a/python/cuml/metrics/trustworthiness.pyx +++ b/python/cuml/metrics/trustworthiness.pyx @@ -42,6 +42,13 @@ cdef extern from "metrics/trustworthiness_c.h" namespace "ML::Metrics": int n_neighbors) +def _get_array_ptr(self, obj): + """ + Get ctype pointer of a numba style device array + """ + return obj.device_ctypes_pointer.value + + def trustworthiness(X, X_embedded, handle=None, 
n_neighbors=5, metric='euclidean', should_downcast=True): """ @@ -97,8 +104,8 @@ def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, d_X = cuda.to_device(X) d_X_embedded = cuda.to_device(X_embedded) - cdef uintptr_t d_X_ptr = _get_dev_array_ptr(d_X) - cdef uintptr_t d_X_embedded_ptr = _get_dev_array_ptr(d_X_embedded) + cdef uintptr_t d_X_ptr = _get_array_ptr(d_X) + cdef uintptr_t d_X_embedded_ptr = _get_array_ptr(d_X_embedded) cdef cumlHandle* handle_ = 0 if handle is None: From 06d8995bb598d7fd4390b20ea265f20bd625827c Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 16 May 2019 07:56:39 -0500 Subject: [PATCH 147/156] FI trust pep8 fix --- python/cuml/metrics/trustworthiness.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cuml/metrics/trustworthiness.pyx b/python/cuml/metrics/trustworthiness.pyx index bfb9a99fbb..2f77fe28b1 100644 --- a/python/cuml/metrics/trustworthiness.pyx +++ b/python/cuml/metrics/trustworthiness.pyx @@ -43,10 +43,10 @@ cdef extern from "metrics/trustworthiness_c.h" namespace "ML::Metrics": def _get_array_ptr(self, obj): - """ - Get ctype pointer of a numba style device array - """ - return obj.device_ctypes_pointer.value + """ + Get ctype pointer of a numba style device array + """ + return obj.device_ctypes_pointer.value def trustworthiness(X, X_embedded, handle=None, n_neighbors=5, From a966581b059428a28e4efa9bb766f0f5a10ed4e0 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 16 May 2019 09:03:59 -0500 Subject: [PATCH 148/156] FIX rproj file rename to be more clear --- python/cuml/random_projection/__init__.py | 6 +++--- .../{random_projextion.pyx => random_projection.pyx} | 0 .../cuml/test/{test_rproj.py => test_random_projection.py} | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) rename python/cuml/random_projection/{random_projextion.pyx => random_projection.pyx} (100%) rename python/cuml/test/{test_rproj.py => test_random_projection.py} (96%) diff --git 
a/python/cuml/random_projection/__init__.py b/python/cuml/random_projection/__init__.py index 7643ca8da6..989d752ddb 100644 --- a/python/cuml/random_projection/__init__.py +++ b/python/cuml/random_projection/__init__.py @@ -14,6 +14,6 @@ # limitations under the License. # -from cuml.random_projection.rproj import GaussianRandomProjection -from cuml.random_projection.rproj import SparseRandomProjection -from cuml.random_projection.rproj import johnson_lindenstrauss_min_dim \ No newline at end of file +from cuml.random_projection.random_projection import GaussianRandomProjection +from cuml.random_projection.random_projection import SparseRandomProjection +from cuml.random_projection.random_projection import johnson_lindenstrauss_min_dim diff --git a/python/cuml/random_projection/random_projextion.pyx b/python/cuml/random_projection/random_projection.pyx similarity index 100% rename from python/cuml/random_projection/random_projextion.pyx rename to python/cuml/random_projection/random_projection.pyx diff --git a/python/cuml/test/test_rproj.py b/python/cuml/test/test_random_projection.py similarity index 96% rename from python/cuml/test/test_rproj.py rename to python/cuml/test/test_random_projection.py index cf438947b7..08cb3bc94f 100644 --- a/python/cuml/test/test_rproj.py +++ b/python/cuml/test/test_random_projection.py @@ -30,7 +30,7 @@ @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('input_type', ['dataframe', 'ndarray']) @pytest.mark.parametrize('method', ['gaussian', 'sparse']) -def test_rproj_fit(datatype, input_type, method): +def test_random_projection_fit(datatype, input_type, method): # dataset generation data, target = make_blobs(n_samples=800, centers=400, n_features=3000) @@ -59,7 +59,7 @@ def test_rproj_fit(datatype, input_type, method): @pytest.mark.parametrize('datatype', [np.float32, np.float64]) @pytest.mark.parametrize('input_type', ['dataframe', 'ndarray']) @pytest.mark.parametrize('method', ['gaussian', 
'sparse']) -def test_rproj_fit_transform(datatype, input_type, method): +def test_random_projection_fit_transform(datatype, input_type, method): eps = 0.2 # dataset generation From dcd2290a31c0bd12de657ce238a23aa0c33096a1 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 16 May 2019 09:28:17 -0500 Subject: [PATCH 149/156] FIX Correct name in __init__ --- python/cuml/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/__init__.py b/python/cuml/__init__.py index b49643bbff..5c9792a4fd 100644 --- a/python/cuml/__init__.py +++ b/python/cuml/__init__.py @@ -42,7 +42,7 @@ from cuml.manifold.umap import UMAP -from cuml.random_projection.rproj import GaussianRandomProjection, SparseRandomProjection, johnson_lindenstrauss_min_dim +from cuml.random_projection.random_projection import GaussianRandomProjection, SparseRandomProjection, johnson_lindenstrauss_min_dim from ._version import get_versions __version__ = get_versions()['version'] From bcc4e4bbf72bc14ed2b6c9e0a17032d6b6fd8393 Mon Sep 17 00:00:00 2001 From: Dante Gama Dessavre Date: Thu, 16 May 2019 10:42:07 -0500 Subject: [PATCH 150/156] FIX function name corrections --- python/cuml/metrics/trustworthiness.pyx | 2 +- python/cuml/solvers/cd.pyx | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cuml/metrics/trustworthiness.pyx b/python/cuml/metrics/trustworthiness.pyx index 2f77fe28b1..a2f401540a 100644 --- a/python/cuml/metrics/trustworthiness.pyx +++ b/python/cuml/metrics/trustworthiness.pyx @@ -42,7 +42,7 @@ cdef extern from "metrics/trustworthiness_c.h" namespace "ML::Metrics": int n_neighbors) -def _get_array_ptr(self, obj): +def _get_array_ptr(obj): """ Get ctype pointer of a numba style device array """ diff --git a/python/cuml/solvers/cd.pyx b/python/cuml/solvers/cd.pyx index c3ee51cebd..8cc9b3f370 100644 --- a/python/cuml/solvers/cd.pyx +++ b/python/cuml/solvers/cd.pyx @@ -212,14 +212,14 @@ class CD(Base): msg = "X matrix must 
be a cuDF dataframe or Numpy ndarray" raise TypeError(msg) - X_ptr = self._get_ctype_ptr(X_m) + X_ptr = self._get_dev_array_ptr(X_m) cdef uintptr_t y_ptr if (isinstance(y, cudf.Series)): - y_ptr = self._get_column_ptr(y) + y_ptr = self._get_cudf_column_ptr(y) elif (isinstance(y, np.ndarray)): y_m = cuda.to_device(y) - y_ptr = self._get_ctype_ptr(y_m) + y_ptr = self._get_dev_array_ptr(y_m) else: msg = "y vector must be a cuDF series or Numpy ndarray" raise TypeError(msg) @@ -305,11 +305,11 @@ class CD(Base): msg = "X matrix format not supported" raise TypeError(msg) - X_ptr = self._get_ctype_ptr(X_m) + X_ptr = self._get_dev_array_ptr(X_m) - cdef uintptr_t coef_ptr = self._get_column_ptr(self.coef_) + cdef uintptr_t coef_ptr = self._get_cudf_column_ptr(self.coef_) preds = cudf.Series(np.zeros(n_rows, dtype=pred_datatype)) - cdef uintptr_t preds_ptr = self._get_column_ptr(preds) + cdef uintptr_t preds_ptr = self._get_cudf_column_ptr(preds) cdef cumlHandle* handle_ = self.handle.getHandle() if pred_datatype.type == np.float32: From a0ce7392f6896f8b581ef3a1ef401fb8460457d6 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Tue, 21 May 2019 08:50:55 -0700 Subject: [PATCH 151/156] removed the deprecated vector_broadcast in favor of more updated matVecOp prim --- cpp/src_prims/linalg/vector_broadcast.h | 100 ------------------ cpp/test/old/vector_broadcast.cu | 133 ------------------------ 2 files changed, 233 deletions(-) delete mode 100644 cpp/src_prims/linalg/vector_broadcast.h delete mode 100644 cpp/test/old/vector_broadcast.cu diff --git a/cpp/src_prims/linalg/vector_broadcast.h b/cpp/src_prims/linalg/vector_broadcast.h deleted file mode 100644 index ed2fd74c5f..0000000000 --- a/cpp/src_prims/linalg/vector_broadcast.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2018, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "vectorized.h" -#include "cuda_utils.h" - - -namespace MLCommon { -namespace Broadcast { - - -///@todo: investigate if using shared mem for vector would help with perf -template -__global__ void vectorBcastKernel(math_t* out, const math_t* matrix, - const math_t* vector, int rows, int cols, - Lambda op) { - typedef TxN_t VecType; - VecType mat, vec; - int len = rows * cols; - int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio; - int colIdx = idx % cols; - if(idx >= len) - return; - mat.load(matrix, idx); - vec.load(vector, colIdx); - #pragma unroll - for(int i=0;i -void vectorBcastImpl(math_t* out, const math_t* matrix, const math_t* vector, - int rows, int cols, Lambda op, cudaStream_t stream) { - int len = rows * cols; - const int nblks = ceildiv(veclen_? len/veclen_ : len, TPB); - vectorBcastKernel<<>> - (out, matrix, vector, rows, cols, op); - CUDA_CHECK(cudaPeekAtLastError()); -} - -/** - * @brief perform element-wise binary operation between a matrix and a vector, - * with the vector being broadcasted across the other dimension of the matrix. 
- * @tparam math_t data-type upon which the math operation will be performed - * @tparam Lambda the device-lambda performing the actual operation - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output matrix - * @param matrix the input matrix (dimension = rows x cols) - * @param vector the input vector (length = cols) - * @param rows number of rows in the input/output matrix - * @param cols number of cols in the input/output matrix - * @param op the device-lambda - * @note If you want to work on a column-major storage with each column - * wanting to work on the input vector, then just swap rows and cols while - * calling this method. - */ -template -void vectorBroadcast(math_t* out, const math_t* matrix, const math_t* vector, - int rows, int cols, Lambda op, cudaStream_t stream) { - // need to use 'cols' here since vector access is based on this! - size_t bytes = cols * sizeof(math_t); - if(16/sizeof(math_t) && bytes % 16 == 0) { - vectorBcastImpl - (out, matrix, vector, rows, cols, op, stream); - } else if(8/sizeof(math_t) && bytes % 8 == 0) { - vectorBcastImpl - (out, matrix, vector, rows, cols, op, stream); - } else if(4/sizeof(math_t) && bytes % 4 == 0) { - vectorBcastImpl - (out, matrix, vector, rows, cols, op, stream); - } else if(2/sizeof(math_t) && bytes % 2 == 0) { - vectorBcastImpl - (out, matrix, vector, rows, cols, op, stream); - } else if(1/sizeof(math_t)) { - vectorBcastImpl - (out, matrix, vector, rows, cols, op, stream); - } else { - vectorBcastImpl - (out, matrix, vector, rows, cols, op, stream); - } -} - -}; // end namespace Broadcast -}; // end namespace MLCommon diff --git a/cpp/test/old/vector_broadcast.cu b/cpp/test/old/vector_broadcast.cu deleted file mode 100644 index 9ce7dfb144..0000000000 --- a/cpp/test/old/vector_broadcast.cu +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2018, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include "linalg/vector_broadcast.h" -#include "random/rng.h" -#include "test_utils.h" - - -namespace MLCommon { -namespace Broadcast { - -template -__global__ void naiveAddKernel(Type* out, const Type* mat, const Type* vec, - int rows, int cols) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int len = rows * cols; - int col = idx % cols; - if(idx < len) { - out[idx] = mat[idx] + vec[col]; - } -} - -template -void naiveAdd(Type* out, const Type* mat, const Type* vec, int rows, int cols) { - static const int TPB = 64; - int len = rows * cols; - int nblks = ceildiv(len, TPB); - naiveAddKernel<<>>(out, mat, vec, rows, cols); - CUDA_CHECK(cudaPeekAtLastError()); -} - - -template -struct VecBcastInputs { - T tolerance; - int rows, cols; - unsigned long long int seed; -}; - -template -::std::ostream& operator<<(::std::ostream& os, const VecBcastInputs& dims) { - return os; -} - -// Or else, we get the following compilation error -// for an extended __device__ lambda cannot have private or protected access within its class -template -void vectorBroadcastLaunch(T* out, const T* mat, const T* vec, int rows, int cols) { - vectorBroadcast(out, mat, vec, rows, cols, - [] __device__ (T a, T b) { - return a + b; - }); -} - -template -class VecBcastTest: public ::testing::TestWithParam > { -protected: - void SetUp() override { - params = ::testing::TestWithParam>::GetParam(); - Random::Rng 
r(params.seed); - int rows = params.rows; - int cols = params.cols; - int len = rows * cols; - allocate(mat, len); - allocate(vec, cols); - allocate(out_ref, len); - allocate(out, len); - r.uniform(mat, len, T(-1.0), T(1.0)); - r.uniform(vec, cols, T(-1.0), T(1.0)); - naiveAdd(out_ref, mat, vec, rows, cols); - vectorBroadcastLaunch(out, mat, vec, rows, cols); - } - - void TearDown() override { - CUDA_CHECK(cudaFree(mat)); - CUDA_CHECK(cudaFree(vec)); - CUDA_CHECK(cudaFree(out_ref)); - CUDA_CHECK(cudaFree(out)); - } - -protected: - VecBcastInputs params; - T *mat, *vec, *out_ref, *out; -}; - -const std::vector > inputsf = { - {0.000001f, 1024, 1024, 1234ULL}, - {0.000001f, 1024, 512, 1234ULL}, - {0.000001f, 1024, 256, 1234ULL}, - {0.000001f, 1024, 128, 1234ULL}, - {0.000001f, 1024, 64, 1234ULL} -}; - -const std::vector > inputsd = { - {0.000001, 1024, 1024, 1234ULL}, - {0.000001, 1024, 512, 1234ULL}, - {0.000001, 1024, 256, 1234ULL}, - {0.000001, 1024, 128, 1234ULL}, - {0.000001, 1024, 64, 1234ULL} -}; - -typedef VecBcastTest VecBcastTestF; -TEST_P(VecBcastTestF, Result) { - ASSERT_TRUE(devArrMatch(out_ref, out, params.rows*params.cols, - CompareApprox(params.tolerance))); -} - -typedef VecBcastTest VecBcastTestD; -TEST_P(VecBcastTestD, Result){ - ASSERT_TRUE(devArrMatch(out_ref, out, params.rows*params.cols, - CompareApprox(params.tolerance))); -} - -INSTANTIATE_TEST_CASE_P(VecBcastTests, VecBcastTestF, ::testing::ValuesIn(inputsf)); - -INSTANTIATE_TEST_CASE_P(VecBcastTests, VecBcastTestD, ::testing::ValuesIn(inputsd)); - -} // end namespace Broadcast -} // end namespace MLCommon From 15057d5016d090cacb506018e94e7d1a183656e4 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Tue, 21 May 2019 09:20:33 -0700 Subject: [PATCH 152/156] Added unit-tests for add/subtract on a device scalar prim --- cpp/src_prims/linalg/add.h | 2 +- cpp/src_prims/linalg/subtract.h | 2 +- cpp/test/CMakeLists.txt | 1 + cpp/test/prims/add_sub_dev_scalar.cu | 133 +++++++++++++++++++++++++++ 4 
files changed, 136 insertions(+), 2 deletions(-) create mode 100644 cpp/test/prims/add_sub_dev_scalar.cu diff --git a/cpp/src_prims/linalg/add.h b/cpp/src_prims/linalg/add.h index 4b6f4858e1..002d4ca320 100644 --- a/cpp/src_prims/linalg/add.h +++ b/cpp/src_prims/linalg/add.h @@ -79,7 +79,7 @@ __global__ void add_dev_scalar_kernel(math_t* outDev, const math_t* inDev, */ template void addDevScalar(math_t* outDev, const math_t* inDev, const math_t* singleScalarDev, - int len, cudaStream_t stream) + IdxType len, cudaStream_t stream) { // TODO: block dimension has not been tuned dim3 block (256); diff --git a/cpp/src_prims/linalg/subtract.h b/cpp/src_prims/linalg/subtract.h index dbf133eccb..f8963b1074 100644 --- a/cpp/src_prims/linalg/subtract.h +++ b/cpp/src_prims/linalg/subtract.h @@ -82,7 +82,7 @@ __global__ void subtract_dev_scalar_kernel(math_t* outDev, const math_t* inDev, */ template void subtractDevScalar(math_t* outDev, const math_t* inDev, const math_t* singleScalarDev, - IdxType len, cudaStream_t stream) + IdxType len, cudaStream_t stream) { // Just for the note - there is no way to express such operation with cuBLAS in effective way // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index aca8c2a76f..60a467f945 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -116,6 +116,7 @@ if(BUILD_PRIMS_TESTS) # (please keep the filenames in alphabetical order) add_executable(prims prims/add.cu + prims/add_sub_dev_scalar.cu prims/array.cu prims/binary_op.cu prims/ternary_op.cu diff --git a/cpp/test/prims/add_sub_dev_scalar.cu b/cpp/test/prims/add_sub_dev_scalar.cu new file mode 100644 index 0000000000..f4b1359a63 --- /dev/null +++ b/cpp/test/prims/add_sub_dev_scalar.cu @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2018, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include "linalg/add.h" +#include "linalg/subtract.h" +#include "linalg/unary_op.h" +#include "random/rng.h" +#include "test_utils.h" + + +namespace MLCommon { +namespace LinAlg { + +template +struct DevScalarInputs { + T tolerance; + IdxType len; + T scalar; + bool add; + unsigned long long int seed; +}; + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void unaryOpLaunch(T *out, const T *in, T scalar, IdxType len, bool add, + cudaStream_t stream) { + unaryOp(out, in, len, + [scalar, add] __device__(T in) { + return add? 
in + scalar : in - scalar; + }, stream); +} + + +template +class DevScalarTest : public ::testing::TestWithParam> { +protected: + void SetUp() override { + params = ::testing::TestWithParam>::GetParam(); + Random::Rng r(params.seed); + cudaStream_t stream; + CUDA_CHECK(cudaStreamCreate(&stream)); + + auto len = params.len; + + allocate(in, len); + allocate(out_ref, len); + allocate(out, len); + allocate(scalar, (size_t)1); + updateDevice(scalar, ¶ms.scalar, 1, stream); + r.uniform(in, len, T(-1.0), T(1.0), stream); + unaryOpLaunch(out_ref, in, params.scalar, len, params.add, stream); + if(params.add) { + addDevScalar(out, in, scalar, len, stream); + } else { + subtractDevScalar(out, in, scalar, len, stream); + } + CUDA_CHECK(cudaStreamDestroy(stream)); + } + + void TearDown() override { + CUDA_CHECK(cudaFree(in)); + CUDA_CHECK(cudaFree(out_ref)); + CUDA_CHECK(cudaFree(out)); + CUDA_CHECK(cudaFree(scalar)); + } + +protected: + DevScalarInputs params; + T *in, *out_ref, *out, *scalar; +}; + +const std::vector> inputsf_i32 = { + {0.000001f, 1024 * 1024, 2.f, true, 1234ULL}, + {0.000001f, 1024 * 1024, 2.f, false, 1234ULL}}; +typedef DevScalarTest DevScalarTestF_i32; +TEST_P(DevScalarTestF_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DevScalarTests, DevScalarTestF_i32, + ::testing::ValuesIn(inputsf_i32)); + +const std::vector> inputsf_i64 = { + {0.000001f, 1024 * 1024, 2.f, true, 1234ULL}, + {0.000001f, 1024 * 1024, 2.f, false, 1234ULL}}; +typedef DevScalarTest DevScalarTestF_i64; +TEST_P(DevScalarTestF_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DevScalarTests, DevScalarTestF_i64, + ::testing::ValuesIn(inputsf_i64)); + +const std::vector> inputsd_i32 = { + {0.00000001, 1024 * 1024, 2.0, true, 1234ULL}, + {0.00000001, 1024 * 1024, 2.0, false, 1234ULL}}; +typedef DevScalarTest DevScalarTestD_i32; 
+TEST_P(DevScalarTestD_i32, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DevScalarTests, DevScalarTestD_i32, + ::testing::ValuesIn(inputsd_i32)); + +const std::vector> inputsd_i64 = { + {0.00000001, 1024 * 1024, 2.0, true, 1234ULL}, + {0.00000001, 1024 * 1024, 2.0, false, 1234ULL}}; +typedef DevScalarTest DevScalarTestD_i64; +TEST_P(DevScalarTestD_i64, Result) { + ASSERT_TRUE(devArrMatch(out_ref, out, params.len, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_CASE_P(DevScalarTests, DevScalarTestD_i64, + ::testing::ValuesIn(inputsd_i64)); + +} // end namespace LinAlg +} // end namespace MLCommon From e856daa3fa1e11af9a262702554283dfe6d684ae Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Tue, 21 May 2019 09:21:26 -0700 Subject: [PATCH 153/156] Removed the deprecated prims unit-tests folder (Issue #598) --- cpp/test/old/add_and_sub_dev_scalar.cu | 139 ------------------------- 1 file changed, 139 deletions(-) delete mode 100644 cpp/test/old/add_and_sub_dev_scalar.cu diff --git a/cpp/test/old/add_and_sub_dev_scalar.cu b/cpp/test/old/add_and_sub_dev_scalar.cu deleted file mode 100644 index 69ca5027be..0000000000 --- a/cpp/test/old/add_and_sub_dev_scalar.cu +++ /dev/null @@ -1,139 +0,0 @@ -#include -#include "linalg/add.h" -#include "linalg/subtract.h" -#include "cuda_utils.h" - -#include -#include -#include - -#include -#include -#include - -namespace -{ - template - class TestBuffer - { - public: - TestBuffer(size_t arrayLength) - : devContainterRaw(nullptr) - , hostContainter(arrayLength, T()) - { - MLCommon::allocate(devContainterRaw, arrayLength); - EXPECT_TRUE(devContainterRaw != nullptr); - } - - ~TestBuffer() { - EXPECT_TRUE(cudaFree(devContainterRaw) == cudaSuccess); - } - - T* getDevPtr() { - return devContainterRaw; - } - - T* getHostPtr() { - if (hostContainter.empty()) - return nullptr; - else - return &hostContainter[0]; - } - - T hostValueAt(size_t index) const 
{ - if (index >= hostContainter.size()) - { - assert(!"INDEX IS OT OF ACCESSABLE RANGE"); - return T(); - } - return hostContainter.at(index); - } - - size_t size() const { - return hostContainter.size(); - } - - void fillArithmeticSeq(const T& start = T(1), const T& step = T(1)) - { - for (size_t i = 0; i < hostContainter.size(); ++i) - hostContainter[i] = start + step*i; - copy2Device(); - } - - void copy2Device() { - EXPECT_TRUE(cudaMemcpy(getDevPtr(), getHostPtr(), size() * sizeof(T), cudaMemcpyHostToDevice) == cudaSuccess); - } - - void copy2Host() { - EXPECT_TRUE(cudaMemcpy(getHostPtr(), getDevPtr(), size() * sizeof(T), cudaMemcpyDeviceToHost) == cudaSuccess); - } - - private: - T* devContainterRaw; - std::vector hostContainter; - - private: - TestBuffer(const TestBuffer&) = delete; - TestBuffer operator = (const TestBuffer&) = delete; - }; -} - -template -void test_add(size_t arraLength) -{ - TestBuffer in(arraLength); - TestBuffer extraScalar(1); - TestBuffer out(arraLength); - in.fillArithmeticSeq(); - extraScalar.fillArithmeticSeq(); - out.fillArithmeticSeq(); - - MLCommon::LinAlg::addDevScalar(out.getDevPtr(), in.getDevPtr(), extraScalar.getDevPtr(), in.size()); - out.copy2Host(); - - T maxError = T(); - for (int i = 0; i < arraLength; i++) - { - maxError = std::max(maxError, - abs( (in.hostValueAt(i) + extraScalar.hostValueAt(0)) - out.hostValueAt(i) ) - ); - } - EXPECT_TRUE(maxError < std::numeric_limits::epsilon()) << "Max deviation in test_add is greater then " << std::numeric_limits::epsilon(); -} - -template -void test_subtract(size_t arraLength) -{ - TestBuffer in(arraLength); - TestBuffer extraScalar(1); - TestBuffer out(arraLength); - in.fillArithmeticSeq(); - extraScalar.fillArithmeticSeq(); - out.fillArithmeticSeq(); - - MLCommon::LinAlg::subtractDevScalar(out.getDevPtr(), in.getDevPtr(), extraScalar.getDevPtr(), in.size()); - out.copy2Host(); - - T maxError = T(); - for (int i = 0; i < arraLength; i++) - maxError = std::max(maxError, - abs( 
(in.hostValueAt(i) - extraScalar.hostValueAt(0)) - out.hostValueAt(i) ) - ); - EXPECT_TRUE(maxError < std::numeric_limits::epsilon()) << "Max deviation test_subtract is greater then " << std::numeric_limits::epsilon(); -} - -TEST(AddAndSubDevScalarTest, add_test) -{ - test_add(1); - test_add(100); - test_add(1); - test_add(100); -} - -TEST(AddAndSubDevScalarTest, subtract_test) -{ - test_subtract(1); - test_subtract(100); - test_subtract(1); - test_subtract(100); -} From a784054cccee217e88a62cd3ccc1c47d3e661058 Mon Sep 17 00:00:00 2001 From: Thejaswi Rao Date: Tue, 21 May 2019 09:24:08 -0700 Subject: [PATCH 154/156] updated changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ee2c60ec9..5be70f8a55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ ## Bug Fixes - PR #584: Added missing virtual destructor to deviceAllocator and hostAllocator +- PR #620: C++: Removed old unit-test files in ml-prims # cuML 0.7.0 (10 May 2019) From b4a73a5f87311069367efe6f68b84597a77a3458 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Tue, 21 May 2019 11:52:36 -0700 Subject: [PATCH 155/156] Updated to use 0.8 dependencies --- BUILD.md | 5 +---- ci/cpu/build.sh | 6 +++--- ci/docs/build.sh | 4 ++-- ci/gpu/build.sh | 6 +++--- conda/recipes/cuml-cuda10/meta.yaml | 4 ++-- conda/recipes/cuml/meta.yaml | 4 ++-- conda/recipes/libcuml/meta.yaml | 4 ++-- 7 files changed, 15 insertions(+), 18 deletions(-) diff --git a/BUILD.md b/BUILD.md index bdc9ec7c08..25607b9bfd 100644 --- a/BUILD.md +++ b/BUILD.md @@ -4,7 +4,7 @@ To install cuML from source, ensure the dependencies are met: -1. [cuDF](https://github.com/rapidsai/cudf) (>=0.7) +1. [cuDF](https://github.com/rapidsai/cudf) (>=0.8) 2. zlib 3. cmake (>= 3.12.4) 4. 
CUDA (>= 9.2) @@ -123,6 +123,3 @@ cuML's cmake has the following configurable flags available: | GPU_ARCHS | List of GPU architectures, semicolon-separated | 60;70;75 | List of GPU architectures that all artifacts are compiled for. | | KERNEL_INFO | [ON, OFF] | OFF | Enable/disable kernel resource usage info in nvcc. | | LINE_INFO | [ON, OFF] | OFF | Enable/disable lineinfo in nvcc. | - - - diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 024b0d66bc..92c9341142 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -15,9 +15,9 @@ export PATH=/conda/bin:/usr/local/cuda/bin:$PATH export PARALLEL_LEVEL=4 # Set versions of packages needed to be grabbed -export CUDF_VERSION=0.7.* -export NVSTRINGS_VERSION=0.7.* -export RMM_VERSION=0.7.* +export CUDF_VERSION=0.8.* +export NVSTRINGS_VERSION=0.8.* +export RMM_VERSION=0.8.* # Set home to the job's workspace export HOME=$WORKSPACE diff --git a/ci/docs/build.sh b/ci/docs/build.sh index 0806c1263b..746ca9b913 100644 --- a/ci/docs/build.sh +++ b/ci/docs/build.sh @@ -14,8 +14,8 @@ function logger() { export PATH=/conda/bin:/usr/local/cuda/bin:$PATH export PARALLEL_LEVEL=4 export CUDA_REL=${CUDA_VERSION%.*} -export CUDF_VERSION=0.7.* -export RMM_VERSION=0.7.* +export CUDF_VERSION=0.8.* +export RMM_VERSION=0.8.* # Set home to the job's workspace export HOME=$WORKSPACE diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 97592b1836..a5d6f9b9ff 100644 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -16,9 +16,9 @@ export PARALLEL_LEVEL=4 export CUDA_REL=${CUDA_VERSION%.*} # Set versions of packages needed to be grabbed -export CUDF_VERSION=0.7.* -export NVSTRINGS_VERSION=0.7.* -export RMM_VERSION=0.7.* +export CUDF_VERSION=0.8.* +export NVSTRINGS_VERSION=0.8.* +export RMM_VERSION=0.8.* # Set home to the job's workspace export HOME=$WORKSPACE diff --git a/conda/recipes/cuml-cuda10/meta.yaml b/conda/recipes/cuml-cuda10/meta.yaml index 3c38755f30..bc9d0666c9 100644 --- a/conda/recipes/cuml-cuda10/meta.yaml +++ 
b/conda/recipes/cuml-cuda10/meta.yaml @@ -28,14 +28,14 @@ requirements: - setuptools - cython>=0.29,<0.30 - cmake>=3.12.4 - - cudf 0.7* + - cudf 0.8* - libcuml={{ version }} - libcumlMG - cudatoolkit {{ cuda_version }}.* run: - python x.x - setuptools - - cudf 0.7* + - cudf 0.8* - libcuml={{ version }} - libcumlMG - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} diff --git a/conda/recipes/cuml/meta.yaml b/conda/recipes/cuml/meta.yaml index 0ed6003c4f..6b83767972 100644 --- a/conda/recipes/cuml/meta.yaml +++ b/conda/recipes/cuml/meta.yaml @@ -28,13 +28,13 @@ requirements: - setuptools - cython>=0.29,<0.30 - cmake>=3.12.4 - - cudf 0.7* + - cudf 0.8* - libcuml={{ version }} - cudatoolkit {{ cuda_version }}.* run: - python x.x - setuptools - - cudf 0.7* + - cudf 0.8* - libcuml={{ version }} - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} diff --git a/conda/recipes/libcuml/meta.yaml b/conda/recipes/libcuml/meta.yaml index 77eb50b5fc..8d53e784a0 100644 --- a/conda/recipes/libcuml/meta.yaml +++ b/conda/recipes/libcuml/meta.yaml @@ -27,11 +27,11 @@ build: requirements: build: - cmake>=3.12.4 - - cudf 0.7* + - cudf 0.8* - cudatoolkit {{ cuda_version }}.* - lapack run: - - cudf 0.7* + - cudf 0.8* - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} about: From 493c5c2984212329e5f8b8f2d37e914089b18ddc Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Tue, 21 May 2019 13:13:14 -0700 Subject: [PATCH 156/156] Updated CHANGELOG.md to include PR #622 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ee2c60ec9..7785c18b61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ - PR #596: Introduce cumlHandle for ols and ridge - PR #579: Introduce cumlHandle for cd and sgd, and propagate C++ errors in cython level for cd and sgd - PR #604: Adding cumlHandle to kNN, spectral methods, and UMAP +- PR #622: Updated to use 0.8 dependencies ## Bug Fixes - PR #584: Added missing virtual destructor to deviceAllocator and 
hostAllocator