[REVIEW] Fix experimental RF backend crashes and add tests (#3117)
* Patch and test for RF crash #3107

* Cleanups of RF regression fixes

* Add failing tests to RF regression

* Expand experimental backend testing and align pointers

* Expand Python RF regression test

* Updates based on review feedback

* Update changelog

* Add classification tests

* Review comments and style fixes for RF
JohnZed authored Nov 12, 2020
1 parent d9a73a7 · commit 2219a75
Showing 6 changed files with 75 additions and 30 deletions.
CHANGELOG.md (1 change: 1 addition & 0 deletions)
@@ -43,6 +43,7 @@
 - PR #3084: Fix artifacts in t-SNE results
 - PR #3086: Reverting FIL Notebook Testing
 - PR #3114: Fixed a typo in SVC's predict_proba AttributeError
+- PR #3117: Fix two crashes in experimental RF backend
 - PR #3119: Fix memset args for benchmark
 - PR #3130: Return Python string from `dump_as_json()` of RF

cpp/src/decisiontree/batched-levelalgo/builder_base.cuh (12 changes: 11 additions & 1 deletion)
@@ -394,6 +394,10 @@ struct ClsTraits {
     dim3 grid(b.n_blks_for_rows, colBlks, batchSize);
     size_t smemSize = sizeof(int) * binSize + sizeof(DataT) * nbins;
     smemSize += sizeof(int);
+
+    // Extra room for alignment (see alignPointer in computeSplitClassificationKernel)
+    smemSize += 2 * sizeof(DataT) + 1 * sizeof(int);
+
     CUDA_CHECK(cudaMemsetAsync(b.hist, 0, sizeof(int) * b.nHistBins, s));
     computeSplitClassificationKernel<DataT, LabelT, IdxT, TPB_DEFAULT>
       <<<grid, TPB_DEFAULT, smemSize, s>>>(
@@ -451,11 +455,17 @@ struct RegTraits {
   static void computeSplit(Builder<RegTraits<DataT, IdxT>>& b, IdxT col,
                            IdxT batchSize, CRITERION splitType,
                            cudaStream_t s) {
-    auto n_col_blks = b.n_blks_for_cols;
+    auto n_col_blks = std::min(b.n_blks_for_cols, b.input.nSampledCols - col);
+
     dim3 grid(b.n_blks_for_rows, n_col_blks, batchSize);
     auto nbins = b.params.n_bins;
     size_t smemSize = 7 * nbins * sizeof(DataT) + nbins * sizeof(int);
     smemSize += sizeof(int);
+
+    // Room for alignment in worst case (see alignPointer in
+    // computeSplitRegressionKernel)
+    smemSize += 5 * sizeof(DataT) + 2 * sizeof(int);
+
     CUDA_CHECK(
       cudaMemsetAsync(b.pred, 0, sizeof(DataT) * b.nPredCounts * 2, s));
     if (splitType == CRITERION::MAE) {
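Why the `std::min` clamp matters: `grid.y` is the number of column blocks launched, and each block reads `input.colids[colStart + blockIdx.y]`. When `col` is within `n_blks_for_cols` of the last sampled column, the old launch dimension let `blockIdx.y` index past the end of the sampled-column list. A minimal host-side sketch of the failure mode and the fix (the loop step and values are illustrative, not taken from the source):

#include <algorithm>
#include <cstdio>

int main() {
  const int nSampledCols = 10;    // columns sampled for this tree
  const int n_blks_for_cols = 4;  // column blocks per kernel launch
  for (int col = 0; col < nSampledCols; col += n_blks_for_cols) {
    // old: grid.y was always n_blks_for_cols, so the final launch
    // (col == 8) had blockIdx.y == 2, 3 reading past the last column
    const int grid_y = std::min(n_blks_for_cols, nSampledCols - col);
    std::printf("col=%2d -> grid.y=%d (was %d)\n", col, grid_y, n_blks_for_cols);
  }
  return 0;
}

The added `smemSize` slack is the launcher half of the alignment fix; the kernel half (`alignPointer`) lives in kernels.cuh below, with a sketch after that file.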
cpp/src/decisiontree/batched-levelalgo/kernels.cuh (32 changes: 22 additions & 10 deletions)
@@ -250,6 +250,13 @@ __global__ void nodeSplitKernel(IdxT max_depth, IdxT min_rows_per_node,
                     total_nodes, (char*)smem);
 }
+
+/* Returns 'input' rounded up to a correctly-aligned pointer of type OutT* */
+template <typename OutT, typename InT>
+__device__ OutT* alignPointer(InT input) {
+  return reinterpret_cast<OutT*>(
+    raft::alignTo(reinterpret_cast<size_t>(input), sizeof(OutT)));
+}
 
 template <typename DataT, typename LabelT, typename IdxT, int TPB>
 __global__ void computeSplitClassificationKernel(
   int* hist, IdxT nbins, IdxT max_depth, IdxT min_rows_per_node,
@@ -269,9 +276,9 @@ __global__ void computeSplitClassificationKernel(
   auto end = range_start + range_len;
   auto nclasses = input.nclasses;
   auto len = nbins * 2 * nclasses;
-  auto* shist = reinterpret_cast<int*>(smem);
-  auto* sbins = reinterpret_cast<DataT*>(shist + len);
-  auto* sDone = reinterpret_cast<int*>(sbins + nbins);
+  auto* shist = alignPointer<int>(smem);
+  auto* sbins = alignPointer<DataT>(shist + len);
+  auto* sDone = alignPointer<int>(sbins + nbins);
   IdxT stride = blockDim.x * gridDim.x;
   IdxT tid = threadIdx.x + blockIdx.x * blockDim.x;
   auto col = input.colids[colStart + blockIdx.y];
@@ -338,14 +345,15 @@ __global__ void computeSplitRegressionKernel(
   }
   auto end = range_start + range_len;
   auto len = nbins * 2;
-  auto* spred = reinterpret_cast<DataT*>(smem);
-  auto* scount = reinterpret_cast<int*>(spred + len);
-  auto* sbins = reinterpret_cast<DataT*>(scount + nbins);
+  auto* spred = alignPointer<DataT>(smem);
+  auto* scount = alignPointer<int>(spred + len);
+  auto* sbins = alignPointer<DataT>(scount + nbins);
+
   // used only for MAE criterion
-  auto* spred2 = reinterpret_cast<DataT*>(sbins + nbins);
-  auto* spred2P = reinterpret_cast<DataT*>(spred2 + len);
-  auto* spredP = reinterpret_cast<DataT*>(spred2P + nbins);
-  auto* sDone = reinterpret_cast<int*>(spredP + nbins);
+  auto* spred2 = alignPointer<DataT>(sbins + nbins);
+  auto* spred2P = alignPointer<DataT>(spred2 + len);
+  auto* spredP = alignPointer<DataT>(spred2P + nbins);
+  auto* sDone = alignPointer<int>(spredP + nbins);
   IdxT stride = blockDim.x * gridDim.x;
   IdxT tid = threadIdx.x + blockIdx.x * blockDim.x;
   auto col = input.colids[colStart + blockIdx.y];
@@ -354,10 +362,13 @@
   }
   for (IdxT i = threadIdx.x; i < nbins; i += blockDim.x) {
     scount[i] = 0;
+    // printf("indexing from sbins: %p to %p, sizeof: %d (spred: %p)\n", sbins,
+    //        &sbins[i], (int)sizeof(DataT*), spred);
     sbins[i] = input.quantiles[col * nbins + i];
   }
   __syncthreads();
   auto coloffset = col * input.M;
 
+  // compute prediction averages for all bins in shared mem
   for (auto i = range_start + tid; i < end; i += stride) {
     auto row = input.rowids[i];
@@ -440,6 +451,7 @@ __global__ void computeSplitRegressionKernel(
     last = MLCommon::signalDone(done_count + nid * gridDim.y + blockIdx.y,
                                 gridDim.x, blockIdx.x == 0, sDone);
   }
+
   if (!last) return;
   // last block computes the final gain
   Split<DataT, IdxT> sp;
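The crash fixed here comes from carving typed arrays out of one dynamic shared-memory buffer with plain `reinterpret_cast`: once, say, an odd number of `int` bins precedes a `DataT` array with `DataT = double`, that array starts on a 4-byte boundary and the misaligned access faults on the GPU — which is plausibly what the odd `n_bins` values in the new tests are probing. `alignPointer` instead rounds each carved pointer up to its element size. A standalone host-side sketch of the mechanism, assuming `raft::alignTo(addr, a)` rounds `addr` up to the next multiple of `a` (a power of two for the types used here):

#include <cassert>
#include <cstddef>
#include <cstdint>

// Assumed semantics of raft::alignTo: round addr up to a multiple of align.
inline std::size_t alignTo(std::size_t addr, std::size_t align) {
  return (addr + align - 1) & ~(align - 1);  // align must be a power of two
}

template <typename OutT, typename InT>
OutT* alignPointer(InT* input) {
  return reinterpret_cast<OutT*>(
    alignTo(reinterpret_cast<std::size_t>(input), sizeof(OutT)));
}

int main() {
  alignas(8) char smem[256];             // stand-in for dynamic shared memory
  const int nbins = 3, len = nbins * 2;  // odd nbins is the troublesome case
  using DataT = double;

  // same carving order as the start of computeSplitRegressionKernel:
  auto* spred  = alignPointer<DataT>(smem);        // offset 0
  auto* scount = alignPointer<int>(spred + len);   // offset 48, already fine
  auto* sbins  = alignPointer<DataT>(scount + nbins);
  // scount + 3 ints ends at offset 60; the old reinterpret_cast made sbins a
  // misaligned double*, while alignTo bumps it to offset 64
  assert(reinterpret_cast<std::size_t>(sbins) % alignof(DataT) == 0);
  return 0;
}

Each `alignPointer<T>` call can skip at most `sizeof(T) - 1` bytes, which is what the launcher-side padding in builder_base.cuh budgets: one element per carved array for regression (five `DataT` arrays and two `int` arrays, hence `5 * sizeof(DataT) + 2 * sizeof(int)`) and a conservative `2 * sizeof(DataT) + 1 * sizeof(int)` for classification's three arrays.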
cpp/test/sg/rf_batched_classification_test.cu (21 changes: 11 additions & 10 deletions)
@@ -42,6 +42,7 @@ struct RfInputs {
   float min_impurity_decrease;
   int n_streams;
   CRITERION split_criterion;
+  float min_expected_acc;
 };
 
 template <typename T>
@@ -143,30 +144,30 @@
 
 //-------------------------------------------------------------------------------------------------------------------------------------
 const std::vector<RfInputs> inputsf2_clf = {
+  // Simple non-crash tests with small datasets
+  {100, 59, 1, 1.0f, 0.4f, 16, -1, true, false, 10, SPLIT_ALGO::GLOBAL_QUANTILE,
+   2, 0.0, 2, CRITERION::GINI, 0.0f},
+  {101, 59, 2, 1.0f, 0.4f, 10, -1, true, false, 13, SPLIT_ALGO::GLOBAL_QUANTILE,
+   2, 0.0, 2, CRITERION::GINI, 0.0f},
+  {100, 1, 2, 1.0f, 0.4f, 10, -1, true, false, 15, SPLIT_ALGO::GLOBAL_QUANTILE,
+   2, 0.0, 2, CRITERION::GINI, 0.0f},
+  // Simple accuracy tests
   {20000, 10, 25, 1.0f, 0.4f, 16, -1, true, false, 10,
    SPLIT_ALGO::GLOBAL_QUANTILE, 2, 0.0, 2, CRITERION::GINI},
   {20000, 10, 5, 1.0f, 0.4f, 14, -1, true, false, 10,
    SPLIT_ALGO::GLOBAL_QUANTILE, 2, 0.0, 2, CRITERION::ENTROPY}};
 
 typedef RFBatchedClsTest<float> RFBatchedClsTestF;
 TEST_P(RFBatchedClsTestF, Fit) {
-  if (!params.bootstrap && (params.max_features == 1.0f)) {
-    ASSERT_TRUE(accuracy == 1.0f);
-  } else {
-    ASSERT_TRUE(accuracy >= 0.75f);  // Empirically derived accuracy range
-  }
+  ASSERT_TRUE(accuracy >= params.min_expected_acc);
 }
 
 INSTANTIATE_TEST_CASE_P(RFBatchedClsTests, RFBatchedClsTestF,
                         ::testing::ValuesIn(inputsf2_clf));
 
 typedef RFBatchedClsTest<double> RFBatchedClsTestD;
 TEST_P(RFBatchedClsTestD, Fit) {
-  if (!params.bootstrap && (params.max_features == 1.0f)) {
-    ASSERT_TRUE(accuracy == 1.0f);
-  } else {
-    ASSERT_TRUE(accuracy >= 0.75f);  // Empirically derived accuracy range
-  }
+  ASSERT_TRUE(accuracy >= params.min_expected_acc);
 }
 
 INSTANTIATE_TEST_CASE_P(RFBatchedClsTests, RFBatchedClsTestD,
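The test refactor above replaces hard-coded accuracy branches with a per-case `min_expected_acc` that travels with the parameter tuple; the small crash-repro cases set it to `0.0f` (and the regression file below uses `-10.0` with `ASSERT_GT`), so for those inputs `Fit` only has to run to completion without faulting. A minimal, self-contained sketch of the pattern — the struct and values here are illustrative, not from the test files:

#include <gtest/gtest.h>

// Illustrative parameters: the expected score rides along with each case.
struct CaseParams {
  int n_rows;
  float min_expected_acc;  // a non-positive value makes this a pure no-crash test
};

class AccuracyTest : public ::testing::TestWithParam<CaseParams> {};

TEST_P(AccuracyTest, Fit) {
  // a real test would train a model and score it here
  const float accuracy = GetParam().n_rows >= 20000 ? 0.9f : 0.1f;
  ASSERT_TRUE(accuracy >= GetParam().min_expected_acc);
}

INSTANTIATE_TEST_CASE_P(
  Thresholds, AccuracyTest,
  ::testing::Values(CaseParams{100, 0.0f},       // corner-case repro
                    CaseParams{20000, 0.75f}));  // real accuracy bar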
cpp/test/sg/rf_batched_regression_test.cu (20 changes: 16 additions & 4 deletions)
@@ -44,6 +44,7 @@ struct RfInputs {
   float min_impurity_decrease;
   int n_streams;
   CRITERION split_criterion;
+  float min_expected_acc;
 };
 
 template <typename T>
@@ -120,19 +121,30 @@ class RFBatchedRegTest : public ::testing::TestWithParam<RfInputs> {
 
 //-------------------------------------------------------------------------------------------------------------------------------------
 const std::vector<RfInputs> inputs = {
+  // Small datasets to repro corner cases as in #3107 (test for crash)
+  {100, 29, 1, 1.0f, 1.0f, 2, -1, false, false, 16, SPLIT_ALGO::GLOBAL_QUANTILE,
+   2, 0.0, 2, CRITERION::MAE, -10.0},
+  {100, 57, 2, 1.0f, 1.0f, 2, -1, false, false, 16, SPLIT_ALGO::GLOBAL_QUANTILE,
+   2, 0.0, 2, CRITERION::MAE, -10.0},
+  {101, 57, 2, 1.0f, 1.0f, 2, -1, false, false, 13, SPLIT_ALGO::GLOBAL_QUANTILE,
+   2, 0.0, 2, CRITERION::MSE, -10.0},
+  {100, 1, 2, 1.0f, 1.0f, 2, -1, false, false, 13, SPLIT_ALGO::GLOBAL_QUANTILE,
+   2, 0.0, 2, CRITERION::MAE, -10.0},
+
+  // Larger datasets for accuracy
   {1000, 10, 10, 1.0f, 1.0f, 12, -1, true, false, 10,
-   SPLIT_ALGO::GLOBAL_QUANTILE, 2, 0.0, 2, CRITERION::MAE},
+   SPLIT_ALGO::GLOBAL_QUANTILE, 2, 0.0, 2, CRITERION::MAE, 0.7f},
   {2000, 20, 20, 1.0f, 0.6f, 13, -1, true, false, 10,
-   SPLIT_ALGO::GLOBAL_QUANTILE, 2, 0.0, 2, CRITERION::MSE}};
+   SPLIT_ALGO::GLOBAL_QUANTILE, 2, 0.0, 2, CRITERION::MSE, 0.7f}};
 
 typedef RFBatchedRegTest<float> RFBatchedRegTestF;
-TEST_P(RFBatchedRegTestF, Fit) { ASSERT_TRUE(accuracy >= 0.7f); }
+TEST_P(RFBatchedRegTestF, Fit) { ASSERT_GT(accuracy, params.min_expected_acc); }
 
 INSTANTIATE_TEST_CASE_P(RFBatchedRegTests, RFBatchedRegTestF,
                         ::testing::ValuesIn(inputs));
 
 typedef RFBatchedRegTest<double> RFBatchedRegTestD;
-TEST_P(RFBatchedRegTestD, Fit) { ASSERT_TRUE(accuracy >= 0.7f); }
+TEST_P(RFBatchedRegTestD, Fit) { ASSERT_GT(accuracy, params.min_expected_acc); }
 
 INSTANTIATE_TEST_CASE_P(RFBatchedRegTests, RFBatchedRegTestD,
                         ::testing::ValuesIn(inputs));
python/cuml/test/test_random_forest.py (19 changes: 14 additions & 5 deletions)
Original file line number Diff line number Diff line change
@@ -212,10 +212,18 @@ def test_rf_classification(small_clf, datatype, split_algo,
 @pytest.mark.parametrize('rows_sample', [unit_param(1.0), quality_param(0.90),
                                          stress_param(0.95)])
 @pytest.mark.parametrize('datatype', [np.float32])
-@pytest.mark.parametrize('split_algo', [0, 1])
-@pytest.mark.parametrize('max_features', [1.0, 'auto', 'log2', 'sqrt'])
+@pytest.mark.parametrize(
+    'split_algo,max_features,use_experimental_backend,n_bins',
+    [(0, 1.0, False, 16),
+     (1, 1.0, False, 11),
+     (0, 'auto', False, 128),
+     (1, 'log2', False, 100),
+     (1, 'sqrt', False, 100),
+     (1, 1.0, True, 17),
+     (1, 1.0, True, 32),
+     ])
 def test_rf_regression(special_reg, datatype, split_algo, max_features,
-                       rows_sample):
+                       rows_sample, use_experimental_backend, n_bins):
 
     use_handle = True
 
@@ -230,10 +238,11 @@ def test_rf_regression(special_reg, datatype, split_algo, max_features,
 
     # Initialize and fit using cuML's random forest regression model
     cuml_model = curfr(max_features=max_features, rows_sample=rows_sample,
-                       n_bins=16, split_algo=split_algo, split_criterion=2,
+                       n_bins=n_bins, split_algo=split_algo, split_criterion=2,
                        min_rows_per_node=2, random_state=123, n_streams=1,
                        n_estimators=50, handle=handle, max_leaves=-1,
-                       max_depth=16, accuracy_metric='mse')
+                       max_depth=16, accuracy_metric='mse',
+                       use_experimental_backend=use_experimental_backend)
     cuml_model.fit(X_train, y_train)
     # predict using FIL
     fil_preds = cuml_model.predict(X_test, predict_model="GPU")
