Skip to content

Commit

Permalink
adding sparse support to TreeSHAP in lightgbm (#3000)
Browse files Browse the repository at this point in the history
* adding sparse support to TreeSHAP in lightgbm

* updating based on comments

* updated based on comments, used fromiter instead of frombuffer

* updated based on comments

* fixed limits import order

* fix sparse feature contribs to work with more than int32 max rows

* really fixed int64 max error and build warnings

* added sparse test with >int32 max rows

* fixed python side reshape check on sparse data

* updated based on latest comments

* fixed comments

* added CSC INT32_MAX validation to test, fixed comments
  • Loading branch information
imatiach-msft authored Jun 28, 2020
1 parent d563aff commit 9f367d1
Show file tree
Hide file tree
Showing 12 changed files with 733 additions and 56 deletions.
7 changes: 4 additions & 3 deletions include/LightGBM/boosting.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,11 @@ class LIGHTGBM_EXPORT Boosting {
* \brief Feature contributions for the model's prediction of one record
* \param features Feature values of this record
* \param output Prediction result for this record
* \param early_stop Early stopping instance. If nullptr, no early stopping is applied and all models are evaluated.
*/
virtual void PredictContrib(const double* features, double* output,
const PredictionEarlyStopInstance* early_stop) const = 0;
virtual void PredictContrib(const double* features, double* output) const = 0;

/*!
* \brief Feature contributions for the model's prediction of one record, with map-based input and output
* \param features Feature values of this record, stored in a map (presumably keyed by feature index; confirm against implementations)
* \param output Receives the result as a vector of maps (NOTE(review): what each vector element represents is not visible here; confirm against implementations)
*/
virtual void PredictContribByMap(const std::unordered_map<int, double>& features,
std::vector<std::unordered_map<int, double>>* output) const = 0;

/*!
* \brief Dump model to json format string
Expand Down
59 changes: 59 additions & 0 deletions include/LightGBM/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ typedef void* BoosterHandle; /*!< \brief Handle of booster. */
#define C_API_PREDICT_LEAF_INDEX (2) /*!< \brief Predict leaf index. */
#define C_API_PREDICT_CONTRIB (3) /*!< \brief Predict feature contributions (SHAP values). */

#define C_API_MATRIX_TYPE_CSR (0) /*!< \brief CSR sparse matrix type. */
#define C_API_MATRIX_TYPE_CSC (1) /*!< \brief CSC sparse matrix type. */

/*!
* \brief Get string message of the last error.
* \return Error information
Expand Down Expand Up @@ -742,6 +745,62 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForCSR(BoosterHandle handle,
int64_t* out_len,
double* out_result);

/*!
* \brief Make sparse prediction for a new dataset in CSR or CSC format. Currently only used for feature contributions.
* \note
* The outputs are pre-allocated, as they can vary for each invocation, but the shape should be the same:
* - for feature contributions, the shape of the sparse matrix will be ``num_class * num_data * (num_feature + 1)``.
* The output ``indptr`` type for the sparse matrix will be the same as the given input ``indptr_type``.
* Call ``LGBM_BoosterFreePredictSparse`` to deallocate the resources returned through ``out_indptr``, ``out_indices`` and ``out_data``.
* \param handle Handle of booster
* \param indptr Pointer to row headers for CSR or column headers for CSC
* \param indptr_type Type of ``indptr``, can be ``C_API_DTYPE_INT32`` or ``C_API_DTYPE_INT64``
* \param indices Pointer to column indices for CSR or row indices for CSC
* \param data Pointer to the data space
* \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
* \param nindptr Number of entries in ``indptr``: number of rows + 1 for CSR, number of columns + 1 for CSC
* \param nelem Number of nonzero elements in the matrix
* \param num_col_or_row Number of columns for CSR or number of rows for CSC
* \param predict_type What should be predicted, only feature contributions are supported currently
* - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values)
* \param num_iteration Number of iterations for prediction, <= 0 means no limit
* \param parameter Other parameters for prediction, e.g. early stopping for prediction
* \param matrix_type Type of matrix input and output, can be ``C_API_MATRIX_TYPE_CSR`` or ``C_API_MATRIX_TYPE_CSC``
* \param[out] out_len Length of output indices and data
* \param[out] out_indptr Pointer to output row headers for CSR or column headers for CSC
* \param[out] out_indices Pointer to sparse column indices for CSR or row indices for CSC
* \param[out] out_data Pointer to sparse data space
* \return 0 on success, -1 on failure
*/
LIGHTGBM_C_EXPORT int LGBM_BoosterPredictSparseOutput(BoosterHandle handle,
const void* indptr,
int indptr_type,
const int32_t* indices,
const void* data,
int data_type,
int64_t nindptr,
int64_t nelem,
int64_t num_col_or_row,
int predict_type,
int num_iteration,
const char* parameter,
int matrix_type,
int64_t* out_len,
void** out_indptr,
int32_t** out_indices,
void** out_data);

/*!
* \brief Free the data allocated by the corresponding ``LGBM_BoosterPredictSparseOutput`` call.
* \param indptr Pointer to output row headers or column headers to be deallocated
* \param indices Pointer to sparse indices to be deallocated
* \param data Pointer to sparse data space to be deallocated
* \param indptr_type Type of ``indptr``, can be ``C_API_DTYPE_INT32`` or ``C_API_DTYPE_INT64``
* \param data_type Type of ``data`` pointer, can be ``C_API_DTYPE_FLOAT32`` or ``C_API_DTYPE_FLOAT64``
* \return 0 on success, -1 on failure
*/
LIGHTGBM_C_EXPORT int LGBM_BoosterFreePredictSparse(void* indptr, int32_t* indices, void* data, int indptr_type, int data_type);

/*!
* \brief Make prediction for a new dataset in CSR format. This method re-uses the internal predictor structure
* from previous calls and is optimized for single row invocation.
Expand Down
6 changes: 5 additions & 1 deletion include/LightGBM/meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
#ifndef LIGHTGBM_META_H_
#define LIGHTGBM_META_H_

#include <limits>
#include <cstdint>
#include <functional>
#include <limits>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>

Expand Down Expand Up @@ -58,6 +59,9 @@ typedef int32_t comm_size_t;
using PredictFunction =
std::function<void(const std::vector<std::pair<int, double>>&, double* output)>;

/*!
* \brief Prediction callback with sparse output: takes one record as a list of (int, double) pairs
*        and writes its result into a vector of maps.
*        NOTE(review): the pairs and map entries are presumably (feature index, value); confirm against callers.
*/
using PredictSparseFunction =
std::function<void(const std::vector<std::pair<int, double>>&, std::vector<std::unordered_map<int, double>>* output)>;

typedef void(*ReduceFunction)(const char* input, char* output, int type_size, comm_size_t array_size);


Expand Down
20 changes: 20 additions & 0 deletions include/LightGBM/tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ class Tree {
inline int PredictLeafIndexByMap(const std::unordered_map<int, double>& feature_values) const;

inline void PredictContrib(const double* feature_values, int num_features, double* output);
/*!
* \brief Map-based counterpart of PredictContrib: feature values are read from a map
*        and the contributions are accumulated into a map
* \param feature_values Feature values of the record, stored in a map
* \param num_features Key of the ``output`` entry to which the tree's ExpectedValue() is added
* \param output Map that the contributions are accumulated into
*/
inline void PredictContribByMap(const std::unordered_map<int, double>& feature_values,
int num_features, std::unordered_map<int, double>* output);

/*! \brief Get Number of leaves*/
inline int num_leaves() const { return num_leaves_; }
Expand Down Expand Up @@ -387,6 +389,12 @@ class Tree {
PathElement *parent_unique_path, double parent_zero_fraction,
double parent_one_fraction, int parent_feature_index) const;

/*!
* \brief Recursion used by PredictContribByMap; takes the feature values and the phi accumulator as maps
* \note PredictContribByMap starts it with node = 0, unique_depth = 0, both parent fractions = 1
*       and parent_feature_index = -1, passing preallocated storage as parent_unique_path
*/
void TreeSHAPByMap(const std::unordered_map<int, double>& feature_values,
std::unordered_map<int, double>* phi,
int node, int unique_depth,
PathElement *parent_unique_path, double parent_zero_fraction,
double parent_one_fraction, int parent_feature_index) const;

/*! \brief Extend our decision path with a fraction of one and zero extensions for TreeSHAP*/
static void ExtendPath(PathElement *unique_path, int unique_depth,
double zero_fraction, double one_fraction, int feature_index);
Expand Down Expand Up @@ -539,6 +547,18 @@ inline void Tree::PredictContrib(const double* feature_values, int num_features,
}
}

inline void Tree::PredictContribByMap(const std::unordered_map<int, double>& feature_values,
int num_features, std::unordered_map<int, double>* output) {
(*output)[num_features] += ExpectedValue();
// Run the recursion with preallocated space for the unique path data
if (num_leaves_ > 1) {
CHECK_GE(max_depth_, 0);
const int max_path_len = max_depth_ + 1;
std::vector<PathElement> unique_path_data(max_path_len*(max_path_len + 1) / 2);
TreeSHAPByMap(feature_values, output, 0, 0, unique_path_data.data(), 1, 1, -1);
}
}

inline void Tree::RecomputeLeafDepths(int node, int depth) {
if (node == 0) leaf_depth_.resize(num_leaves());
if (node < 0) {
Expand Down
Loading

0 comments on commit 9f367d1

Please sign in to comment.