Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DRAFT] Implement histogram binning #24

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
301 changes: 301 additions & 0 deletions sklearn/ensemble/_binning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,301 @@
"""
This module contains the BinMapper class.

BinMapper is used for mapping a real-valued dataset into integer-valued bins.
Bin thresholds are computed with the quantiles so that each bin contains
approximately the same number of samples.
"""
# Authors: Nicolas Hug <[email protected]>
# Philip Teng <[email protected]>
# Haoyin Xu <[email protected]>
#
# License: BSD 3 clause

import numpy as np

from .utils import check_random_state, check_array
from .base import BaseEstimator, TransformerMixin
from .utils.validation import check_is_fitted
from ._binning import _map_to_bins
from .common import X_DTYPE, X_BINNED_DTYPE, ALMOST_INF, X_BITSET_INNER_DTYPE
from ._bitset import set_bitset_memoryview


def _find_binning_thresholds(col_data, max_bins):
    """Compute bin thresholds for a single continuous feature.

    Thresholds are quantile midpoints, so each bin receives approximately
    the same number of samples. Missing values (NaN) are dropped before
    the thresholds are computed.

    Parameters
    ----------
    col_data : array-like, shape (n_samples,)
        The continuous feature to bin.
    max_bins : int
        The maximum number of bins to use for non-missing values. If for a
        given feature the number of unique values is less than ``max_bins``,
        then those unique values will be used to compute the bin thresholds,
        instead of the quantiles.

    Returns
    -------
    binning_thresholds : ndarray of shape(min(max_bins, n_unique_values) - 1,)
        The increasing numeric values that can be used to separate the bins.
        A given value x will be mapped into bin value i iff
        binning_thresholds[i - 1] < x <= binning_thresholds[i]
    """
    # Drop NaNs so that missing values never influence the thresholds.
    nan_mask = np.isnan(col_data)
    if nan_mask.any():
        col_data = col_data[~nan_mask]
    col_data = np.ascontiguousarray(col_data, dtype=X_DTYPE)

    unique_values = np.unique(col_data)
    if len(unique_values) <= max_bins:
        # Few distinct values: use the midpoints between consecutive unique
        # values as thresholds, one bin per distinct value.
        thresholds = (unique_values[:-1] + unique_values[1:]) * .5
    else:
        # More distinct values than bins: fall back to quantiles. This sorts
        # the data a second time; an approximate midpoint-percentile based on
        # np.unique(col_data, return_counts) would avoid that, but the gain
        # is limited because we operate on a fixed-size subsample anyway.
        inner_percentiles = np.linspace(0, 100, num=max_bins + 1)[1:-1]
        thresholds = np.percentile(col_data, inner_percentiles,
                                   interpolation='midpoint').astype(X_DTYPE)
        assert thresholds.shape[0] == max_bins - 1

    # Cap thresholds at ALMOST_INF: +inf thresholds are only allowed in a
    # "split on nan" situation.
    np.clip(thresholds, a_min=None, a_max=ALMOST_INF, out=thresholds)
    return thresholds


class _BinMapper(TransformerMixin, BaseEstimator):
    """Transformer that maps a dataset into integer-valued bins.

    For continuous features, the bins are created in a feature-wise fashion,
    using quantiles so that each bin contains approximately the same number
    of samples. For large datasets, quantiles are computed on a subset of the
    data to speed-up the binning, but the quantiles should remain stable.

    For categorical features, the raw categorical values are expected to be
    in [0, 254] (this is not validated here though) and each category
    corresponds to a bin. All categorical values must be known at
    initialization: transform() doesn't know how to bin unknown categorical
    values. Note that transform() is only used on non-training data in the
    case of early stopping.

    Features with a small number of values may be binned into less than
    ``n_bins`` bins. The last bin (at index ``n_bins - 1``) is always reserved
    for missing values.

    Parameters
    ----------
    n_bins : int, default=256
        The maximum number of bins to use (including the bin for missing
        values). Should be in [3, 256]. Non-missing values are binned on
        ``max_bins = n_bins - 1`` bins. The last bin is always reserved for
        missing values. If for a given feature the number of unique values is
        less than ``max_bins``, then those unique values will be used to
        compute the bin thresholds, instead of the quantiles. See the
        ``is_categorical`` docstring for how categorical features are
        handled.
    subsample : int or None, default=2e5
        If ``n_samples > subsample``, then ``subsample`` samples will be
        randomly chosen to compute the quantiles. If ``None``, the whole data
        is used.
    is_categorical : ndarray of bool of shape (n_features,), default=None
        Indicates categorical features. By default, all features are
        considered continuous.
    known_categories : list of {ndarray, None} of shape (n_features,), \
            default=None
        For each categorical feature, the array indicates the set of unique
        categorical values. These should be the possible values over all the
        data, not just the training data. For continuous features, the
        corresponding entry should be None.
    random_state : int, RandomState instance or None, default=None
        Pseudo-random number generator to control the random sub-sampling.
        Pass an int for reproducible output across multiple
        function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    bin_thresholds_ : list of ndarray
        For each feature, each array indicates how to map a feature into a
        binned feature. The semantic and size depends on the nature of the
        feature:
        - for real-valued features, the array corresponds to the real-valued
          bin thresholds (the upper bound of each bin). There are ``max_bins
          - 1`` thresholds, where ``max_bins = n_bins - 1`` is the number of
          bins used for non-missing values.
        - for categorical features, the array is a map from a binned category
          value to the raw category value. The size of the array is equal to
          ``min(max_bins, category_cardinality)`` where we ignore missing
          values in the cardinality.
    n_bins_non_missing_ : ndarray, dtype=np.uint32
        For each feature, gives the number of bins actually used for
        non-missing values. For features with a lot of unique values, this is
        equal to ``n_bins - 1``.
    is_categorical_ : ndarray of shape (n_features,), dtype=np.uint8
        Indicator for categorical features.
    missing_values_bin_idx_ : np.uint8
        The index of the bin where missing values are mapped. This is a
        constant across all features. This corresponds to the last bin, and
        it is always equal to ``n_bins - 1``. Note that if
        ``n_bins_non_missing_`` is less than ``n_bins - 1`` for a given
        feature, then there are empty (and unused) bins.
    """
    def __init__(self, n_bins=256, subsample=int(2e5), is_categorical=None,
                 known_categories=None, random_state=None):
        self.n_bins = n_bins
        self.subsample = subsample
        self.is_categorical = is_categorical
        self.known_categories = known_categories
        self.random_state = random_state

    def fit(self, X, y=None):
        """Fit data X by computing the binning thresholds.

        The last bin is reserved for missing values, whether missing values
        are present in the data or not.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to bin.
        y : None
            Ignored.

        Returns
        -------
        self : object
        """
        if not (3 <= self.n_bins <= 256):
            # min is 3: at least 2 distinct bins and a missing values bin
            raise ValueError('n_bins={} should be no smaller than 3 '
                             'and no larger than 256.'.format(self.n_bins))

        X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
        max_bins = self.n_bins - 1

        # Subsample the rows (without replacement) before computing the
        # quantiles, to bound the cost of binning on large datasets.
        rng = check_random_state(self.random_state)
        if self.subsample is not None and X.shape[0] > self.subsample:
            subset = rng.choice(X.shape[0], self.subsample, replace=False)
            X = X.take(subset, axis=0)

        if self.is_categorical is None:
            self.is_categorical_ = np.zeros(X.shape[1], dtype=np.uint8)
        else:
            self.is_categorical_ = np.asarray(self.is_categorical,
                                              dtype=np.uint8)

        n_features = X.shape[1]
        known_categories = self.known_categories
        if known_categories is None:
            known_categories = [None] * n_features

        # validate is_categorical and known_categories parameters:
        # the two must be consistent feature-by-feature.
        for f_idx in range(n_features):
            is_categorical = self.is_categorical_[f_idx]
            known_cats = known_categories[f_idx]
            if is_categorical and known_cats is None:
                raise ValueError(
                    f"Known categories for feature {f_idx} must be provided."
                )
            if not is_categorical and known_cats is not None:
                raise ValueError(
                    f"Feature {f_idx} isn't marked as a categorical feature, "
                    f"but categories were passed."
                )

        # The last bin is always reserved for missing values (see class
        # docstring).
        self.missing_values_bin_idx_ = self.n_bins - 1

        self.bin_thresholds_ = []
        n_bins_non_missing = []

        for f_idx in range(n_features):
            if not self.is_categorical_[f_idx]:
                # n thresholds separate n + 1 bins.
                thresholds = _find_binning_thresholds(X[:, f_idx], max_bins)
                n_bins_non_missing.append(thresholds.shape[0] + 1)
            else:
                # Since categories are assumed to be encoded in
                # [0, n_cats] and since n_cats <= max_bins,
                # the thresholds *are* the unique categorical values. This will
                # lead to the correct mapping in transform()
                thresholds = known_categories[f_idx]
                n_bins_non_missing.append(thresholds.shape[0])

            self.bin_thresholds_.append(thresholds)

        self.n_bins_non_missing_ = np.array(n_bins_non_missing,
                                            dtype=np.uint32)
        return self

    def transform(self, X):
        """Bin data X.

        Missing values will be mapped to the last bin.

        For categorical features, the mapping will be incorrect for unknown
        categories. Since the BinMapper is given known_categories of the
        entire training data (i.e. before the call to train_test_split() in
        case of early-stopping), this never happens.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to bin.

        Returns
        -------
        X_binned : array-like of shape (n_samples, n_features)
            The binned data (fortran-aligned).
        """
        X = check_array(X, dtype=[X_DTYPE], force_all_finite=False)
        check_is_fitted(self)
        if X.shape[1] != self.n_bins_non_missing_.shape[0]:
            raise ValueError(
                'This estimator was fitted with {} features but {} got passed '
                'to transform()'.format(self.n_bins_non_missing_.shape[0],
                                        X.shape[1])
            )
        # Fortran order: the Cython binning/splitting code iterates
        # column-wise over the binned data.
        binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order='F')
        _map_to_bins(X, self.bin_thresholds_, self.missing_values_bin_idx_,
                     binned)
        return binned

    def make_known_categories_bitsets(self):
        """Create bitsets of known categories.

        Returns
        -------
        - known_cat_bitsets : ndarray of shape (n_categorical_features, 8)
            Array of bitsets of known categories, for each categorical feature.
        - f_idx_map : ndarray of shape (n_features,)
            Map from original feature index to the corresponding index in the
            known_cat_bitsets array.
        """

        categorical_features_indices = np.flatnonzero(self.is_categorical_)

        n_features = self.is_categorical_.size
        n_categorical_features = categorical_features_indices.size

        # f_idx_map maps an original feature index to its row in the
        # (categorical-only) bitset array; continuous features keep the
        # default 0 entry, which is never read for them.
        f_idx_map = np.zeros(n_features, dtype=np.uint32)
        f_idx_map[categorical_features_indices] = np.arange(
            n_categorical_features, dtype=np.uint32)

        # For categorical features, bin_thresholds_ holds the raw category
        # values (see fit()).
        known_categories = self.bin_thresholds_

        known_cat_bitsets = np.zeros((n_categorical_features, 8),
                                     dtype=X_BITSET_INNER_DTYPE)

        # TODO: complexity is O(n_categorical_features * 255). Maybe this is
        # worth cythonizing
        for mapped_f_idx, f_idx in enumerate(categorical_features_indices):
            for raw_cat_val in known_categories[f_idx]:
                set_bitset_memoryview(known_cat_bitsets[mapped_f_idx],
                                      raw_cat_val)

        return known_cat_bitsets, f_idx_map
26 changes: 26 additions & 0 deletions sklearn/ensemble/_bitset.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# cython: language_level=3
# Authors: Philip Teng <[email protected]>
#          Haoyin Xu <[email protected]>
#
# License: BSD 3 clause

# Declarations for the bitset helpers. See _bitset.pyx for details.

from .common cimport X_BINNED_DTYPE_C
from .common cimport BITSET_DTYPE_C
from .common cimport BITSET_INNER_DTYPE_C
from .common cimport X_DTYPE_C

# Reset all bits of the bitset to 0.
cdef void init_bitset(BITSET_DTYPE_C bitset) nogil

# Set the bit corresponding to the binned value ``val``.
cdef void set_bitset(BITSET_DTYPE_C bitset, X_BINNED_DTYPE_C val) nogil

# Return non-zero iff the bit for ``val`` is set.
cdef unsigned char in_bitset(BITSET_DTYPE_C bitset, X_BINNED_DTYPE_C val) nogil

# Memoryview variant of in_bitset; cpdef so it is also callable from Python.
cpdef unsigned char in_bitset_memoryview(const BITSET_INNER_DTYPE_C[:] bitset,
                                         X_BINNED_DTYPE_C val) nogil

# Test ``val`` against row ``row`` of a 2d array of bitsets.
cdef unsigned char in_bitset_2d_memoryview(
    const BITSET_INNER_DTYPE_C [:, :] bitset,
    X_BINNED_DTYPE_C val,
    unsigned int row) nogil
Loading