Skip to content

Commit

Permalink
Fixed imputer for sparse matrices
Browse files Browse the repository at this point in the history
  • Loading branch information
eddiebergman committed Nov 4, 2021
1 parent be80a2b commit 05675ad
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -29,23 +29,25 @@ def fit(self, X: PIPELINE_DATA_DTYPE,
y: Optional[PIPELINE_DATA_DTYPE] = None) -> 'CategoricalImputation':
import sklearn.impute

fill_value = None
if hasattr(X, 'columns'):
kind = X[X.columns[-1]].dtype.kind
else:
# Series, sparse and numpy have dtype
# Only DataFrame does not
kind = X.dtype.kind

if kind in ("i", "u", "f"):
number_kinds = ("i", "u", "f")
if kind in number_kinds:
# We do not want to impute a category with the default
# value (0 is the default) in case such default is in the
# train data already!
if issparse(X):
# X.data doesn't return 0's
fill_value = min([*X.data, 0]) - 1
else:
fill_value = min(np.unique(X)) - 1
# value (0 is the default).
# Hence we take one less than the min
unique = np.unique([*X.data, 0]) if issparse(X) else np.unique(X)
print(unique)
fill_value = min(unique) - 1
else:
fill_value = None

print(fill_value)

self.preprocessor = sklearn.impute.SimpleImputer(
strategy='constant', copy=False, fill_value=fill_value)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,16 @@ def test_default_imputation(input_data_imputation, categorical):
X = X.astype('str').astype('object')
X[mask] = np.nan
else:
imputation_value = 0
imputation_value = min(np.unique(X)) - 1

Y = CategoricalImputation().fit_transform(X.copy())
assert ((np.argwhere(Y == imputation_value) == np.argwhere(mask)).all())
assert ((np.argwhere(Y != imputation_value) == np.argwhere(np.logical_not(mask))).all())

assert np.array_equal(Y == imputation_value, mask)
assert np.array_equal(Y != imputation_value, ~mask)


@pytest.mark.parametrize('format_type', ('numpy', 'pandas'))
def test_nonzero_numerical_imputation(format_type):

# First try with an array with 0 as only valid category. The imputation should
# happen with -1
X = np.full(fill_value=np.nan, shape=(10, 10))
Expand All @@ -69,8 +70,9 @@ def test_nonzero_numerical_imputation(format_type):
@pytest.mark.parametrize('input_data_imputation', ('numpy'), indirect=True)
def test_default_sparse(input_data_imputation):
X, mask = input_data_imputation
X = sparse.csc_matrix(X)
X = sparse.csr_matrix(X)
Y = CategoricalImputation().fit_transform(X)
Y = Y.todense()
assert (np.argwhere(Y == 0) == np.argwhere(mask)).all()
assert (np.argwhere(Y != 0) == np.argwhere(np.logical_not(mask))).all()

assert np.array_equal(Y == -1, mask)
assert np.array_equal(Y != -1, ~mask)

0 comments on commit 05675ad

Please sign in to comment.