From 12aa6f9a87e6466dedcb5cce1b59bbcf30799cbe Mon Sep 17 00:00:00 2001
From: Oleg Smirnov
Date: Fri, 29 May 2020 19:43:53 +0200
Subject: [PATCH 1/5] implements category_modulo and category_binning for by(), as suggested in #907

---
 datashader/reductions.py | 186 ++++++++++++++++++++++++++++++++-------
 1 file changed, 153 insertions(+), 33 deletions(-)

diff --git a/datashader/reductions.py b/datashader/reductions.py
index 3398a0275..f43d3d522 100644
--- a/datashader/reductions.py
+++ b/datashader/reductions.py
@@ -43,39 +43,155 @@ def apply(self, df):
         else:
             return df[self.column].values
 
+class CategoryPreprocess(Preprocess):
+    """Base class for categorizing preprocessors."""
+    @property
+    def cat_column(self):
+        """Returns name of categorized column"""
+        return self.column
+
+    def categories(self, input_dshape):
+        """Returns list of categories corresponding to input shape"""
+        raise NotImplementedError("categories not implemented")
+
+    def validate(self, in_dshape):
+        """Validates input shape"""
+        raise NotImplementedError("validate not implemented")
+
+    def apply(self, df):
+        """Applies preprocessor to DataFrame and returns array"""
+        raise NotImplementedError("apply not implemented")
 
-class category_codes(Preprocess):
+class category_codes(CategoryPreprocess):
     """Extract just the category codes from a categorical column."""
+    def categories(self, input_dshape):
+        return input_dshape.measure[self.column].categories
+
+    def validate(self, in_dshape):
+        if not self.column in in_dshape.dict:
+            raise ValueError("specified column not found")
+        if not isinstance(in_dshape.measure[self.column], ct.Categorical):
+            raise ValueError("input must be categorical")
+
     def apply(self, df):
         if cudf and isinstance(df, cudf.DataFrame):
             return df[self.column].cat.codes.to_gpu_array()
         else:
             return df[self.column].cat.codes.values
 
-class category_values(Preprocess):
-    """Extract multiple columns from a dataframe as a numpy array of values."""
-    def __init__(self, columns):
-        self.columns = list(columns)
+class category_modulo(category_codes):
+    """
+    A variation on category_codes that assigns categories using an integer column, modulo a base.
+    Category is computed as (column_value - offset)%modulo.
+    """
+
+    # couldn't find anything in the datashape docs about how to check if a CType is an integer, so just define a big set
+    IntegerTypes = {ct.bool_, ct.uint8, ct.uint16, ct.uint32, ct.uint64, ct.int8, ct.int16, ct.int32, ct.int64}
+
+    def __init__(self, column, modulo, offset=0):
+        super().__init__(column)
+        self.bin0 = offset
+        self.modulo = modulo
+
+    def categories(self, in_dshape):
+        return list(range(self.modulo))
+
+    def validate(self, in_dshape):
+        if not self.column in in_dshape.dict:
+            raise ValueError("specified column not found")
+        if in_dshape.measure[self.column] not in self.IntegerTypes:
+            raise ValueError("input must be an integer column")
+
+    def apply(self, df):
+        if cudf and isinstance(df, cudf.DataFrame):
+            ## dunno how to do this in CUDA, is it as simple as this?
+            # return ((df[column] - offset) % modulo).to_gpu_array()
+            raise NotImplementedError("this feature is not implemented in cudf")
+        else:
+            return (df[self.column].values - self.bin0) % self.modulo
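To see what the modulo mapping above computes, here is a minimal standalone sketch of the same arithmetic outside datashader (the `ids` array is invented for illustration; the real class reads the column from the DataFrame):

```python
import numpy as np

# category = (column_value - offset) % modulo, as in category_modulo.apply()
ids = np.array([10, 11, 12, 13, 14, 25])
offset, modulo = 10, 4

codes = (ids - offset) % modulo
print(codes)  # [0 1 2 3 0 3] -> ids map cyclically onto 4 categories
```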
+class category_binning(category_modulo):
+    """
+    A variation on category_codes that assigns categories by binning a continuously-valued column.
+    The number of categories returned is always nbins+1.
+    The last category (nbins) is for NaNs in the data column, as well as for values under/over the binned
+    interval (when include_under or include_over is False).
+
+    Parameters
+    ----------
+    column: column to use
+    bin0: lower bound of first bin
+    binsize: bin size
+    nbins: number of bins
+    include_under: if True, values below bin 0 are assigned to category 0
+    include_over: if True, values above the last bin (nbins-1) are assigned to category nbins-1
+    """
+
+    def __init__(self, column, bin0, binsize, nbins, include_under=True, include_over=True):
+        super().__init__(column, nbins + 1)  # +1 category for NaNs and clipped values
+        self.bin0 = bin0
+        self.binsize = binsize
+        self.nbins = nbins
+        self.bin_under = 0 if include_under else nbins
+        self.bin_over = nbins-1 if include_over else nbins
+
+    def validate(self, in_dshape):
+        if not self.column in in_dshape.dict:
+            raise ValueError("specified column not found")
+
+    def apply(self, df):
+        """
+        Helper function. Takes a DataFrame column, modulo integer value
+        """
+        if cudf and isinstance(df, cudf.DataFrame):
+            ## dunno how to do this in CUDA
+            raise NotImplementedError("this feature is not implemented in cudf")
+        else:
+            value = df[self.column].values
+            index = ((value - self.bin0) / self.binsize).astype(np.uint32)
+            index[index < 0] = self.bin_under
+            index[index >= self.nbins] = self.bin_over
+            index[np.isnan(value)] = self.nbins
+            return index
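A standalone sketch of the binning rule above, with invented values. It uses a signed integer cast so the under-range branch can actually fire (the `np.uint32` cast in the hunk above turns out to be a bug, fixed in patch 2 of this series), and it fills NaNs before casting to avoid an invalid-cast warning:

```python
import numpy as np

value = np.array([-7.0, 2.0, 7.0, 19.0, 25.0, np.nan])
bin0, binsize, nbins = 0.0, 5.0, 4   # bins [0,5), [5,10), [10,15), [15,20)

nan_mask = np.isnan(value)
index = ((np.nan_to_num(value) - bin0) / binsize).astype(int)
index[index < 0] = 0                 # include_under=True: clamp to first bin
index[index >= nbins] = nbins - 1    # include_over=True: clamp to last bin
index[nan_mask] = nbins              # NaNs land in the extra category
print(index)  # [0 0 1 3 3 4]
```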
""" - def __init__(self, cat_column, reduction): - self.columns = (cat_column, getattr(reduction, 'column', None)) + def __init__(self, cats, reduction): + # set basic categorizer + if isinstance(cats, CategoryPreprocess): + self.categorizer = cats + elif isinstance(cats, str): + self.categorizer = category_codes(cats) + else: + raise TypeError("first argument must be a column name or a CategoryPreprocess instance") + self.column = self.categorizer.column # for backwards compatibility with count_cat + self.columns = (self.categorizer.column, getattr(reduction, 'column', None)) self.reduction = reduction - self.column = cat_column # for backwards compatibility with count_cat - + # if a value column is supplied, set category_values preprocessor + if self.val_column is not None: + self.preprocess = category_values(self.categorizer, self.val_column) + else: + self.preprocess = self.categorizer + def __hash__(self): return hash((type(self), self._hashable_inputs(), self.reduction)) def _build_temps(self, cuda=False): - return tuple(by(self.cat_column, tmp) for tmp in self.reduction._build_temps(cuda)) + return tuple(by(self.categorizer, tmp) for tmp in self.reduction._build_temps(cuda)) @property def cat_column(self): @@ -169,24 +296,17 @@ def val_column(self): return self.columns[1] def validate(self, in_dshape): - if not self.cat_column in in_dshape.dict: - raise ValueError("specified column not found") - if not isinstance(in_dshape.measure[self.cat_column], ct.Categorical): - raise ValueError("input must be categorical") - + self.preprocess.validate(in_dshape) self.reduction.validate(in_dshape) def out_dshape(self, input_dshape): - cats = input_dshape.measure[self.cat_column].categories + cats = self.categorizer.categories(input_dshape) red_shape = self.reduction.out_dshape(input_dshape) return dshape(Record([(c, red_shape) for c in cats])) @property def inputs(self): - if self.val_column is not None: - return (category_values(self.columns),) - else: - return (category_codes(self.columns[0]),) + return (self.preprocess, ) def _build_create(self, out_dshape): n_cats = len(out_dshape.measure.fields) @@ -197,7 +317,7 @@ def _build_bases(self, cuda=False): bases = self.reduction._build_bases(cuda) if len(bases) == 1 and bases[0] is self: return bases - return tuple(by(self.cat_column, base) for base in bases) + return tuple(by(self.categorizer, base) for base in bases) def _build_append(self, dshape, schema, cuda=False): return self.reduction._build_append(dshape, schema, cuda) @@ -206,7 +326,7 @@ def _build_combine(self, dshape): return self.reduction._combine def _build_finalize(self, dshape): - cats = list(dshape[self.cat_column].categories) + cats = self.categorizer.categories(dshape) def finalize(bases, cuda=False, **kwargs): kwargs['dims'] += [self.cat_column] From 457d6093341fcaf4643fd074338485437ba363a7 Mon Sep 17 00:00:00 2001 From: Oleg Smirnov Date: Sat, 30 May 2020 17:08:08 +0200 Subject: [PATCH 2/5] Fix broken category_codes. Replace uint with int for bin index. 
From 457d6093341fcaf4643fd074338485437ba363a7 Mon Sep 17 00:00:00 2001
From: Oleg Smirnov
Date: Sat, 30 May 2020 17:08:08 +0200
Subject: [PATCH 2/5] Fix broken category_codes. Replace uint with int for bin index.

---
 datashader/reductions.py | 16 ++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/datashader/reductions.py b/datashader/reductions.py
index f43d3d522..c08df258d 100644
--- a/datashader/reductions.py
+++ b/datashader/reductions.py
@@ -103,12 +103,11 @@ def validate(self, in_dshape):
             raise ValueError("input must be an integer column")
 
     def apply(self, df):
+        result = (df[self.column] - self.bin0) % self.modulo
         if cudf and isinstance(df, cudf.DataFrame):
-            ## dunno how to do this in CUDA, is it as simple as this?
-            # return ((df[column] - offset) % modulo).to_gpu_array()
-            raise NotImplementedError("this feature is not implemented in cudf")
+            return result.to_gpu_array()
         else:
-            return (df[self.column].values - self.bin0) % self.modulo
+            return result.values
 
 class category_binning(category_modulo):
     """
@@ -140,15 +139,12 @@ def validate(self, in_dshape):
             raise ValueError("specified column not found")
 
     def apply(self, df):
-        """
-        Helper function. Takes a DataFrame column, modulo integer value
-        """
         if cudf and isinstance(df, cudf.DataFrame):
             ## dunno how to do this in CUDA
-            raise NotImplementedError("this feature is not implemented in cudf")
+            raise NotImplementedError("this feature is not implemented in cuda")
         else:
             value = df[self.column].values
-            index = ((value - self.bin0) / self.binsize).astype(np.uint32)
+            index = ((value - self.bin0) / self.binsize).astype(int)
             index[index < 0] = self.bin_under
             index[index >= self.nbins] = self.bin_over
             index[np.isnan(value)] = self.nbins
             return index
@@ -326,7 +322,7 @@ def _build_combine(self, dshape):
         return self.reduction._combine
 
     def _build_finalize(self, dshape):
-        cats = self.categorizer.categories(dshape)
+        cats = list(self.categorizer.categories(dshape))
 
         def finalize(bases, cuda=False, **kwargs):
             kwargs['dims'] += [self.cat_column]
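The uint part of this fix matters because casting a negative float to `np.uint32` wraps it around to a large positive number (the exact wrapped value is platform-dependent), so the `index < 0` under-range check could never fire. A quick sketch of the failure mode:

```python
import numpy as np

raw = (np.array([-7.0]) - 0.0) / 5.0   # -1.4: below the first bin

as_uint = raw.astype(np.uint32)   # wraps to a large positive value
as_int = raw.astype(int)          # stays negative: [-1]

print(as_uint < 0)  # [False] -> an unsigned index can never be negative
print(as_int < 0)   # [ True] -> with int, the under-range clamp works
```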
From c3758ae19f8b958f79d61e271fde7293d7f4fe9c Mon Sep 17 00:00:00 2001
From: Oleg Smirnov
Date: Tue, 16 Jun 2020 19:11:03 +0200
Subject: [PATCH 3/5] added categorizer state to its hashable inputs

---
 datashader/reductions.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/datashader/reductions.py b/datashader/reductions.py
index c08df258d..a52f1527d 100644
--- a/datashader/reductions.py
+++ b/datashader/reductions.py
@@ -90,9 +90,12 @@ class category_modulo(category_codes):
 
     def __init__(self, column, modulo, offset=0):
         super().__init__(column)
-        self.bin0 = offset
+        self.offset = offset
         self.modulo = modulo
 
+    def _hashable_inputs(self):
+        return super()._hashable_inputs() + (self.offset, self.modulo)
+
     def categories(self, in_dshape):
         return list(range(self.modulo))
 
@@ -103,7 +106,7 @@ def validate(self, in_dshape):
             raise ValueError("input must be an integer column")
 
     def apply(self, df):
-        result = (df[self.column] - self.bin0) % self.modulo
+        result = (df[self.column] - self.offset) % self.modulo
         if cudf and isinstance(df, cudf.DataFrame):
             return result.to_gpu_array()
         else:
             return result.values
@@ -134,6 +137,9 @@ def __init__(self, column, bin0, binsize, nbins, include_under=True, include_ove
         self.bin_under = 0 if include_under else nbins
         self.bin_over = nbins-1 if include_over else nbins
 
+    def _hashable_inputs(self):
+        return super()._hashable_inputs() + (self.bin0, self.binsize, self.bin_under, self.bin_over)
+
     def validate(self, in_dshape):
         if not self.column in in_dshape.dict:
             raise ValueError("specified column not found")
@@ -278,7 +284,7 @@ def __init__(self, cats, reduction):
             self.preprocess = self.categorizer
 
     def __hash__(self):
-        return hash((type(self), self._hashable_inputs(), self.reduction))
+        return hash((type(self), self._hashable_inputs(), self.categorizer._hashable_inputs(), self.reduction))
 
     def _build_temps(self, cuda=False):
         return tuple(by(self.categorizer, tmp) for tmp in self.reduction._build_temps(cuda))
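The motivation: datashader reductions are hashed (e.g. to deduplicate work when an aggregation is compiled), so two categorizers that differ only in their parameters must not hash equal. A sketch of what this commit is meant to guarantee (the constructor arguments mirror the tests added later in this series; the `print` is an invented check, not part of the patch):

```python
import datashader as ds
from datashader.reductions import category_modulo

a = ds.by(category_modulo('cat_int', modulo=4, offset=10), ds.count())
b = ds.by(category_modulo('cat_int', modulo=4, offset=0), ds.count())

# offset now feeds _hashable_inputs(), so the two aggregations
# should no longer collide and be conflated by caching:
print(hash(a) != hash(b))  # expected: True
```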
""" - def __init__(self, cats, reduction): + def __init__(self, cat_column, reduction): # set basic categorizer - if isinstance(cats, CategoryPreprocess): - self.categorizer = cats - elif isinstance(cats, str): - self.categorizer = category_codes(cats) + if isinstance(cat_column, CategoryPreprocess): + self.categorizer = cat_column + elif isinstance(cat_column, str): + self.categorizer = category_codes(cat_column) else: raise TypeError("first argument must be a column name or a CategoryPreprocess instance") self.column = self.categorizer.column # for backwards compatibility with count_cat @@ -863,4 +863,5 @@ def inputs(self): __all__ = list(set([_k for _k,_v in locals().items() if isinstance(_v,type) and (issubclass(_v,Reduction) or _v is summary) and _v not in [Reduction, OptionalFieldReduction, - FloatingReduction, m2]])) + FloatingReduction, m2]])) + \ + ['category_modulo', 'category_binning'] diff --git a/datashader/tests/test_dask.py b/datashader/tests/test_dask.py index b8c17fb0e..439a0246f 100644 --- a/datashader/tests/test_dask.py +++ b/datashader/tests/test_dask.py @@ -41,7 +41,8 @@ 'f32': np.arange(20, dtype='f4'), 'f64': np.arange(20, dtype='f8'), 'empty_bin': np.array([0.] * 15 + [np.nan] * 5), - 'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5}) + 'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5, + 'cat_int': np.array([10]*5 + [11]*5 + [12]*5 + [13]*5)}) df_pd.cat = df_pd.cat.astype('category') df_pd.at[2,'f32'] = np.nan df_pd.at[2,'f64'] = np.nan @@ -236,6 +237,40 @@ def test_count_cat(ddf): agg = c.points(ddf, 'x', 'y', ds.count_cat('cat')) assert_eq_xr(agg, out) + # categorizing by (cat_int-10)%4 ought to give the same result + out = xr.DataArray( + sol, coords=(coords + [range(4)]), dims=(dims + ['cat_int']) + ) + agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.count())) + assert_eq_xr(agg, out) + + # easier to write these tests in here, since we expect the same result with only slight tweaks + + # add an extra category (this will count nans and out of bounds) + sol = np.append(sol, [[[0], [0]],[[0], [0]]], axis=2) + + # categorizing by binning the integer arange columns using [0,20] into 4 bins. Same result as for count_cat + for col in 'i32', 'i64': + out = xr.DataArray( + sol, coords=(coords + [range(5)]), dims=(dims + [col]) + ) + agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.count())) + assert_eq_xr(agg, out) + + # as above, but for the float arange columns. 
diff --git a/datashader/tests/test_dask.py b/datashader/tests/test_dask.py
index b8c17fb0e..439a0246f 100644
--- a/datashader/tests/test_dask.py
+++ b/datashader/tests/test_dask.py
@@ -41,7 +41,8 @@
         'f32': np.arange(20, dtype='f4'),
         'f64': np.arange(20, dtype='f8'),
         'empty_bin': np.array([0.] * 15 + [np.nan] * 5),
-        'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5})
+        'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5,
+        'cat_int': np.array([10]*5 + [11]*5 + [12]*5 + [13]*5)})
 df_pd.cat = df_pd.cat.astype('category')
 df_pd.at[2,'f32'] = np.nan
 df_pd.at[2,'f64'] = np.nan
@@ -236,6 +237,40 @@ def test_count_cat(ddf):
     agg = c.points(ddf, 'x', 'y', ds.count_cat('cat'))
     assert_eq_xr(agg, out)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=(coords + [range(4)]), dims=(dims + ['cat_int'])
+    )
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.count()))
+    assert_eq_xr(agg, out)
+
+    # easier to write these tests in here, since we expect the same result with only slight tweaks
+
+    # add an extra category (this will count nans and out of bounds)
+    sol = np.append(sol, [[[0], [0]],[[0], [0]]], axis=2)
+
+    # categorizing by binning the integer arange columns using [0,20] into 4 bins. Same result as for count_cat
+    for col in 'i32', 'i64':
+        out = xr.DataArray(
+            sol, coords=(coords + [range(5)]), dims=(dims + [col])
+        )
+        agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.count()))
+        assert_eq_xr(agg, out)
+
+    # as above, but for the float arange columns. Element 2 has a nan, so the first bin is one short, and the nan bin is +1
+    sol[0, 0, 0] = 4
+    sol[0, 0, 4] = 1
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=(coords + [range(5)]), dims=(dims + [col])
+        )
+        agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.count()))
+        assert_eq_xr(agg, out)
+
+
+
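For reference, the `np.append(..., axis=2)` lines in these tests simply tack one extra plane onto the expected solution for the NaN/out-of-range category; a compact equivalent of the literal nesting used above:

```python
import numpy as np

sol = np.ones((2, 2, 4))               # expected counts for 4 regular categories
extra = np.zeros((2, 2, 1))            # the extra NaN/out-of-range category
sol5 = np.append(sol, extra, axis=2)   # same as np.append(sol, [[[0], [0]], [[0], [0]]], axis=2)
print(sol5.shape)  # (2, 2, 5)
```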
 @pytest.mark.parametrize('ddf', ddfs)
 def test_categorical_sum(ddf):
     sol = np.array([[[ 10, nan, nan, nan],
@@ -251,6 +286,15 @@ def test_categorical_sum(ddf):
     agg = c.points(ddf, 'x', 'y', ds.by('cat', ds.sum('i64')))
     assert_eq_xr(agg, out)
 
+    out = xr.DataArray(
+        sol, coords=(coords + [range(4)]), dims=(dims + ['cat_int'])
+    )
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.sum('i32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.sum('i64')))
+    assert_eq_xr(agg, out)
+
     sol = np.array([[[8.0, nan, nan, nan],
                      [nan, nan, 60.0, nan]],
                     [[nan, 35.0, nan, nan],
@@ -264,6 +308,17 @@ def test_categorical_sum(ddf):
     agg = c.points(ddf, 'x', 'y', ds.by('cat', ds.sum('f64')))
     assert_eq_xr(agg, out)
 
+    # add an extra category (this will count nans and out of bounds)
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=(coords + [range(5)]), dims=(dims + [col])
+        )
+        agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.sum(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('ddf', ddfs)
 def test_categorical_mean(ddf):
     sol = np.array([[[ 2, nan, nan, nan],
@@ -281,6 +336,27 @@ def test_categorical_mean(ddf):
     agg = c.points(ddf, 'x', 'y', ds.by('cat', ds.mean('f64')))
     assert_eq_xr(agg, out)
 
+    out = xr.DataArray(
+        sol, coords=(coords + [range(4)]), dims=(dims + ['cat_int'])
+    )
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.mean('f32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.mean('f64')))
+    assert_eq_xr(agg, out)
+
+    # add an extra category (this will count nans and out of bounds)
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=(coords + [range(5)]), dims=(dims + [col])
+        )
+        agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.mean(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('ddf', ddfs)
 def test_categorical_var(ddf):
     if cudf and isinstance(ddf._meta, cudf.DataFrame):
@@ -303,6 +379,27 @@ def test_categorical_var(ddf):
     agg = c.points(ddf, 'x', 'y', ds.by('cat', ds.var('f64')))
     assert_eq_xr(agg, out, True)
 
+    out = xr.DataArray(
+        sol, coords=(coords + [range(4)]), dims=(dims + ['cat_int'])
+    )
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.var('f32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.var('f64')))
+    assert_eq_xr(agg, out)
+
+    # add an extra category (this will count nans and out of bounds)
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=(coords + [range(5)]), dims=(dims + [col])
+        )
+        agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.var(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('ddf', ddfs)
 def test_categorical_std(ddf):
     if cudf and isinstance(ddf._meta, cudf.DataFrame):
@@ -327,6 +424,26 @@ def test_categorical_std(ddf):
     agg = c.points(ddf, 'x', 'y', ds.by('cat', ds.std('f64')))
     assert_eq_xr(agg, out, True)
 
+    out = xr.DataArray(
+        sol, coords=(coords + [range(4)]), dims=(dims + ['cat_int'])
+    )
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.std('f32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.std('f64')))
+    assert_eq_xr(agg, out)
+
+    # add an extra category (this will count nans and out of bounds)
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=(coords + [range(5)]), dims=(dims + [col])
+        )
+        agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.std(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('ddf', ddfs)
 def test_multiple_aggregates(ddf):
     if dask_cudf and isinstance(ddf, dask_cudf.DataFrame):
diff --git a/datashader/tests/test_pandas.py b/datashader/tests/test_pandas.py
index 03d710f8e..5073021b5 100644
--- a/datashader/tests/test_pandas.py
+++ b/datashader/tests/test_pandas.py
@@ -22,7 +22,8 @@
         'f32': np.arange(20, dtype='f4'),
         'f64': np.arange(20, dtype='f8'),
         'empty_bin': np.array([0.] * 15 + [np.nan] * 5),
-        'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5})
+        'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5,
+        'cat_int': np.array([10]*5 + [11]*5 + [12]*5 + [13]*5)})
 df_pd.cat = df_pd.cat.astype('category')
 df_pd.at[2,'f32'] = nan
 df_pd.at[2,'f64'] = nan
@@ -250,6 +251,36 @@ def test_categorical_count(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.count('i32')))
     assert_eq_xr(agg, out)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=OrderedDict(coords, cat_int=range(4)), dims=(dims + ['cat_int'])
+    )
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.count()))
+    assert_eq_xr(agg, out)
+
+    # add an extra category (this will count nans and out of bounds)
+    sol = np.append(sol, [[[0], [0]],[[0], [0]]], axis=2)
+
+    # categorizing by binning the integer arange columns using [0,20] into 4 bins. Same result as for count_cat
+    for col in 'i32', 'i64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.count()))
+        assert_eq_xr(agg, out)
+
+    # as above, but for the float arange columns. Element 2 has a nan, so the first bin is one short, and the nan bin is +1
+    sol[0, 0, 0] = 4
+    sol[0, 0, 4] = 1
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.count()))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('df', dfs)
 def test_categorical_sum(df):
     sol = np.array([[[ 10, nan, nan, nan],
@@ -266,6 +297,17 @@ def test_categorical_sum(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.sum('i64')))
     assert_eq_xr(agg, out)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=OrderedDict(coords, cat_int=range(4)), dims=(dims + ['cat_int'])
+    )
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.sum('i32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.sum('i64')))
+    assert_eq_xr(agg, out)
+
     sol = np.array([[[8.0, nan, nan, nan],
                      [nan, nan, 60.0, nan]],
                     [[nan, 35.0, nan, nan],
@@ -280,6 +322,16 @@ def test_categorical_sum(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.sum('f64')))
     assert_eq_xr(agg, out)
 
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.sum(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('df', dfs)
 def test_categorical_max(df):
     sol = np.array([[[ 4, nan, nan, nan],
@@ -293,6 +345,27 @@ def test_categorical_max(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.max('i32')))
     assert_eq_xr(agg, out)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=OrderedDict(coords, cat_int=range(4)), dims=(dims + ['cat_int'])
+    )
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.max('i32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.max('i64')))
+    assert_eq_xr(agg, out)
+
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.max(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('df', dfs)
 def test_categorical_mean(df):
     sol = np.array([[[ 2, nan, nan, nan],
@@ -310,6 +383,27 @@ def test_categorical_mean(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.mean('f64')))
     assert_eq_xr(agg, out)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=OrderedDict(coords, cat_int=range(4)), dims=(dims + ['cat_int'])
+    )
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.mean('i32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.mean('i64')))
+    assert_eq_xr(agg, out)
+
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.mean(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('df', dfs)
 def test_categorical_var(df):
     if cudf and isinstance(df, cudf.DataFrame):
@@ -332,6 +426,27 @@ def test_categorical_var(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.var('f64')))
     assert_eq_xr(agg, out, True)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=OrderedDict(coords, cat_int=range(4)), dims=(dims + ['cat_int'])
+    )
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.var('f32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.var('f64')))
+    assert_eq_xr(agg, out)
+
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.var(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('df', dfs)
 def test_categorical_std(df):
     if cudf and isinstance(df, cudf.DataFrame):
@@ -356,6 +471,27 @@ def test_categorical_std(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.std('f64')))
     assert_eq_xr(agg, out, True)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=OrderedDict(coords, cat_int=range(4)), dims=(dims + ['cat_int'])
+    )
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.std('f32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.std('f64')))
+    assert_eq_xr(agg, out)
+
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.std(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('df', dfs)
 def test_multiple_aggregates(df):
     agg = c.points(df, 'x', 'y',