From 12aa6f9a87e6466dedcb5cce1b59bbcf30799cbe Mon Sep 17 00:00:00 2001
From: Oleg Smirnov
Date: Fri, 29 May 2020 19:43:53 +0200
Subject: [PATCH 1/5] implements category_modulo and category_binning for by(), as suggested in #907

---
 datashader/reductions.py | 186 ++++++++++++++++++++++++++++++++-------
 1 file changed, 153 insertions(+), 33 deletions(-)

diff --git a/datashader/reductions.py b/datashader/reductions.py
index 3398a0275..f43d3d522 100644
--- a/datashader/reductions.py
+++ b/datashader/reductions.py
@@ -43,39 +43,155 @@ def apply(self, df):
         else:
             return df[self.column].values
 
+class CategoryPreprocess(Preprocess):
+    """Base class for categorizing preprocessors."""
+    @property
+    def cat_column(self):
+        """Returns name of categorized column"""
+        return self.column
+
+    def categories(self, input_dshape):
+        """Returns list of categories corresponding to input shape"""
+        raise NotImplementedError("categories not implemented")
+
+    def validate(self, in_dshape):
+        """Validates input shape"""
+        raise NotImplementedError("validate not implemented")
+
+    def apply(self, df):
+        """Applies preprocessor to DataFrame and returns array"""
+        raise NotImplementedError("apply not implemented")
 
-class category_codes(Preprocess):
+class category_codes(CategoryPreprocess):
     """Extract just the category codes from a categorical column."""
+    def categories(self, input_dshape):
+        return input_dshape.measure[self.column].categories
+
+    def validate(self, in_dshape):
+        if not self.column in in_dshape.dict:
+            raise ValueError("specified column not found")
+        if not isinstance(in_dshape.measure[self.column], ct.Categorical):
+            raise ValueError("input must be categorical")
+
     def apply(self, df):
         if cudf and isinstance(df, cudf.DataFrame):
             return df[self.column].cat.codes.to_gpu_array()
         else:
             return df[self.column].cat.codes.values
 
-class category_values(Preprocess):
-    """Extract multiple columns from a dataframe as a numpy array of values."""
-    def __init__(self, columns):
-        self.columns = list(columns)
+class category_modulo(category_codes):
+    """
+    A variation on category_codes that assigns categories using an integer column, modulo a base.
+    Category is computed as (column_value - offset)%modulo.
+    """
+
+    # couldn't find anything in the datashape docs about how to check if a CType is an integer, so just define a big set
+    IntegerTypes = {ct.bool_, ct.uint8, ct.uint16, ct.uint32, ct.uint64, ct.int8, ct.int16, ct.int32, ct.int64}
+
+    def __init__(self, column, modulo, offset=0):
+        super().__init__(column)
+        self.bin0 = offset
+        self.modulo = modulo
+
+    def categories(self, in_dshape):
+        return list(range(self.modulo))
+
+    def validate(self, in_dshape):
+        if not self.column in in_dshape.dict:
+            raise ValueError("specified column not found")
+        if in_dshape.measure[self.column] not in self.IntegerTypes:
+            raise ValueError("input must be an integer column")
+
+    def apply(self, df):
+        if cudf and isinstance(df, cudf.DataFrame):
+            ## dunno how to do this in CUDA, is it as simple as this?
+            # return ((df[column] - offset) % modulo).to_gpu_array()
+            raise NotImplementedError("this feature is not implemented in cudf")
+        else:
+            return (df[self.column].values - self.bin0) % self.modulo
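To see what the modulo mapping above computes, here is a minimal standalone sketch of the same arithmetic outside datashader (the `ids` array is invented for illustration; the real class reads the column from the DataFrame):

```python
import numpy as np

# category = (column_value - offset) % modulo, as in category_modulo.apply()
ids = np.array([10, 11, 12, 13, 14, 25])
offset, modulo = 10, 4

codes = (ids - offset) % modulo
print(codes)  # [0 1 2 3 0 3] -> ids map cyclically onto 4 categories
```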
+class category_binning(category_modulo):
+    """
+    A variation on category_codes that assigns categories by binning a continuously-valued column.
+    The number of categories returned is always nbins+1.
+    The last category (nbins) is for NaNs in the data column, as well as for values under/over the binned
+    interval (when include_under or include_over is False).
+
+    Parameters
+    ----------
+    column: column to use
+    bin0: lower bound of first bin
+    binsize: bin size
+    nbins: number of bins
+    include_under: if True, values below bin 0 are assigned to category 0
+    include_over: if True, values above the last bin (nbins-1) are assigned to category nbins-1
+    """
+
+    def __init__(self, column, bin0, binsize, nbins, include_under=True, include_over=True):
+        super().__init__(column, nbins + 1)  # +1 category for NaNs and clipped values
+        self.bin0 = bin0
+        self.binsize = binsize
+        self.nbins = nbins
+        self.bin_under = 0 if include_under else nbins
+        self.bin_over = nbins-1 if include_over else nbins
+
+    def validate(self, in_dshape):
+        if not self.column in in_dshape.dict:
+            raise ValueError("specified column not found")
+
+    def apply(self, df):
+        """
+        Helper function. Takes a DataFrame column, modulo integer value
+        """
+        if cudf and isinstance(df, cudf.DataFrame):
+            ## dunno how to do this in CUDA
+            raise NotImplementedError("this feature is not implemented in cudf")
+        else:
+            value = df[self.column].values
+            index = ((value - self.bin0) / self.binsize).astype(np.uint32)
+            index[index < 0] = self.bin_under
+            index[index >= self.nbins] = self.bin_over
+            index[np.isnan(value)] = self.nbins
+            return index
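A standalone sketch of the binning rule above, with invented values. It uses a signed integer cast so the under-range branch can actually fire (the `np.uint32` cast in the hunk above turns out to be a bug, fixed in patch 2 of this series), and it fills NaNs before casting to avoid an invalid-cast warning:

```python
import numpy as np

value = np.array([-7.0, 2.0, 7.0, 19.0, 25.0, np.nan])
bin0, binsize, nbins = 0.0, 5.0, 4   # bins [0,5), [5,10), [10,15), [15,20)

nan_mask = np.isnan(value)
index = ((np.nan_to_num(value) - bin0) / binsize).astype(int)
index[index < 0] = 0                 # include_under=True: clamp to first bin
index[index >= nbins] = nbins - 1    # include_over=True: clamp to last bin
index[nan_mask] = nbins              # NaNs land in the extra category
print(index)  # [0 0 1 3 3 4]
```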
""" - def __init__(self, cat_column, reduction): - self.columns = (cat_column, getattr(reduction, 'column', None)) + def __init__(self, cats, reduction): + # set basic categorizer + if isinstance(cats, CategoryPreprocess): + self.categorizer = cats + elif isinstance(cats, str): + self.categorizer = category_codes(cats) + else: + raise TypeError("first argument must be a column name or a CategoryPreprocess instance") + self.column = self.categorizer.column # for backwards compatibility with count_cat + self.columns = (self.categorizer.column, getattr(reduction, 'column', None)) self.reduction = reduction - self.column = cat_column # for backwards compatibility with count_cat - + # if a value column is supplied, set category_values preprocessor + if self.val_column is not None: + self.preprocess = category_values(self.categorizer, self.val_column) + else: + self.preprocess = self.categorizer + def __hash__(self): return hash((type(self), self._hashable_inputs(), self.reduction)) def _build_temps(self, cuda=False): - return tuple(by(self.cat_column, tmp) for tmp in self.reduction._build_temps(cuda)) + return tuple(by(self.categorizer, tmp) for tmp in self.reduction._build_temps(cuda)) @property def cat_column(self): @@ -169,24 +296,17 @@ def val_column(self): return self.columns[1] def validate(self, in_dshape): - if not self.cat_column in in_dshape.dict: - raise ValueError("specified column not found") - if not isinstance(in_dshape.measure[self.cat_column], ct.Categorical): - raise ValueError("input must be categorical") - + self.preprocess.validate(in_dshape) self.reduction.validate(in_dshape) def out_dshape(self, input_dshape): - cats = input_dshape.measure[self.cat_column].categories + cats = self.categorizer.categories(input_dshape) red_shape = self.reduction.out_dshape(input_dshape) return dshape(Record([(c, red_shape) for c in cats])) @property def inputs(self): - if self.val_column is not None: - return (category_values(self.columns),) - else: - return (category_codes(self.columns[0]),) + return (self.preprocess, ) def _build_create(self, out_dshape): n_cats = len(out_dshape.measure.fields) @@ -197,7 +317,7 @@ def _build_bases(self, cuda=False): bases = self.reduction._build_bases(cuda) if len(bases) == 1 and bases[0] is self: return bases - return tuple(by(self.cat_column, base) for base in bases) + return tuple(by(self.categorizer, base) for base in bases) def _build_append(self, dshape, schema, cuda=False): return self.reduction._build_append(dshape, schema, cuda) @@ -206,7 +326,7 @@ def _build_combine(self, dshape): return self.reduction._combine def _build_finalize(self, dshape): - cats = list(dshape[self.cat_column].categories) + cats = self.categorizer.categories(dshape) def finalize(bases, cuda=False, **kwargs): kwargs['dims'] += [self.cat_column] From 457d6093341fcaf4643fd074338485437ba363a7 Mon Sep 17 00:00:00 2001 From: Oleg Smirnov Date: Sat, 30 May 2020 17:08:08 +0200 Subject: [PATCH 2/5] Fix broken category_codes. Replace uint with int for bin index. 
From 457d6093341fcaf4643fd074338485437ba363a7 Mon Sep 17 00:00:00 2001
From: Oleg Smirnov
Date: Sat, 30 May 2020 17:08:08 +0200
Subject: [PATCH 2/5] Fix broken category_codes. Replace uint with int for bin index.

---
 datashader/reductions.py | 16 ++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/datashader/reductions.py b/datashader/reductions.py
index f43d3d522..c08df258d 100644
--- a/datashader/reductions.py
+++ b/datashader/reductions.py
@@ -103,12 +103,11 @@ def validate(self, in_dshape):
             raise ValueError("input must be an integer column")
 
     def apply(self, df):
+        result = (df[self.column] - self.bin0) % self.modulo
         if cudf and isinstance(df, cudf.DataFrame):
-            ## dunno how to do this in CUDA, is it as simple as this?
-            # return ((df[column] - offset) % modulo).to_gpu_array()
-            raise NotImplementedError("this feature is not implemented in cudf")
+            return result.to_gpu_array()
         else:
-            return (df[self.column].values - self.bin0) % self.modulo
+            return result.values
 
 class category_binning(category_modulo):
     """
@@ -140,15 +139,12 @@ def validate(self, in_dshape):
             raise ValueError("specified column not found")
 
     def apply(self, df):
-        """
-        Helper function. Takes a DataFrame column, modulo integer value
-        """
         if cudf and isinstance(df, cudf.DataFrame):
             ## dunno how to do this in CUDA
-            raise NotImplementedError("this feature is not implemented in cudf")
+            raise NotImplementedError("this feature is not implemented in cuda")
         else:
             value = df[self.column].values
-            index = ((value - self.bin0) / self.binsize).astype(np.uint32)
+            index = ((value - self.bin0) / self.binsize).astype(int)
             index[index < 0] = self.bin_under
             index[index >= self.nbins] = self.bin_over
             index[np.isnan(value)] = self.nbins
             return index
@@ -326,7 +322,7 @@ def _build_combine(self, dshape):
         return self.reduction._combine
 
     def _build_finalize(self, dshape):
-        cats = self.categorizer.categories(dshape)
+        cats = list(self.categorizer.categories(dshape))
 
         def finalize(bases, cuda=False, **kwargs):
             kwargs['dims'] += [self.cat_column]
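The uint part of this fix matters because casting a negative float to `np.uint32` wraps it around to a large positive number (the exact wrapped value is platform-dependent), so the `index < 0` under-range check could never fire. A quick sketch of the failure mode:

```python
import numpy as np

raw = (np.array([-7.0]) - 0.0) / 5.0   # -1.4: below the first bin

as_uint = raw.astype(np.uint32)   # wraps to a large positive value
as_int = raw.astype(int)          # stays negative: [-1]

print(as_uint < 0)  # [False] -> an unsigned index can never be negative
print(as_int < 0)   # [ True] -> with int, the under-range clamp works
```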
From c3758ae19f8b958f79d61e271fde7293d7f4fe9c Mon Sep 17 00:00:00 2001
From: Oleg Smirnov
Date: Tue, 16 Jun 2020 19:11:03 +0200
Subject: [PATCH 3/5] added categorizer state to its hashable inputs

---
 datashader/reductions.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/datashader/reductions.py b/datashader/reductions.py
index c08df258d..a52f1527d 100644
--- a/datashader/reductions.py
+++ b/datashader/reductions.py
@@ -90,9 +90,12 @@ class category_modulo(category_codes):
 
     def __init__(self, column, modulo, offset=0):
         super().__init__(column)
-        self.bin0 = offset
+        self.offset = offset
         self.modulo = modulo
 
+    def _hashable_inputs(self):
+        return super()._hashable_inputs() + (self.offset, self.modulo)
+
     def categories(self, in_dshape):
         return list(range(self.modulo))
 
@@ -103,7 +106,7 @@ def validate(self, in_dshape):
             raise ValueError("input must be an integer column")
 
     def apply(self, df):
-        result = (df[self.column] - self.bin0) % self.modulo
+        result = (df[self.column] - self.offset) % self.modulo
         if cudf and isinstance(df, cudf.DataFrame):
             return result.to_gpu_array()
         else:
             return result.values
@@ -134,6 +137,9 @@ def __init__(self, column, bin0, binsize, nbins, include_under=True, include_ove
         self.bin_under = 0 if include_under else nbins
         self.bin_over = nbins-1 if include_over else nbins
 
+    def _hashable_inputs(self):
+        return super()._hashable_inputs() + (self.bin0, self.binsize, self.bin_under, self.bin_over)
+
     def validate(self, in_dshape):
         if not self.column in in_dshape.dict:
             raise ValueError("specified column not found")
@@ -278,7 +284,7 @@ def __init__(self, cats, reduction):
             self.preprocess = self.categorizer
 
     def __hash__(self):
-        return hash((type(self), self._hashable_inputs(), self.reduction))
+        return hash((type(self), self._hashable_inputs(), self.categorizer._hashable_inputs(), self.reduction))
 
     def _build_temps(self, cuda=False):
         return tuple(by(self.categorizer, tmp) for tmp in self.reduction._build_temps(cuda))
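The motivation: datashader reductions are hashed (e.g. to deduplicate work when an aggregation is compiled), so two categorizers that differ only in their parameters must not hash equal. A sketch of what this commit is meant to guarantee (the constructor arguments mirror the tests added later in this series; the `print` is an invented check, not part of the patch):

```python
import datashader as ds
from datashader.reductions import category_modulo

a = ds.by(category_modulo('cat_int', modulo=4, offset=10), ds.count())
b = ds.by(category_modulo('cat_int', modulo=4, offset=0), ds.count())

# offset now feeds _hashable_inputs(), so the two aggregations
# should no longer collide and be conflated by caching:
print(hash(a) != hash(b))  # expected: True
```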
""" - def __init__(self, cats, reduction): + def __init__(self, cat_column, reduction): # set basic categorizer - if isinstance(cats, CategoryPreprocess): - self.categorizer = cats - elif isinstance(cats, str): - self.categorizer = category_codes(cats) + if isinstance(cat_column, CategoryPreprocess): + self.categorizer = cat_column + elif isinstance(cat_column, str): + self.categorizer = category_codes(cat_column) else: raise TypeError("first argument must be a column name or a CategoryPreprocess instance") self.column = self.categorizer.column # for backwards compatibility with count_cat @@ -863,4 +863,5 @@ def inputs(self): __all__ = list(set([_k for _k,_v in locals().items() if isinstance(_v,type) and (issubclass(_v,Reduction) or _v is summary) and _v not in [Reduction, OptionalFieldReduction, - FloatingReduction, m2]])) + FloatingReduction, m2]])) + \ + ['category_modulo', 'category_binning'] diff --git a/datashader/tests/test_dask.py b/datashader/tests/test_dask.py index b8c17fb0e..439a0246f 100644 --- a/datashader/tests/test_dask.py +++ b/datashader/tests/test_dask.py @@ -41,7 +41,8 @@ 'f32': np.arange(20, dtype='f4'), 'f64': np.arange(20, dtype='f8'), 'empty_bin': np.array([0.] * 15 + [np.nan] * 5), - 'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5}) + 'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5, + 'cat_int': np.array([10]*5 + [11]*5 + [12]*5 + [13]*5)}) df_pd.cat = df_pd.cat.astype('category') df_pd.at[2,'f32'] = np.nan df_pd.at[2,'f64'] = np.nan @@ -236,6 +237,40 @@ def test_count_cat(ddf): agg = c.points(ddf, 'x', 'y', ds.count_cat('cat')) assert_eq_xr(agg, out) + # categorizing by (cat_int-10)%4 ought to give the same result + out = xr.DataArray( + sol, coords=(coords + [range(4)]), dims=(dims + ['cat_int']) + ) + agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.count())) + assert_eq_xr(agg, out) + + # easier to write these tests in here, since we expect the same result with only slight tweaks + + # add an extra category (this will count nans and out of bounds) + sol = np.append(sol, [[[0], [0]],[[0], [0]]], axis=2) + + # categorizing by binning the integer arange columns using [0,20] into 4 bins. Same result as for count_cat + for col in 'i32', 'i64': + out = xr.DataArray( + sol, coords=(coords + [range(5)]), dims=(dims + [col]) + ) + agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.count())) + assert_eq_xr(agg, out) + + # as above, but for the float arange columns. 
diff --git a/datashader/tests/test_dask.py b/datashader/tests/test_dask.py
index b8c17fb0e..439a0246f 100644
--- a/datashader/tests/test_dask.py
+++ b/datashader/tests/test_dask.py
@@ -41,7 +41,8 @@
         'f32': np.arange(20, dtype='f4'),
         'f64': np.arange(20, dtype='f8'),
         'empty_bin': np.array([0.] * 15 + [np.nan] * 5),
-        'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5})
+        'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5,
+        'cat_int': np.array([10]*5 + [11]*5 + [12]*5 + [13]*5)})
 df_pd.cat = df_pd.cat.astype('category')
 df_pd.at[2,'f32'] = np.nan
 df_pd.at[2,'f64'] = np.nan
@@ -236,6 +237,40 @@ def test_count_cat(ddf):
     agg = c.points(ddf, 'x', 'y', ds.count_cat('cat'))
     assert_eq_xr(agg, out)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=(coords + [range(4)]), dims=(dims + ['cat_int'])
+    )
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.count()))
+    assert_eq_xr(agg, out)
+
+    # easier to write these tests in here, since we expect the same result with only slight tweaks
+
+    # add an extra category (this will count nans and out of bounds)
+    sol = np.append(sol, [[[0], [0]],[[0], [0]]], axis=2)
+
+    # categorizing by binning the integer arange columns using [0,20] into 4 bins. Same result as for count_cat
+    for col in 'i32', 'i64':
+        out = xr.DataArray(
+            sol, coords=(coords + [range(5)]), dims=(dims + [col])
+        )
+        agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.count()))
+        assert_eq_xr(agg, out)
+
+    # as above, but for the float arange columns. Element 2 has a nan, so the first bin is one short, and the nan bin is +1
+    sol[0, 0, 0] = 4
+    sol[0, 0, 4] = 1
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=(coords + [range(5)]), dims=(dims + [col])
+        )
+        agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.count()))
+        assert_eq_xr(agg, out)
+
+
+
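For reference, the `np.append(..., axis=2)` lines in these tests simply tack one extra plane onto the expected solution for the NaN/out-of-range category; a compact equivalent of the literal nesting used above:

```python
import numpy as np

sol = np.ones((2, 2, 4))               # expected counts for 4 regular categories
extra = np.zeros((2, 2, 1))            # the extra NaN/out-of-range category
sol5 = np.append(sol, extra, axis=2)   # same as np.append(sol, [[[0], [0]], [[0], [0]]], axis=2)
print(sol5.shape)  # (2, 2, 5)
```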
 @pytest.mark.parametrize('ddf', ddfs)
 def test_categorical_sum(ddf):
     sol = np.array([[[ 10, nan, nan, nan],
@@ -251,6 +286,15 @@ def test_categorical_sum(ddf):
     agg = c.points(ddf, 'x', 'y', ds.by('cat', ds.sum('i64')))
     assert_eq_xr(agg, out)
 
+    out = xr.DataArray(
+        sol, coords=(coords + [range(4)]), dims=(dims + ['cat_int'])
+    )
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.sum('i32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.sum('i64')))
+    assert_eq_xr(agg, out)
+
     sol = np.array([[[8.0, nan, nan, nan],
                      [nan, nan, 60.0, nan]],
                     [[nan, 35.0, nan, nan],
@@ -264,6 +308,17 @@ def test_categorical_sum(ddf):
     agg = c.points(ddf, 'x', 'y', ds.by('cat', ds.sum('f64')))
     assert_eq_xr(agg, out)
 
+    # add an extra category (this will count nans and out of bounds)
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=(coords + [range(5)]), dims=(dims + [col])
+        )
+        agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.sum(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('ddf', ddfs)
 def test_categorical_mean(ddf):
     sol = np.array([[[ 2, nan, nan, nan],
@@ -281,6 +336,27 @@ def test_categorical_mean(ddf):
     agg = c.points(ddf, 'x', 'y', ds.by('cat', ds.mean('f64')))
     assert_eq_xr(agg, out)
 
+    out = xr.DataArray(
+        sol, coords=(coords + [range(4)]), dims=(dims + ['cat_int'])
+    )
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.mean('f32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.mean('f64')))
+    assert_eq_xr(agg, out)
+
+    # add an extra category (this will count nans and out of bounds)
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=(coords + [range(5)]), dims=(dims + [col])
+        )
+        agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.mean(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('ddf', ddfs)
 def test_categorical_var(ddf):
     if cudf and isinstance(ddf._meta, cudf.DataFrame):
@@ -303,6 +379,27 @@ def test_categorical_var(ddf):
     agg = c.points(ddf, 'x', 'y', ds.by('cat', ds.var('f64')))
     assert_eq_xr(agg, out, True)
 
+    out = xr.DataArray(
+        sol, coords=(coords + [range(4)]), dims=(dims + ['cat_int'])
+    )
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.var('f32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.var('f64')))
+    assert_eq_xr(agg, out)
+
+    # add an extra category (this will count nans and out of bounds)
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=(coords + [range(5)]), dims=(dims + [col])
+        )
+        agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.var(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('ddf', ddfs)
 def test_categorical_std(ddf):
     if cudf and isinstance(ddf._meta, cudf.DataFrame):
@@ -327,6 +424,26 @@ def test_categorical_std(ddf):
     agg = c.points(ddf, 'x', 'y', ds.by('cat', ds.std('f64')))
     assert_eq_xr(agg, out, True)
 
+    out = xr.DataArray(
+        sol, coords=(coords + [range(4)]), dims=(dims + ['cat_int'])
+    )
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.std('f32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(ddf, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.std('f64')))
+    assert_eq_xr(agg, out)
+
+    # add an extra category (this will count nans and out of bounds)
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=(coords + [range(5)]), dims=(dims + [col])
+        )
+        agg = c.points(ddf, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.std(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('ddf', ddfs)
 def test_multiple_aggregates(ddf):
     if dask_cudf and isinstance(ddf, dask_cudf.DataFrame):
diff --git a/datashader/tests/test_pandas.py b/datashader/tests/test_pandas.py
index 03d710f8e..5073021b5 100644
--- a/datashader/tests/test_pandas.py
+++ b/datashader/tests/test_pandas.py
@@ -22,7 +22,8 @@
         'f32': np.arange(20, dtype='f4'),
         'f64': np.arange(20, dtype='f8'),
         'empty_bin': np.array([0.] * 15 + [np.nan] * 5),
-        'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5})
+        'cat': ['a']*5 + ['b']*5 + ['c']*5 + ['d']*5,
+        'cat_int': np.array([10]*5 + [11]*5 + [12]*5 + [13]*5)})
 df_pd.cat = df_pd.cat.astype('category')
 df_pd.at[2,'f32'] = nan
 df_pd.at[2,'f64'] = nan
@@ -250,6 +251,36 @@ def test_categorical_count(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.count('i32')))
     assert_eq_xr(agg, out)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=OrderedDict(coords, cat_int=range(4)), dims=(dims + ['cat_int'])
+    )
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.count()))
+    assert_eq_xr(agg, out)
+
+    # add an extra category (this will count nans and out of bounds)
+    sol = np.append(sol, [[[0], [0]],[[0], [0]]], axis=2)
+
+    # categorizing by binning the integer arange columns using [0,20] into 4 bins. Same result as for count_cat
+    for col in 'i32', 'i64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.count()))
+        assert_eq_xr(agg, out)
+
+    # as above, but for the float arange columns. Element 2 has a nan, so the first bin is one short, and the nan bin is +1
+    sol[0, 0, 0] = 4
+    sol[0, 0, 4] = 1
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.count()))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('df', dfs)
 def test_categorical_sum(df):
     sol = np.array([[[ 10, nan, nan, nan],
@@ -266,6 +297,17 @@ def test_categorical_sum(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.sum('i64')))
     assert_eq_xr(agg, out)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=OrderedDict(coords, cat_int=range(4)), dims=(dims + ['cat_int'])
+    )
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.sum('i32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.sum('i64')))
+    assert_eq_xr(agg, out)
+
     sol = np.array([[[8.0, nan, nan, nan],
                      [nan, nan, 60.0, nan]],
                     [[nan, 35.0, nan, nan],
@@ -280,6 +322,16 @@ def test_categorical_sum(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.sum('f64')))
     assert_eq_xr(agg, out)
 
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.sum(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('df', dfs)
 def test_categorical_max(df):
     sol = np.array([[[ 4, nan, nan, nan],
@@ -293,6 +345,27 @@ def test_categorical_max(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.max('i32')))
     assert_eq_xr(agg, out)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=OrderedDict(coords, cat_int=range(4)), dims=(dims + ['cat_int'])
+    )
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.max('i32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.max('i64')))
+    assert_eq_xr(agg, out)
+
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.max(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('df', dfs)
 def test_categorical_mean(df):
     sol = np.array([[[ 2, nan, nan, nan],
@@ -310,6 +383,27 @@ def test_categorical_mean(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.mean('f64')))
     assert_eq_xr(agg, out)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=OrderedDict(coords, cat_int=range(4)), dims=(dims + ['cat_int'])
+    )
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.mean('i32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.mean('i64')))
+    assert_eq_xr(agg, out)
+
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.mean(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('df', dfs)
 def test_categorical_var(df):
     if cudf and isinstance(df, cudf.DataFrame):
@@ -332,6 +426,27 @@ def test_categorical_var(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.var('f64')))
     assert_eq_xr(agg, out, True)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=OrderedDict(coords, cat_int=range(4)), dims=(dims + ['cat_int'])
+    )
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.var('f32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.var('f64')))
+    assert_eq_xr(agg, out)
+
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.var(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('df', dfs)
 def test_categorical_std(df):
     if cudf and isinstance(df, cudf.DataFrame):
@@ -356,6 +471,27 @@ def test_categorical_std(df):
     agg = c.points(df, 'x', 'y', ds.by('cat', ds.std('f64')))
     assert_eq_xr(agg, out, True)
 
+    # categorizing by (cat_int-10)%4 ought to give the same result
+    out = xr.DataArray(
+        sol, coords=OrderedDict(coords, cat_int=range(4)), dims=(dims + ['cat_int'])
+    )
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.std('f32')))
+    assert_eq_xr(agg, out)
+
+    agg = c.points(df, 'x', 'y', ds.by(ds.category_modulo('cat_int', modulo=4, offset=10), ds.std('f64')))
+    assert_eq_xr(agg, out)
+
+    sol = np.append(sol, [[[nan], [nan]],[[nan], [nan]]], axis=2)
+
+    for col in 'f32', 'f64':
+        out = xr.DataArray(
+            sol, coords=OrderedDict(coords, **{col: range(5)}), dims=(dims + [col])
+        )
+        agg = c.points(df, 'x', 'y', ds.by(ds.category_binning(col, 0, 20, 4), ds.std(col)))
+        assert_eq_xr(agg, out)
+
+
 @pytest.mark.parametrize('df', dfs)
 def test_multiple_aggregates(df):
     agg = c.points(df, 'x', 'y',