From 09671fde13536e3784ca2d3752c3ef8c2907f4e4 Mon Sep 17 00:00:00 2001
From: gbrener <gregshipssoftware@gmail.com>
Date: Thu, 4 May 2017 19:20:19 -0500
Subject: [PATCH 01/11] Optimize max/min operations

Optimize the operations that datashader uses to calculate the x_range and y_range variables when the ranges are not provided by the user. Add a caching feature that avoids recalculating x_range and y_range when the Canvas is reused for multiple aggregations. The caching feature can be turned off when instantiating the Canvas object (cache_ranges=False), or the cached ranges can be invalidated by calling cvs.points(..., recalc_ranges=True) or cvs.line(..., recalc_ranges=True).
---
 datashader/core.py    | 42 +++++++++++++++++++++++++++++++++++-------
 datashader/dask.py    |  9 +++++++--
 datashader/glyphs.py  |  6 ++++--
 datashader/pandas.py  |  9 +++++++--
 examples/filetimes.py | 15 ++++++++++-----
 5 files changed, 63 insertions(+), 18 deletions(-)

diff --git a/datashader/core.py b/datashader/core.py
index 309bb6d82..eb18b27a9 100644
--- a/datashader/core.py
+++ b/datashader/core.py
@@ -138,23 +138,35 @@ class Canvas(object):
     plot_width, plot_height : int, optional
         Width and height of the output aggregate in pixels.
     x_range, y_range : tuple, optional
-        A tuple representing the bounds inclusive space ``[min, max]`` along
-        the axis.
+        A tuple representing the bounds inclusive space ``[min, max]`` along the
+        axis. These will be calculated and cached by datashader during the first
+        aggregation, if not provided. The cached values may be
+        invalidated/recalculated by providing the ``recalc_ranges`` keyword
+        argument to the ``Canvas.points()`` and/or ``Canvas.line()`` methods.
     x_axis_type, y_axis_type : str, optional
         The type of the axis. Valid options are ``'linear'`` [default], and
         ``'log'``.
+    cache_ranges : bool, optional
+        Whether to cache x_range and y_range variables when they get
+        updated. Default is ``True``. This provides a speedup by relying on the
+        x and y data remaining unchanged between data aggregations. If the x or
+        y data may have changed, one may either set this option to ``False``, or
+        provide the ``recalc_ranges=True`` when calling ``Canvas.points()``
+        and/or ``Canvas.line()``.
     """
     def __init__(self, plot_width=600, plot_height=600,
                  x_range=None, y_range=None,
-                 x_axis_type='linear', y_axis_type='linear'):
+                 x_axis_type='linear', y_axis_type='linear',
+                 cache_ranges=True):
         self.plot_width = plot_width
         self.plot_height = plot_height
-        self.x_range = tuple(x_range) if x_range is not None else x_range
-        self.y_range = tuple(y_range) if y_range is not None else y_range
+        self.x_range = tuple(x_range) if x_range is not None else None
+        self.y_range = tuple(y_range) if y_range is not None else None
         self.x_axis = _axis_lookup[x_axis_type]
         self.y_axis = _axis_lookup[y_axis_type]
+        self.cache_ranges = cache_ranges
 
-    def points(self, source, x, y, agg=None):
+    def points(self, source, x, y, agg=None, recalc_ranges=False):
         """Compute a reduction by pixel, mapping data to pixels as points.
 
         Parameters
@@ -165,14 +177,22 @@ def points(self, source, x, y, agg=None):
             Column names for the x and y coordinates of each point.
         agg : Reduction, optional
             Reduction to compute. Default is ``count()``.
+        recalc_ranges : bool, optional
+            Recalculate the ranges that datashader calculated/cached during the
+            first aggregation. Default is ``False``. This option should only be
+            used if the dataframe's x or y data was altered after the ``Canvas``
+            was created.
         """
         from .glyphs import Point
         from .reductions import count
+        if not self.cache_ranges or recalc_ranges:
+            self.x_range = None
+            self.y_range = None
         if agg is None:
             agg = count()
         return bypixel(source, self, Point(x, y), agg)
 
-    def line(self, source, x, y, agg=None):
+    def line(self, source, x, y, agg=None, recalc_ranges=False):
         """Compute a reduction by pixel, mapping data to pixels as a line.
 
         For aggregates that take in extra fields, the interpolated bins will
@@ -192,9 +212,17 @@ def line(self, source, x, y, agg=None):
             Column names for the x and y coordinates of each vertex.
         agg : Reduction, optional
             Reduction to compute. Default is ``any()``.
+        recalc_ranges : bool, optional
+            Recalculate the ranges that datashader calculated/cached during the
+            first aggregation. Default is ``False``. This option should only be
+            used if the dataframe's x or y data was altered after the ``Canvas``
+            was created.
         """
         from .glyphs import Line
         from .reductions import any
+        if not self.cache_ranges or recalc_ranges:
+            self.x_range = None
+            self.y_range = None
         if agg is None:
             agg = any()
         return bypixel(source, self, Line(x, y), agg)
diff --git a/datashader/dask.py b/datashader/dask.py
index 33657537d..c483f36ab 100644
--- a/datashader/dask.py
+++ b/datashader/dask.py
@@ -25,8 +25,13 @@ def dask_pipeline(df, schema, canvas, glyph, summary):
 
 
 def shape_bounds_st_and_axis(df, canvas, glyph):
-    x_range = canvas.x_range or glyph._compute_x_bounds(df)
-    y_range = canvas.y_range or glyph._compute_y_bounds(df)
+    # Cache the x and y ranges during the first aggregation
+    if canvas.x_range is None:
+        canvas.x_range = glyph._compute_x_bounds(df)
+    if canvas.y_range is None:
+        canvas.y_range = glyph._compute_y_bounds(df)
+    x_range = canvas.x_range
+    y_range = canvas.y_range
     x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
     x_range, y_range = (x_min, x_max), (y_min, y_max)
     width = canvas.plot_width
diff --git a/datashader/glyphs.py b/datashader/glyphs.py
index 321cc3cf7..1352c718c 100644
--- a/datashader/glyphs.py
+++ b/datashader/glyphs.py
@@ -29,10 +29,12 @@ def validate(self, in_dshape):
             raise ValueError('y must be real')
 
     def _compute_x_bounds(self, df):
-        return df[self.x].min(), df[self.x].max()
+        xs = df[self.x].values
+        return xs.min(), xs.max()
 
     def _compute_y_bounds(self, df):
-        return df[self.y].min(), df[self.y].max()
+        ys = df[self.y].values
+        return ys.min(), ys.max()
 
 
 class Point(_PointLike):
diff --git a/datashader/pandas.py b/datashader/pandas.py
index f631ff14e..c52fb5dc2 100644
--- a/datashader/pandas.py
+++ b/datashader/pandas.py
@@ -15,8 +15,13 @@ def pandas_pipeline(df, schema, canvas, glyph, summary):
     y_mapper = canvas.y_axis.mapper
     extend = glyph._build_extend(x_mapper, y_mapper, info, append)
 
-    x_range = canvas.x_range or glyph._compute_x_bounds(df)
-    y_range = canvas.y_range or glyph._compute_y_bounds(df)
+    # Cache the x and y ranges during the first aggregation
+    if canvas.x_range is None:
+        canvas.x_range = glyph._compute_x_bounds(df)
+    if canvas.y_range is None:
+        canvas.y_range = glyph._compute_y_bounds(df)
+    x_range = canvas.x_range
+    y_range = canvas.y_range
     width = canvas.plot_width
     height = canvas.plot_height
 
diff --git a/examples/filetimes.py b/examples/filetimes.py
index 3e47ad3e4..db2540a03 100755
--- a/examples/filetimes.py
+++ b/examples/filetimes.py
@@ -227,11 +227,15 @@ def timed_read(filepath,dftype):
     return df, duration
 
 
-def timed_agg(df, filepath, plot_width=int(900), plot_height=int(900*7.0/12)):
+CACHED_RANGES = (None, None)
+def timed_agg(df, filepath, plot_width=int(900), plot_height=int(900*7.0/12), recalc_ranges=False):
+    global CACHED_RANGES
     start = time.time()
-    cvs = ds.Canvas(plot_width, plot_height)
-    agg = cvs.points(df, p.x, p.y)
+    cvs = ds.Canvas(plot_width, plot_height, x_range=CACHED_RANGES[0], y_range=CACHED_RANGES[1])
+    agg = cvs.points(df, p.x, p.y, recalc_ranges=recalc_ranges)
     end = time.time()
+    if not recalc_ranges:
+        CACHED_RANGES = (cvs.x_range, cvs.y_range)
     img = export_image(tf.shade(agg),filepath,export_path=".")
     return img, end-start
 
@@ -273,6 +277,7 @@ def main(argv):
     parser.add_argument('--debug', action='store_true', help='Enable increased verbosity and DEBUG messages')
     parser.add_argument('--cache', choices=('persist', 'cachey'), default=None, help='Enable caching: "persist" causes Dask dataframes to force loading into memory; "cachey" uses dask.cache.Cache with a cachesize of {}. Caching is disabled by default'.format(int(p.cachesize)))
     parser.add_argument('--distributed', action='store_true', help='Enable the distributed scheduler instead of the threaded, which is the default.')
+    parser.add_argument('--recalc-ranges', action='store_true', help='Tell datashader to recalculate the ranges on each aggregation, instead of caching them (by default).')
     args = parser.parse_args(argv[1:])
 
     if args.cache is None:
@@ -321,7 +326,7 @@ def main(argv):
     if DEBUG:
         print('DEBUG: Memory usage (after read):\t{} MB'.format(get_proc_mem(), flush=True))
 
-    img,aggtime1 = timed_agg(df,filepath,5,5)
+    img,aggtime1 = timed_agg(df,filepath,5,5,recalc_ranges=args.recalc_ranges)
     if DEBUG:
         mem_usage = df.memory_usage(deep=True)
         if p.dftype == 'dask':
@@ -333,7 +338,7 @@ def main(argv):
             print('DEBUG: column "{}" dtype: {}'.format(colname, df[colname].dtype))
         print('DEBUG: Memory usage (after agg1):\t{} MB'.format(get_proc_mem(), flush=True))
 
-    img,aggtime2 = timed_agg(df,filepath)
+    img,aggtime2 = timed_agg(df,filepath,recalc_ranges=args.recalc_ranges)
     if DEBUG:
         print('DEBUG: Memory usage (after agg2):\t{} MB'.format(get_proc_mem(), flush=True))
     

From 206601ef4c6901042058dba7f56af92501fc8bcc Mon Sep 17 00:00:00 2001
From: gbrener <gbrener@continuum.io>
Date: Fri, 5 May 2017 16:20:44 -0500
Subject: [PATCH 02/11] Revert caching feature. min/max -> nanmin/nanmax

Remove the caching feature introduced in previous commits after discussion with Jim. Convert arr.min() and arr.max() calls to np.nanmin(arr) and np.nanmax(arr) to more closely emulate the NaN-handling behavior of df.min() and df.max().
---
 datashader/core.py    | 42 +++++++-----------------------------------
 datashader/dask.py    |  9 ++-------
 datashader/glyphs.py  |  4 ++--
 datashader/pandas.py  |  9 ++-------
 examples/filetimes.py |  2 +-
 5 files changed, 14 insertions(+), 52 deletions(-)

diff --git a/datashader/core.py b/datashader/core.py
index eb18b27a9..1c09af0d9 100644
--- a/datashader/core.py
+++ b/datashader/core.py
@@ -138,35 +138,23 @@ class Canvas(object):
     plot_width, plot_height : int, optional
         Width and height of the output aggregate in pixels.
     x_range, y_range : tuple, optional
-        A tuple representing the bounds inclusive space ``[min, max]`` along the
-        axis. These will be calculated and cached by datashader during the first
-        aggregation, if not provided. The cached values may be
-        invalidated/recalculated by providing the ``recalc_ranges`` keyword
-        argument to the ``Canvas.points()`` and/or ``Canvas.line()`` methods.
+        A tuple representing the bounds inclusive space ``[min, max]`` along
+        the axis.
     x_axis_type, y_axis_type : str, optional
         The type of the axis. Valid options are ``'linear'`` [default], and
         ``'log'``.
-    cache_ranges : bool, optional
-        Whether to cache x_range and y_range variables when they get
-        updated. Default is ``True``. This provides a speedup by relying on the
-        x and y data remaining unchanged between data aggregations. If the x or
-        y data may have changed, one may either set this option to ``False``, or
-        provide the ``recalc_ranges=True`` when calling ``Canvas.points()``
-        and/or ``Canvas.line()``.
     """
     def __init__(self, plot_width=600, plot_height=600,
                  x_range=None, y_range=None,
-                 x_axis_type='linear', y_axis_type='linear',
-                 cache_ranges=True):
+                 x_axis_type='linear', y_axis_type='linear'):
         self.plot_width = plot_width
         self.plot_height = plot_height
-        self.x_range = tuple(x_range) if x_range is not None else None
-        self.y_range = tuple(y_range) if y_range is not None else None
+        self.x_range = None if x_range is None else tuple(x_range)
+        self.y_range = None if y_range is None else tuple(y_range)
         self.x_axis = _axis_lookup[x_axis_type]
         self.y_axis = _axis_lookup[y_axis_type]
-        self.cache_ranges = cache_ranges
 
-    def points(self, source, x, y, agg=None, recalc_ranges=False):
+    def points(self, source, x, y, agg=None):
         """Compute a reduction by pixel, mapping data to pixels as points.
 
         Parameters
@@ -177,22 +165,14 @@ def points(self, source, x, y, agg=None, recalc_ranges=False):
             Column names for the x and y coordinates of each point.
         agg : Reduction, optional
             Reduction to compute. Default is ``count()``.
-        recalc_ranges : bool, optional
-            Recalculate the ranges that datashader calculated/cached during the
-            first aggregation. Default is ``False``. This option should only be
-            used if the dataframe's x or y data was altered after the ``Canvas``
-            was created.
         """
         from .glyphs import Point
         from .reductions import count
-        if not self.cache_ranges or recalc_ranges:
-            self.x_range = None
-            self.y_range = None
         if agg is None:
             agg = count()
         return bypixel(source, self, Point(x, y), agg)
 
-    def line(self, source, x, y, agg=None, recalc_ranges=False):
+    def line(self, source, x, y, agg=None):
         """Compute a reduction by pixel, mapping data to pixels as a line.
 
         For aggregates that take in extra fields, the interpolated bins will
@@ -212,17 +192,9 @@ def line(self, source, x, y, agg=None, recalc_ranges=False):
             Column names for the x and y coordinates of each vertex.
         agg : Reduction, optional
             Reduction to compute. Default is ``any()``.
-        recalc_ranges : bool, optional
-            Recalculate the ranges that datashader calculated/cached during the
-            first aggregation. Default is ``False``. This option should only be
-            used if the dataframe's x or y data was altered after the ``Canvas``
-            was created.
         """
         from .glyphs import Line
         from .reductions import any
-        if not self.cache_ranges or recalc_ranges:
-            self.x_range = None
-            self.y_range = None
         if agg is None:
             agg = any()
         return bypixel(source, self, Line(x, y), agg)
diff --git a/datashader/dask.py b/datashader/dask.py
index c483f36ab..33657537d 100644
--- a/datashader/dask.py
+++ b/datashader/dask.py
@@ -25,13 +25,8 @@ def dask_pipeline(df, schema, canvas, glyph, summary):
 
 
 def shape_bounds_st_and_axis(df, canvas, glyph):
-    # Cache the x and y ranges during the first aggregation
-    if canvas.x_range is None:
-        canvas.x_range = glyph._compute_x_bounds(df)
-    if canvas.y_range is None:
-        canvas.y_range = glyph._compute_y_bounds(df)
-    x_range = canvas.x_range
-    y_range = canvas.y_range
+    x_range = canvas.x_range or glyph._compute_x_bounds(df)
+    y_range = canvas.y_range or glyph._compute_y_bounds(df)
     x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
     x_range, y_range = (x_min, x_max), (y_min, y_max)
     width = canvas.plot_width
diff --git a/datashader/glyphs.py b/datashader/glyphs.py
index 1352c718c..fc1347bcc 100644
--- a/datashader/glyphs.py
+++ b/datashader/glyphs.py
@@ -30,11 +30,11 @@ def validate(self, in_dshape):
 
     def _compute_x_bounds(self, df):
         xs = df[self.x].values
-        return xs.min(), xs.max()
+        return np.nanmin(xs), np.nanmax(xs)
 
     def _compute_y_bounds(self, df):
         ys = df[self.y].values
-        return ys.min(), ys.max()
+        return np.nanmin(ys), np.nanmax(ys)
 
 
 class Point(_PointLike):
diff --git a/datashader/pandas.py b/datashader/pandas.py
index c52fb5dc2..f631ff14e 100644
--- a/datashader/pandas.py
+++ b/datashader/pandas.py
@@ -15,13 +15,8 @@ def pandas_pipeline(df, schema, canvas, glyph, summary):
     y_mapper = canvas.y_axis.mapper
     extend = glyph._build_extend(x_mapper, y_mapper, info, append)
 
-    # Cache the x and y ranges during the first aggregation
-    if canvas.x_range is None:
-        canvas.x_range = glyph._compute_x_bounds(df)
-    if canvas.y_range is None:
-        canvas.y_range = glyph._compute_y_bounds(df)
-    x_range = canvas.x_range
-    y_range = canvas.y_range
+    x_range = canvas.x_range or glyph._compute_x_bounds(df)
+    y_range = canvas.y_range or glyph._compute_y_bounds(df)
     width = canvas.plot_width
     height = canvas.plot_height
 
diff --git a/examples/filetimes.py b/examples/filetimes.py
index db2540a03..f482cb1cf 100755
--- a/examples/filetimes.py
+++ b/examples/filetimes.py
@@ -232,7 +232,7 @@ def timed_agg(df, filepath, plot_width=int(900), plot_height=int(900*7.0/12), re
     global CACHED_RANGES
     start = time.time()
     cvs = ds.Canvas(plot_width, plot_height, x_range=CACHED_RANGES[0], y_range=CACHED_RANGES[1])
-    agg = cvs.points(df, p.x, p.y, recalc_ranges=recalc_ranges)
+    agg = cvs.points(df, p.x, p.y)
     end = time.time()
     if not recalc_ranges:
         CACHED_RANGES = (cvs.x_range, cvs.y_range)

From 8945def49c1ee9858f4089bc952411d3463d7f70 Mon Sep 17 00:00:00 2001
From: gbrener <gbrener@continuum.io>
Date: Fri, 5 May 2017 17:43:57 -0500
Subject: [PATCH 03/11] Memoize x/y bounds computations

Use toolz.memoize - similar to how other code in glyphs.py is optimized - to cache the x/y bounds computations. This takes advantage of the fact that dask dataframes are immutable/hashable, and achieves the same result that the cache_ranges feature previously provided.
---
 datashader/glyphs.py  | 2 ++
 examples/filetimes.py | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/datashader/glyphs.py b/datashader/glyphs.py
index fc1347bcc..f40d5ac2e 100644
--- a/datashader/glyphs.py
+++ b/datashader/glyphs.py
@@ -28,10 +28,12 @@ def validate(self, in_dshape):
         elif not isreal(in_dshape.measure[self.y]):
             raise ValueError('y must be real')
 
+    @memoize
     def _compute_x_bounds(self, df):
         xs = df[self.x].values
         return np.nanmin(xs), np.nanmax(xs)
 
+    @memoize
     def _compute_y_bounds(self, df):
         ys = df[self.y].values
         return np.nanmin(ys), np.nanmax(ys)
diff --git a/examples/filetimes.py b/examples/filetimes.py
index f482cb1cf..cf6a18b20 100755
--- a/examples/filetimes.py
+++ b/examples/filetimes.py
@@ -228,13 +228,13 @@ def timed_read(filepath,dftype):
 
 
 CACHED_RANGES = (None, None)
-def timed_agg(df, filepath, plot_width=int(900), plot_height=int(900*7.0/12), recalc_ranges=False):
+def timed_agg(df, filepath, plot_width=int(900), plot_height=int(900*7.0/12), cache_ranges=True):
     global CACHED_RANGES
     start = time.time()
     cvs = ds.Canvas(plot_width, plot_height, x_range=CACHED_RANGES[0], y_range=CACHED_RANGES[1])
     agg = cvs.points(df, p.x, p.y)
     end = time.time()
-    if not recalc_ranges:
+    if cache_ranges:
         CACHED_RANGES = (cvs.x_range, cvs.y_range)
     img = export_image(tf.shade(agg),filepath,export_path=".")
     return img, end-start
@@ -326,7 +326,7 @@ def main(argv):
     if DEBUG:
         print('DEBUG: Memory usage (after read):\t{} MB'.format(get_proc_mem(), flush=True))
 
-    img,aggtime1 = timed_agg(df,filepath,5,5,recalc_ranges=args.recalc_ranges)
+    img,aggtime1 = timed_agg(df,filepath,5,5,cache_ranges=(not args.recalc_ranges))
     if DEBUG:
         mem_usage = df.memory_usage(deep=True)
         if p.dftype == 'dask':
@@ -338,7 +338,7 @@ def main(argv):
             print('DEBUG: column "{}" dtype: {}'.format(colname, df[colname].dtype))
         print('DEBUG: Memory usage (after agg1):\t{} MB'.format(get_proc_mem(), flush=True))
 
-    img,aggtime2 = timed_agg(df,filepath,recalc_ranges=args.recalc_ranges)
+    img,aggtime2 = timed_agg(df,filepath,cache_ranges=(not args.recalc_ranges))
     if DEBUG:
         print('DEBUG: Memory usage (after agg2):\t{} MB'.format(get_proc_mem(), flush=True))
     

From 53ce5f36378aed7cc6eb7d951f2b6491cb1612ce Mon Sep 17 00:00:00 2001
From: gbrener <gbrener@continuum.io>
Date: Fri, 5 May 2017 18:42:35 -0500
Subject: [PATCH 04/11] Only memoize the dask versions of max/min

The tests are failing because pandas DataFrames are not hashable. Rather than using dask.dataframe.hashing.hash_pandas_object, it is probably more efficient to simply recalculate the min/max. Therefore, memoization is applied only to the dask code path.
---
 datashader/dask.py   |  4 ++--
 datashader/glyphs.py | 19 +++++++++++++++++--
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/datashader/dask.py b/datashader/dask.py
index 33657537d..5969cf9ab 100644
--- a/datashader/dask.py
+++ b/datashader/dask.py
@@ -25,8 +25,8 @@ def dask_pipeline(df, schema, canvas, glyph, summary):
 
 
 def shape_bounds_st_and_axis(df, canvas, glyph):
-    x_range = canvas.x_range or glyph._compute_x_bounds(df)
-    y_range = canvas.y_range or glyph._compute_y_bounds(df)
+    x_range = canvas.x_range or glyph._compute_x_bounds_hashable(df)
+    y_range = canvas.y_range or glyph._compute_y_bounds_hashable(df)
     x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
     x_range, y_range = (x_min, x_max), (y_min, y_max)
     width = canvas.plot_width
diff --git a/datashader/glyphs.py b/datashader/glyphs.py
index f40d5ac2e..4725bafa5 100644
--- a/datashader/glyphs.py
+++ b/datashader/glyphs.py
@@ -1,6 +1,7 @@
 from __future__ import absolute_import, division
 
 from toolz import memoize
+from dask.dataframe.hashing import hash_pandas_object
 import numpy as np
 
 from .core import Expr
@@ -28,16 +29,30 @@ def validate(self, in_dshape):
         elif not isreal(in_dshape.measure[self.y]):
             raise ValueError('y must be real')
 
-    @memoize
     def _compute_x_bounds(self, df):
         xs = df[self.x].values
         return np.nanmin(xs), np.nanmax(xs)
 
-    @memoize
     def _compute_y_bounds(self, df):
         ys = df[self.y].values
         return np.nanmin(ys), np.nanmax(ys)
 
+    @memoize
+    def _compute_x_bounds_hashable(self, df):
+        """Same as ``PointLike._compute_x_bounds``, but memoized because
+        ``df`` is immutable/hashable (a Dask dataframe).
+        """
+        xs = df[self.x].values
+        return np.nanmin(xs), np.nanmax(xs)
+
+    @memoize
+    def _compute_y_bounds_hashable(self, df):
+        """Same as ``PointLike._compute_y_bounds``, but memoized because
+        ``df`` is immutable/hashable (a Dask dataframe).
+        """
+        ys = df[self.y].values
+        return np.nanmin(ys), np.nanmax(ys)
+
 
 class Point(_PointLike):
     """A point, with center at ``x`` and ``y``.

From 6e9851020a70a8f0e3801144107cbfa6952f0aaf Mon Sep 17 00:00:00 2001
From: gbrener <gbrener@continuum.io>
Date: Fri, 5 May 2017 19:23:30 -0500
Subject: [PATCH 05/11] Make a numba-ized version of bound computations

Since I know of no straightforward way to incorporate numba functions into dask graphs, there are now two versions of the min/max computations; the pandas ones use numba, and the dask ones use memoization (relying on dask dataframe immutability).
---
 datashader/dask.py   |  4 ++--
 datashader/glyphs.py | 38 +++++++++++++++++++++++++++-----------
 datashader/pandas.py |  4 ++--
 3 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/datashader/dask.py b/datashader/dask.py
index 5969cf9ab..80b4c266b 100644
--- a/datashader/dask.py
+++ b/datashader/dask.py
@@ -25,8 +25,8 @@ def dask_pipeline(df, schema, canvas, glyph, summary):
 
 
 def shape_bounds_st_and_axis(df, canvas, glyph):
-    x_range = canvas.x_range or glyph._compute_x_bounds_hashable(df)
-    y_range = canvas.y_range or glyph._compute_y_bounds_hashable(df)
+    x_range = canvas.x_range or glyph._compute_x_bounds_dask(df)
+    y_range = canvas.y_range or glyph._compute_y_bounds_dask(df)
     x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range))
     x_range, y_range = (x_min, x_max), (y_min, y_max)
     width = canvas.plot_width
diff --git a/datashader/glyphs.py b/datashader/glyphs.py
index 4725bafa5..6cb17cdd9 100644
--- a/datashader/glyphs.py
+++ b/datashader/glyphs.py
@@ -29,25 +29,41 @@ def validate(self, in_dshape):
         elif not isreal(in_dshape.measure[self.y]):
             raise ValueError('y must be real')
 
-    def _compute_x_bounds(self, df):
-        xs = df[self.x].values
-        return np.nanmin(xs), np.nanmax(xs)
-
-    def _compute_y_bounds(self, df):
-        ys = df[self.y].values
-        return np.nanmin(ys), np.nanmax(ys)
+    @staticmethod
+    @ngjit
+    def _compute_x_bounds_pandas(xs):
+        minval = maxval = xs[0]
+        for x in xs:
+            if not np.isnan(x):
+                if x < minval:
+                    minval = x
+                elif x > maxval:
+                    maxval = x
+        return minval, maxval
+
+    @staticmethod
+    @ngjit
+    def _compute_y_bounds_pandas(ys):
+        minval = maxval = ys[0]
+        for y in ys:
+            if not np.isnan(y):
+                if y < minval:
+                    minval = y
+                elif y > maxval:
+                    maxval = y
+        return minval, maxval
 
     @memoize
-    def _compute_x_bounds_hashable(self, df):
-        """Same as ``PointLike._compute_x_bounds``, but memoized because
+    def _compute_x_bounds_dask(self, df):
+        """Like ``PointLike._compute_x_bounds``, but memoized because
         ``df`` is immutable/hashable (a Dask dataframe).
         """
         xs = df[self.x].values
         return np.nanmin(xs), np.nanmax(xs)
 
     @memoize
-    def _compute_y_bounds_hashable(self, df):
-        """Same as ``PointLike._compute_y_bounds``, but memoized because
+    def _compute_y_bounds_dask(self, df):
+        """Like ``PointLike._compute_y_bounds``, but memoized because
         ``df`` is immutable/hashable (a Dask dataframe).
         """
         ys = df[self.y].values
diff --git a/datashader/pandas.py b/datashader/pandas.py
index f631ff14e..c4249fa73 100644
--- a/datashader/pandas.py
+++ b/datashader/pandas.py
@@ -15,8 +15,8 @@ def pandas_pipeline(df, schema, canvas, glyph, summary):
     y_mapper = canvas.y_axis.mapper
     extend = glyph._build_extend(x_mapper, y_mapper, info, append)
 
-    x_range = canvas.x_range or glyph._compute_x_bounds(df)
-    y_range = canvas.y_range or glyph._compute_y_bounds(df)
+    x_range = canvas.x_range or glyph._compute_x_bounds_pandas(df[glyph.x].values)
+    y_range = canvas.y_range or glyph._compute_y_bounds_pandas(df[glyph.y].values)
     width = canvas.plot_width
     height = canvas.plot_height
 

From 64eec97394bc9ddfcdde95fb6bb40868ca5961c1 Mon Sep 17 00:00:00 2001
From: gbrener <gbrener@continuum.io>
Date: Fri, 5 May 2017 19:44:39 -0500
Subject: [PATCH 06/11] Revert to old method names

Only the method names have changed (back to _compute_x_bounds and _compute_y_bounds), so that the tests can return to a passing state.
---
 datashader/glyphs.py | 4 ++--
 datashader/pandas.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/datashader/glyphs.py b/datashader/glyphs.py
index 6cb17cdd9..a819b15dc 100644
--- a/datashader/glyphs.py
+++ b/datashader/glyphs.py
@@ -31,7 +31,7 @@ def validate(self, in_dshape):
 
     @staticmethod
     @ngjit
-    def _compute_x_bounds_pandas(xs):
+    def _compute_x_bounds(xs):
         minval = maxval = xs[0]
         for x in xs:
             if not np.isnan(x):
@@ -43,7 +43,7 @@ def _compute_x_bounds_pandas(xs):
 
     @staticmethod
     @ngjit
-    def _compute_y_bounds_pandas(ys):
+    def _compute_y_bounds(ys):
         minval = maxval = ys[0]
         for y in ys:
             if not np.isnan(y):
diff --git a/datashader/pandas.py b/datashader/pandas.py
index c4249fa73..f3edfb668 100644
--- a/datashader/pandas.py
+++ b/datashader/pandas.py
@@ -15,8 +15,8 @@ def pandas_pipeline(df, schema, canvas, glyph, summary):
     y_mapper = canvas.y_axis.mapper
     extend = glyph._build_extend(x_mapper, y_mapper, info, append)
 
-    x_range = canvas.x_range or glyph._compute_x_bounds_pandas(df[glyph.x].values)
-    y_range = canvas.y_range or glyph._compute_y_bounds_pandas(df[glyph.y].values)
+    x_range = canvas.x_range or glyph._compute_x_bounds(df[glyph.x].values)
+    y_range = canvas.y_range or glyph._compute_y_bounds(df[glyph.y].values)
     width = canvas.plot_width
     height = canvas.plot_height
 

From 430a1b5ef103b7a6a3d32faeffb5b4c436d03940 Mon Sep 17 00:00:00 2001
From: gbrener <gbrener@continuum.io>
Date: Fri, 5 May 2017 19:51:38 -0500
Subject: [PATCH 07/11] Update test

Update the unit test so that it passes a numpy array to the numba function, instead of a dataframe as it did before.
---
 datashader/tests/test_glyphs.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datashader/tests/test_glyphs.py b/datashader/tests/test_glyphs.py
index 8d5256e5c..f6bd0e707 100644
--- a/datashader/tests/test_glyphs.py
+++ b/datashader/tests/test_glyphs.py
@@ -10,8 +10,8 @@
 def test_point_bounds_check():
     df = pd.DataFrame({'x': [1, 2, 3], 'y': [5, 6, 7]})
     p = Point('x', 'y')
-    assert p._compute_x_bounds(df) == (1, 3)
-    assert p._compute_y_bounds(df) == (5, 7)
+    assert p._compute_x_bounds(df['x'].values) == (1, 3)
+    assert p._compute_y_bounds(df['y'].values) == (5, 7)
 
 
 def test_point_validate():

From 54425e2379eb68cc0836a96ab94d11afd657270e Mon Sep 17 00:00:00 2001
From: Greg Brener <gbrener@users.noreply.github.com>
Date: Mon, 8 May 2017 00:11:14 -0500
Subject: [PATCH 08/11] Update glyphs.py

Add checks to bounds computations for cases where NaN(s) occur at the beginning of the x/y arrays, or the arrays are all NaNs, or arrays are empty.
---
 datashader/glyphs.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/datashader/glyphs.py b/datashader/glyphs.py
index a819b15dc..7f191b82f 100644
--- a/datashader/glyphs.py
+++ b/datashader/glyphs.py
@@ -1,7 +1,6 @@
 from __future__ import absolute_import, division
 
 from toolz import memoize
-from dask.dataframe.hashing import hash_pandas_object
 import numpy as np
 
 from .core import Expr
@@ -32,25 +31,33 @@ def validate(self, in_dshape):
     @staticmethod
     @ngjit
     def _compute_x_bounds(xs):
+        if len(xs) == 0:
+            raise ValueError('x coordinate array is empty.')
         minval = maxval = xs[0]
         for x in xs:
             if not np.isnan(x):
-                if x < minval:
+                if np.isnan(minval) or x < minval:
                     minval = x
-                elif x > maxval:
+                elif np.isnan(maxval) x > maxval:
                     maxval = x
+        if np.isnan(minval) or np.isnan(maxval):
+            raise ValueError('All x coordinates are NaN.')
         return minval, maxval
 
     @staticmethod
     @ngjit
     def _compute_y_bounds(ys):
+        if len(ys) == 0:
+            raise ValueError('y coordinate array is empty.')
         minval = maxval = ys[0]
         for y in ys:
             if not np.isnan(y):
-                if y < minval:
+                if np.isnan(minval) or y < minval:
                     minval = y
-                elif y > maxval:
+                elif np.isnan(maxval) or y > maxval:
                     maxval = y
+        if np.isnan(minval) or np.isnan(maxval):
+            raise ValueError('All y coordinates are NaN.')
         return minval, maxval
 
     @memoize

From a08193f430c063556a059a6c82b2297adb5bdde4 Mon Sep 17 00:00:00 2001
From: Greg Brener <gbrener@users.noreply.github.com>
Date: Mon, 8 May 2017 00:13:57 -0500
Subject: [PATCH 09/11] Fix typo

---
 datashader/glyphs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datashader/glyphs.py b/datashader/glyphs.py
index 7f191b82f..e579a0524 100644
--- a/datashader/glyphs.py
+++ b/datashader/glyphs.py
@@ -38,7 +38,7 @@ def _compute_x_bounds(xs):
             if not np.isnan(x):
                 if np.isnan(minval) or x < minval:
                     minval = x
-                elif np.isnan(maxval) x > maxval:
+                elif np.isnan(maxval) or x > maxval:
                     maxval = x
         if np.isnan(minval) or np.isnan(maxval):
             raise ValueError('All x coordinates are NaN.')

From 75897e14f4897eedcaa3bcf23727561e281f68cc Mon Sep 17 00:00:00 2001
From: Greg Brener <gbrener@users.noreply.github.com>
Date: Mon, 8 May 2017 00:55:19 -0500
Subject: [PATCH 10/11] Tweak bounds computations to be 2 loops

Although the former code was shorter, separating bounds computations into two loops should yield a slight speed improvement for the common case.
---
 datashader/glyphs.py | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/datashader/glyphs.py b/datashader/glyphs.py
index e579a0524..e9313dcce 100644
--- a/datashader/glyphs.py
+++ b/datashader/glyphs.py
@@ -34,12 +34,20 @@ def _compute_x_bounds(xs):
         if len(xs) == 0:
             raise ValueError('x coordinate array is empty.')
         minval = maxval = xs[0]
-        for x in xs:
+        idx = 1
+        while idx < len(xs) and np.isnan(minval) and np.isnan(maxval):
+            x = xs[idx]
             if not np.isnan(x):
-                if np.isnan(minval) or x < minval:
+                minval = maxval = x
+            idx += 1
+        while idx < len(xs):
+            x = xs[idx]
+            if not np.isnan(x):
+                if x < minval:
                     minval = x
-                elif np.isnan(maxval) or x > maxval:
+                elif x > maxval:
                     maxval = x
+            idx += 1
         if np.isnan(minval) or np.isnan(maxval):
             raise ValueError('All x coordinates are NaN.')
         return minval, maxval
@@ -50,12 +58,20 @@ def _compute_y_bounds(ys):
         if len(ys) == 0:
             raise ValueError('y coordinate array is empty.')
         minval = maxval = ys[0]
-        for y in ys:
+        idx = 1
+        while idx < len(ys) and np.isnan(minval) and np.isnan(maxval):
+            y = ys[idx]
+            if not np.isnan(y):
+                minval = maxval = y
+            idx += 1
+        while idx < len(ys):
+            y = ys[idx]
             if not np.isnan(y):
-                if np.isnan(minval) or y < minval:
+                if y < minval:
                     minval = y
-                elif np.isnan(maxval) or y > maxval:
+                elif y > maxval:
                     maxval = y
+            idx += 1
         if np.isnan(minval) or np.isnan(maxval):
             raise ValueError('All y coordinates are NaN.')
         return minval, maxval

From eed1d220f3aae3f0f29bf89a0286ef08be50645e Mon Sep 17 00:00:00 2001
From: Greg Brener <gbrener@users.noreply.github.com>
Date: Mon, 8 May 2017 14:56:40 -0500
Subject: [PATCH 11/11] Revert back to 1 loop implementation

Instead of using 2 loops as suggested, revert to the 1-loop implementation to avoid calling len() on a dask dataframe.
---
 datashader/glyphs.py | 32 ++++++--------------------------
 1 file changed, 6 insertions(+), 26 deletions(-)

diff --git a/datashader/glyphs.py b/datashader/glyphs.py
index e9313dcce..ad0c596fc 100644
--- a/datashader/glyphs.py
+++ b/datashader/glyphs.py
@@ -31,23 +31,13 @@ def validate(self, in_dshape):
     @staticmethod
     @ngjit
     def _compute_x_bounds(xs):
-        if len(xs) == 0:
-            raise ValueError('x coordinate array is empty.')
         minval = maxval = xs[0]
-        idx = 1
-        while idx < len(xs) and np.isnan(minval) and np.isnan(maxval):
-            x = xs[idx]
+        for x in xs:
             if not np.isnan(x):
-                minval = maxval = x
-            idx += 1
-        while idx < len(xs):
-            x = xs[idx]
-            if not np.isnan(x):
-                if x < minval:
+                if np.isnan(minval) or x < minval:
                     minval = x
-                elif x > maxval:
+                if np.isnan(maxval) or x > maxval:
                     maxval = x
-            idx += 1
         if np.isnan(minval) or np.isnan(maxval):
             raise ValueError('All x coordinates are NaN.')
         return minval, maxval
@@ -55,23 +45,13 @@ def _compute_x_bounds(xs):
     @staticmethod
     @ngjit
     def _compute_y_bounds(ys):
-        if len(ys) == 0:
-            raise ValueError('y coordinate array is empty.')
         minval = maxval = ys[0]
-        idx = 1
-        while idx < len(ys) and np.isnan(minval) and np.isnan(maxval):
-            y = ys[idx]
-            if not np.isnan(y):
-                minval = maxval = y
-            idx += 1
-        while idx < len(ys):
-            y = ys[idx]
+        for y in ys:
             if not np.isnan(y):
-                if y < minval:
+                if np.isnan(minval) or y < minval:
                     minval = y
-                elif y > maxval:
+                if np.isnan(maxval) or y > maxval:
                     maxval = y
-            idx += 1
         if np.isnan(minval) or np.isnan(maxval):
             raise ValueError('All y coordinates are NaN.')
         return minval, maxval