From 09671fde13536e3784ca2d3752c3ef8c2907f4e4 Mon Sep 17 00:00:00 2001 From: gbrener Date: Thu, 4 May 2017 19:20:19 -0500 Subject: [PATCH 01/11] Optimize max/min operations Optimize the operations that datashader uses to calculate the x_range and y_range variables when the ranges are not provided by the user. Add a caching feature that avoids recalculating the x_range and y_range when the Canvas is reused for multiple aggregations. Caching feature can be turned off when instantiating the Canvas object, or invalidated while calling cvs.points(..., recalc_ranges=True) or cvs.line(..., recalc_ranges=True) --- datashader/core.py | 42 +++++++++++++++++++++++++++++++++++------- datashader/dask.py | 9 +++++++-- datashader/glyphs.py | 6 ++++-- datashader/pandas.py | 9 +++++++-- examples/filetimes.py | 15 ++++++++++----- 5 files changed, 63 insertions(+), 18 deletions(-) diff --git a/datashader/core.py b/datashader/core.py index 309bb6d82..eb18b27a9 100644 --- a/datashader/core.py +++ b/datashader/core.py @@ -138,23 +138,35 @@ class Canvas(object): plot_width, plot_height : int, optional Width and height of the output aggregate in pixels. x_range, y_range : tuple, optional - A tuple representing the bounds inclusive space ``[min, max]`` along - the axis. + A tuple representing the bounds inclusive space ``[min, max]`` along the + axis. These will be calculated and cached by datashader during the first + aggregation, if not provided. The cached values may be + invalidated/recalculated by providing the ``recalc_ranges`` keyword + argument to the ``Canvas.points()`` and/or ``Canvas.line()`` methods. x_axis_type, y_axis_type : str, optional The type of the axis. Valid options are ``'linear'`` [default], and ``'log'``. + cache_ranges : bool, optional + Whether to cache x_range and y_range variables when they get + updated. Default is ``True``. This provides a speedup by relying on the + x and y data remaining unchanged between data aggregations. If the x or + y data may have changed, one may either set this option to ``False``, or + provide the ``recalc_ranges=True`` when calling ``Canvas.points()`` + and/or ``Canvas.line()``. """ def __init__(self, plot_width=600, plot_height=600, x_range=None, y_range=None, - x_axis_type='linear', y_axis_type='linear'): + x_axis_type='linear', y_axis_type='linear', + cache_ranges=True): self.plot_width = plot_width self.plot_height = plot_height - self.x_range = tuple(x_range) if x_range is not None else x_range - self.y_range = tuple(y_range) if y_range is not None else y_range + self.x_range = tuple(x_range) if x_range is not None else None + self.y_range = tuple(y_range) if y_range is not None else None self.x_axis = _axis_lookup[x_axis_type] self.y_axis = _axis_lookup[y_axis_type] + self.cache_ranges = cache_ranges - def points(self, source, x, y, agg=None): + def points(self, source, x, y, agg=None, recalc_ranges=False): """Compute a reduction by pixel, mapping data to pixels as points. Parameters @@ -165,14 +177,22 @@ def points(self, source, x, y, agg=None): Column names for the x and y coordinates of each point. agg : Reduction, optional Reduction to compute. Default is ``count()``. + recalc_ranges : bool, optional + Recalculate the ranges that datashader calculated/cached during the + first aggregation. Default is ``False``. This option should only be + used if the dataframe's x or y data was altered after the ``Canvas`` + was created. """ from .glyphs import Point from .reductions import count + if not self.cache_ranges or recalc_ranges: + self.x_range = None + self.y_range = None if agg is None: agg = count() return bypixel(source, self, Point(x, y), agg) - def line(self, source, x, y, agg=None): + def line(self, source, x, y, agg=None, recalc_ranges=False): """Compute a reduction by pixel, mapping data to pixels as a line. For aggregates that take in extra fields, the interpolated bins will @@ -192,9 +212,17 @@ def line(self, source, x, y, agg=None): Column names for the x and y coordinates of each vertex. agg : Reduction, optional Reduction to compute. Default is ``any()``. + recalc_ranges : bool, optional + Recalculate the ranges that datashader calculated/cached during the + first aggregation. Default is ``False``. This option should only be + used if the dataframe's x or y data was altered after the ``Canvas`` + was created. """ from .glyphs import Line from .reductions import any + if not self.cache_ranges or recalc_ranges: + self.x_range = None + self.y_range = None if agg is None: agg = any() return bypixel(source, self, Line(x, y), agg) diff --git a/datashader/dask.py b/datashader/dask.py index 33657537d..c483f36ab 100644 --- a/datashader/dask.py +++ b/datashader/dask.py @@ -25,8 +25,13 @@ def dask_pipeline(df, schema, canvas, glyph, summary): def shape_bounds_st_and_axis(df, canvas, glyph): - x_range = canvas.x_range or glyph._compute_x_bounds(df) - y_range = canvas.y_range or glyph._compute_y_bounds(df) + # Cache the x and y ranges during the first aggregation + if canvas.x_range is None: + canvas.x_range = glyph._compute_x_bounds(df) + if canvas.y_range is None: + canvas.y_range = glyph._compute_y_bounds(df) + x_range = canvas.x_range + y_range = canvas.y_range x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range)) x_range, y_range = (x_min, x_max), (y_min, y_max) width = canvas.plot_width diff --git a/datashader/glyphs.py b/datashader/glyphs.py index 321cc3cf7..1352c718c 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -29,10 +29,12 @@ def validate(self, in_dshape): raise ValueError('y must be real') def _compute_x_bounds(self, df): - return df[self.x].min(), df[self.x].max() + xs = df[self.x].values + return xs.min(), xs.max() def _compute_y_bounds(self, df): - return df[self.y].min(), df[self.y].max() + ys = df[self.y].values + return ys.min(), ys.max() class Point(_PointLike): diff --git a/datashader/pandas.py b/datashader/pandas.py index f631ff14e..c52fb5dc2 100644 --- a/datashader/pandas.py +++ b/datashader/pandas.py @@ -15,8 +15,13 @@ def pandas_pipeline(df, schema, canvas, glyph, summary): y_mapper = canvas.y_axis.mapper extend = glyph._build_extend(x_mapper, y_mapper, info, append) - x_range = canvas.x_range or glyph._compute_x_bounds(df) - y_range = canvas.y_range or glyph._compute_y_bounds(df) + # Cache the x and y ranges during the first aggregation + if canvas.x_range is None: + canvas.x_range = glyph._compute_x_bounds(df) + if canvas.y_range is None: + canvas.y_range = glyph._compute_y_bounds(df) + x_range = canvas.x_range + y_range = canvas.y_range width = canvas.plot_width height = canvas.plot_height diff --git a/examples/filetimes.py b/examples/filetimes.py index 3e47ad3e4..db2540a03 100755 --- a/examples/filetimes.py +++ b/examples/filetimes.py @@ -227,11 +227,15 @@ def timed_read(filepath,dftype): return df, duration -def timed_agg(df, filepath, plot_width=int(900), plot_height=int(900*7.0/12)): +CACHED_RANGES = (None, None) +def timed_agg(df, filepath, plot_width=int(900), plot_height=int(900*7.0/12), recalc_ranges=False): + global CACHED_RANGES start = time.time() - cvs = ds.Canvas(plot_width, plot_height) - agg = cvs.points(df, p.x, p.y) + cvs = ds.Canvas(plot_width, plot_height, x_range=CACHED_RANGES[0], y_range=CACHED_RANGES[1]) + agg = cvs.points(df, p.x, p.y, recalc_ranges=recalc_ranges) end = time.time() + if not recalc_ranges: + CACHED_RANGES = (cvs.x_range, cvs.y_range) img = export_image(tf.shade(agg),filepath,export_path=".") return img, end-start @@ -273,6 +277,7 @@ def main(argv): parser.add_argument('--debug', action='store_true', help='Enable increased verbosity and DEBUG messages') parser.add_argument('--cache', choices=('persist', 'cachey'), default=None, help='Enable caching: "persist" causes Dask dataframes to force loading into memory; "cachey" uses dask.cache.Cache with a cachesize of {}. Caching is disabled by default'.format(int(p.cachesize))) parser.add_argument('--distributed', action='store_true', help='Enable the distributed scheduler instead of the threaded, which is the default.') + parser.add_argument('--recalc-ranges', action='store_true', help='Tell datashader to recalculate the ranges on each aggregation, instead of caching them (by default).') args = parser.parse_args(argv[1:]) if args.cache is None: @@ -321,7 +326,7 @@ def main(argv): if DEBUG: print('DEBUG: Memory usage (after read):\t{} MB'.format(get_proc_mem(), flush=True)) - img,aggtime1 = timed_agg(df,filepath,5,5) + img,aggtime1 = timed_agg(df,filepath,5,5,recalc_ranges=args.recalc_ranges) if DEBUG: mem_usage = df.memory_usage(deep=True) if p.dftype == 'dask': @@ -333,7 +338,7 @@ def main(argv): print('DEBUG: column "{}" dtype: {}'.format(colname, df[colname].dtype)) print('DEBUG: Memory usage (after agg1):\t{} MB'.format(get_proc_mem(), flush=True)) - img,aggtime2 = timed_agg(df,filepath) + img,aggtime2 = timed_agg(df,filepath,recalc_ranges=args.recalc_ranges) if DEBUG: print('DEBUG: Memory usage (after agg2):\t{} MB'.format(get_proc_mem(), flush=True)) From 206601ef4c6901042058dba7f56af92501fc8bcc Mon Sep 17 00:00:00 2001 From: gbrener Date: Fri, 5 May 2017 16:20:44 -0500 Subject: [PATCH 02/11] Revert caching feature. min/max -> nanmin/nanmax Remove caching feature from previous commits after discussion with Jim. Convert arr.min() and arr.max() calls to np.nanmin(arr) and np.nanmax(arr) to more-closely emulate the NaN-handling behavior of df.min() and df.max() --- datashader/core.py | 42 +++++++----------------------------------- datashader/dask.py | 9 ++------- datashader/glyphs.py | 4 ++-- datashader/pandas.py | 9 ++------- examples/filetimes.py | 2 +- 5 files changed, 14 insertions(+), 52 deletions(-) diff --git a/datashader/core.py b/datashader/core.py index eb18b27a9..1c09af0d9 100644 --- a/datashader/core.py +++ b/datashader/core.py @@ -138,35 +138,23 @@ class Canvas(object): plot_width, plot_height : int, optional Width and height of the output aggregate in pixels. x_range, y_range : tuple, optional - A tuple representing the bounds inclusive space ``[min, max]`` along the - axis. These will be calculated and cached by datashader during the first - aggregation, if not provided. The cached values may be - invalidated/recalculated by providing the ``recalc_ranges`` keyword - argument to the ``Canvas.points()`` and/or ``Canvas.line()`` methods. + A tuple representing the bounds inclusive space ``[min, max]`` along + the axis. x_axis_type, y_axis_type : str, optional The type of the axis. Valid options are ``'linear'`` [default], and ``'log'``. - cache_ranges : bool, optional - Whether to cache x_range and y_range variables when they get - updated. Default is ``True``. This provides a speedup by relying on the - x and y data remaining unchanged between data aggregations. If the x or - y data may have changed, one may either set this option to ``False``, or - provide the ``recalc_ranges=True`` when calling ``Canvas.points()`` - and/or ``Canvas.line()``. """ def __init__(self, plot_width=600, plot_height=600, x_range=None, y_range=None, - x_axis_type='linear', y_axis_type='linear', - cache_ranges=True): + x_axis_type='linear', y_axis_type='linear'): self.plot_width = plot_width self.plot_height = plot_height - self.x_range = tuple(x_range) if x_range is not None else None - self.y_range = tuple(y_range) if y_range is not None else None + self.x_range = None if x_range is None else tuple(x_range) + self.y_range = None if y_range is None else tuple(y_range) self.x_axis = _axis_lookup[x_axis_type] self.y_axis = _axis_lookup[y_axis_type] - self.cache_ranges = cache_ranges - def points(self, source, x, y, agg=None, recalc_ranges=False): + def points(self, source, x, y, agg=None): """Compute a reduction by pixel, mapping data to pixels as points. Parameters @@ -177,22 +165,14 @@ def points(self, source, x, y, agg=None, recalc_ranges=False): Column names for the x and y coordinates of each point. agg : Reduction, optional Reduction to compute. Default is ``count()``. - recalc_ranges : bool, optional - Recalculate the ranges that datashader calculated/cached during the - first aggregation. Default is ``False``. This option should only be - used if the dataframe's x or y data was altered after the ``Canvas`` - was created. """ from .glyphs import Point from .reductions import count - if not self.cache_ranges or recalc_ranges: - self.x_range = None - self.y_range = None if agg is None: agg = count() return bypixel(source, self, Point(x, y), agg) - def line(self, source, x, y, agg=None, recalc_ranges=False): + def line(self, source, x, y, agg=None): """Compute a reduction by pixel, mapping data to pixels as a line. For aggregates that take in extra fields, the interpolated bins will @@ -212,17 +192,9 @@ def line(self, source, x, y, agg=None, recalc_ranges=False): Column names for the x and y coordinates of each vertex. agg : Reduction, optional Reduction to compute. Default is ``any()``. - recalc_ranges : bool, optional - Recalculate the ranges that datashader calculated/cached during the - first aggregation. Default is ``False``. This option should only be - used if the dataframe's x or y data was altered after the ``Canvas`` - was created. """ from .glyphs import Line from .reductions import any - if not self.cache_ranges or recalc_ranges: - self.x_range = None - self.y_range = None if agg is None: agg = any() return bypixel(source, self, Line(x, y), agg) diff --git a/datashader/dask.py b/datashader/dask.py index c483f36ab..33657537d 100644 --- a/datashader/dask.py +++ b/datashader/dask.py @@ -25,13 +25,8 @@ def dask_pipeline(df, schema, canvas, glyph, summary): def shape_bounds_st_and_axis(df, canvas, glyph): - # Cache the x and y ranges during the first aggregation - if canvas.x_range is None: - canvas.x_range = glyph._compute_x_bounds(df) - if canvas.y_range is None: - canvas.y_range = glyph._compute_y_bounds(df) - x_range = canvas.x_range - y_range = canvas.y_range + x_range = canvas.x_range or glyph._compute_x_bounds(df) + y_range = canvas.y_range or glyph._compute_y_bounds(df) x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range)) x_range, y_range = (x_min, x_max), (y_min, y_max) width = canvas.plot_width diff --git a/datashader/glyphs.py b/datashader/glyphs.py index 1352c718c..fc1347bcc 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -30,11 +30,11 @@ def validate(self, in_dshape): def _compute_x_bounds(self, df): xs = df[self.x].values - return xs.min(), xs.max() + return np.nanmin(xs), np.nanmax(xs) def _compute_y_bounds(self, df): ys = df[self.y].values - return ys.min(), ys.max() + return np.nanmin(ys), np.nanmax(ys) class Point(_PointLike): diff --git a/datashader/pandas.py b/datashader/pandas.py index c52fb5dc2..f631ff14e 100644 --- a/datashader/pandas.py +++ b/datashader/pandas.py @@ -15,13 +15,8 @@ def pandas_pipeline(df, schema, canvas, glyph, summary): y_mapper = canvas.y_axis.mapper extend = glyph._build_extend(x_mapper, y_mapper, info, append) - # Cache the x and y ranges during the first aggregation - if canvas.x_range is None: - canvas.x_range = glyph._compute_x_bounds(df) - if canvas.y_range is None: - canvas.y_range = glyph._compute_y_bounds(df) - x_range = canvas.x_range - y_range = canvas.y_range + x_range = canvas.x_range or glyph._compute_x_bounds(df) + y_range = canvas.y_range or glyph._compute_y_bounds(df) width = canvas.plot_width height = canvas.plot_height diff --git a/examples/filetimes.py b/examples/filetimes.py index db2540a03..f482cb1cf 100755 --- a/examples/filetimes.py +++ b/examples/filetimes.py @@ -232,7 +232,7 @@ def timed_agg(df, filepath, plot_width=int(900), plot_height=int(900*7.0/12), re global CACHED_RANGES start = time.time() cvs = ds.Canvas(plot_width, plot_height, x_range=CACHED_RANGES[0], y_range=CACHED_RANGES[1]) - agg = cvs.points(df, p.x, p.y, recalc_ranges=recalc_ranges) + agg = cvs.points(df, p.x, p.y) end = time.time() if not recalc_ranges: CACHED_RANGES = (cvs.x_range, cvs.y_range) From 8945def49c1ee9858f4089bc952411d3463d7f70 Mon Sep 17 00:00:00 2001 From: gbrener Date: Fri, 5 May 2017 17:43:57 -0500 Subject: [PATCH 03/11] Memoize x/y bounds computations Use toolz.memoize - similar to how other code in glyphs.py is optimized - to cache the x/y bound computations. This takes advantage of the fact that dask dataframes are immutable/hashable, and has the desired result that the cache_ranges feature had before. --- datashader/glyphs.py | 2 ++ examples/filetimes.py | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/datashader/glyphs.py b/datashader/glyphs.py index fc1347bcc..f40d5ac2e 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -28,10 +28,12 @@ def validate(self, in_dshape): elif not isreal(in_dshape.measure[self.y]): raise ValueError('y must be real') + @memoize def _compute_x_bounds(self, df): xs = df[self.x].values return np.nanmin(xs), np.nanmax(xs) + @memoize def _compute_y_bounds(self, df): ys = df[self.y].values return np.nanmin(ys), np.nanmax(ys) diff --git a/examples/filetimes.py b/examples/filetimes.py index f482cb1cf..cf6a18b20 100755 --- a/examples/filetimes.py +++ b/examples/filetimes.py @@ -228,13 +228,13 @@ def timed_read(filepath,dftype): CACHED_RANGES = (None, None) -def timed_agg(df, filepath, plot_width=int(900), plot_height=int(900*7.0/12), recalc_ranges=False): +def timed_agg(df, filepath, plot_width=int(900), plot_height=int(900*7.0/12), cache_ranges=True): global CACHED_RANGES start = time.time() cvs = ds.Canvas(plot_width, plot_height, x_range=CACHED_RANGES[0], y_range=CACHED_RANGES[1]) agg = cvs.points(df, p.x, p.y) end = time.time() - if not recalc_ranges: + if cache_ranges: CACHED_RANGES = (cvs.x_range, cvs.y_range) img = export_image(tf.shade(agg),filepath,export_path=".") return img, end-start @@ -326,7 +326,7 @@ def main(argv): if DEBUG: print('DEBUG: Memory usage (after read):\t{} MB'.format(get_proc_mem(), flush=True)) - img,aggtime1 = timed_agg(df,filepath,5,5,recalc_ranges=args.recalc_ranges) + img,aggtime1 = timed_agg(df,filepath,5,5,cache_ranges=(not args.recalc_ranges)) if DEBUG: mem_usage = df.memory_usage(deep=True) if p.dftype == 'dask': @@ -338,7 +338,7 @@ def main(argv): print('DEBUG: column "{}" dtype: {}'.format(colname, df[colname].dtype)) print('DEBUG: Memory usage (after agg1):\t{} MB'.format(get_proc_mem(), flush=True)) - img,aggtime2 = timed_agg(df,filepath,recalc_ranges=args.recalc_ranges) + img,aggtime2 = timed_agg(df,filepath,cache_ranges=(not args.recalc_ranges)) if DEBUG: print('DEBUG: Memory usage (after agg2):\t{} MB'.format(get_proc_mem(), flush=True)) From 53ce5f36378aed7cc6eb7d951f2b6491cb1612ce Mon Sep 17 00:00:00 2001 From: gbrener Date: Fri, 5 May 2017 18:42:35 -0500 Subject: [PATCH 04/11] Only memoize the dask versions of max/min The tests are failing because pandas DataFrames are not hashable. Rather than using dask.dataframe.hashing.hash_pandas_object, it is probably more efficient to simply recalculate the min/max. So memoization only happens for dask. --- datashader/dask.py | 4 ++-- datashader/glyphs.py | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/datashader/dask.py b/datashader/dask.py index 33657537d..5969cf9ab 100644 --- a/datashader/dask.py +++ b/datashader/dask.py @@ -25,8 +25,8 @@ def dask_pipeline(df, schema, canvas, glyph, summary): def shape_bounds_st_and_axis(df, canvas, glyph): - x_range = canvas.x_range or glyph._compute_x_bounds(df) - y_range = canvas.y_range or glyph._compute_y_bounds(df) + x_range = canvas.x_range or glyph._compute_x_bounds_hashable(df) + y_range = canvas.y_range or glyph._compute_y_bounds_hashable(df) x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range)) x_range, y_range = (x_min, x_max), (y_min, y_max) width = canvas.plot_width diff --git a/datashader/glyphs.py b/datashader/glyphs.py index f40d5ac2e..4725bafa5 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division from toolz import memoize +from dask.dataframe.hashing import hash_pandas_object import numpy as np from .core import Expr @@ -28,16 +29,30 @@ def validate(self, in_dshape): elif not isreal(in_dshape.measure[self.y]): raise ValueError('y must be real') - @memoize def _compute_x_bounds(self, df): xs = df[self.x].values return np.nanmin(xs), np.nanmax(xs) - @memoize def _compute_y_bounds(self, df): ys = df[self.y].values return np.nanmin(ys), np.nanmax(ys) + @memoize + def _compute_x_bounds_hashable(self, df): + """Same as ``PointLike._compute_x_bounds``, but memoized because + ``df`` is immutable/hashable (a Dask dataframe). + """ + xs = df[self.x].values + return np.nanmin(xs), np.nanmax(xs) + + @memoize + def _compute_y_bounds_hashable(self, df): + """Same as ``PointLike._compute_y_bounds``, but memoized because + ``df`` is immutable/hashable (a Dask dataframe). + """ + ys = df[self.y].values + return np.nanmin(ys), np.nanmax(ys) + class Point(_PointLike): """A point, with center at ``x`` and ``y``. From 6e9851020a70a8f0e3801144107cbfa6952f0aaf Mon Sep 17 00:00:00 2001 From: gbrener Date: Fri, 5 May 2017 19:23:30 -0500 Subject: [PATCH 05/11] Make a numba-ized version of bound computations Since I know of no straightforward way to incorporate numba functions into dask graphs, there are now two versions of the min/max computations; the pandas ones use numba, and the dask ones use memoization (relying on dask dataframe immutability). --- datashader/dask.py | 4 ++-- datashader/glyphs.py | 38 +++++++++++++++++++++++++++----------- datashader/pandas.py | 4 ++-- 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/datashader/dask.py b/datashader/dask.py index 5969cf9ab..80b4c266b 100644 --- a/datashader/dask.py +++ b/datashader/dask.py @@ -25,8 +25,8 @@ def dask_pipeline(df, schema, canvas, glyph, summary): def shape_bounds_st_and_axis(df, canvas, glyph): - x_range = canvas.x_range or glyph._compute_x_bounds_hashable(df) - y_range = canvas.y_range or glyph._compute_y_bounds_hashable(df) + x_range = canvas.x_range or glyph._compute_x_bounds_dask(df) + y_range = canvas.y_range or glyph._compute_y_bounds_dask(df) x_min, x_max, y_min, y_max = bounds = compute(*(x_range + y_range)) x_range, y_range = (x_min, x_max), (y_min, y_max) width = canvas.plot_width diff --git a/datashader/glyphs.py b/datashader/glyphs.py index 4725bafa5..6cb17cdd9 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -29,25 +29,41 @@ def validate(self, in_dshape): elif not isreal(in_dshape.measure[self.y]): raise ValueError('y must be real') - def _compute_x_bounds(self, df): - xs = df[self.x].values - return np.nanmin(xs), np.nanmax(xs) - - def _compute_y_bounds(self, df): - ys = df[self.y].values - return np.nanmin(ys), np.nanmax(ys) + @staticmethod + @ngjit + def _compute_x_bounds_pandas(xs): + minval = maxval = xs[0] + for x in xs: + if not np.isnan(x): + if x < minval: + minval = x + elif x > maxval: + maxval = x + return minval, maxval + + @staticmethod + @ngjit + def _compute_y_bounds_pandas(ys): + minval = maxval = ys[0] + for y in ys: + if not np.isnan(y): + if y < minval: + minval = y + elif y > maxval: + maxval = y + return minval, maxval @memoize - def _compute_x_bounds_hashable(self, df): - """Same as ``PointLike._compute_x_bounds``, but memoized because + def _compute_x_bounds_dask(self, df): + """Like ``PointLike._compute_x_bounds``, but memoized because ``df`` is immutable/hashable (a Dask dataframe). """ xs = df[self.x].values return np.nanmin(xs), np.nanmax(xs) @memoize - def _compute_y_bounds_hashable(self, df): - """Same as ``PointLike._compute_y_bounds``, but memoized because + def _compute_y_bounds_dask(self, df): + """Like ``PointLike._compute_y_bounds``, but memoized because ``df`` is immutable/hashable (a Dask dataframe). """ ys = df[self.y].values diff --git a/datashader/pandas.py b/datashader/pandas.py index f631ff14e..c4249fa73 100644 --- a/datashader/pandas.py +++ b/datashader/pandas.py @@ -15,8 +15,8 @@ def pandas_pipeline(df, schema, canvas, glyph, summary): y_mapper = canvas.y_axis.mapper extend = glyph._build_extend(x_mapper, y_mapper, info, append) - x_range = canvas.x_range or glyph._compute_x_bounds(df) - y_range = canvas.y_range or glyph._compute_y_bounds(df) + x_range = canvas.x_range or glyph._compute_x_bounds_pandas(df[glyph.x].values) + y_range = canvas.y_range or glyph._compute_y_bounds_pandas(df[glyph.y].values) width = canvas.plot_width height = canvas.plot_height From 64eec97394bc9ddfcdde95fb6bb40868ca5961c1 Mon Sep 17 00:00:00 2001 From: gbrener Date: Fri, 5 May 2017 19:44:39 -0500 Subject: [PATCH 06/11] Revert to old method names Nothing has changed except the names, so that the tests can return to passing state again. --- datashader/glyphs.py | 4 ++-- datashader/pandas.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datashader/glyphs.py b/datashader/glyphs.py index 6cb17cdd9..a819b15dc 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -31,7 +31,7 @@ def validate(self, in_dshape): @staticmethod @ngjit - def _compute_x_bounds_pandas(xs): + def _compute_x_bounds(xs): minval = maxval = xs[0] for x in xs: if not np.isnan(x): @@ -43,7 +43,7 @@ def _compute_x_bounds_pandas(xs): @staticmethod @ngjit - def _compute_y_bounds_pandas(ys): + def _compute_y_bounds(ys): minval = maxval = ys[0] for y in ys: if not np.isnan(y): diff --git a/datashader/pandas.py b/datashader/pandas.py index c4249fa73..f3edfb668 100644 --- a/datashader/pandas.py +++ b/datashader/pandas.py @@ -15,8 +15,8 @@ def pandas_pipeline(df, schema, canvas, glyph, summary): y_mapper = canvas.y_axis.mapper extend = glyph._build_extend(x_mapper, y_mapper, info, append) - x_range = canvas.x_range or glyph._compute_x_bounds_pandas(df[glyph.x].values) - y_range = canvas.y_range or glyph._compute_y_bounds_pandas(df[glyph.y].values) + x_range = canvas.x_range or glyph._compute_x_bounds(df[glyph.x].values) + y_range = canvas.y_range or glyph._compute_y_bounds(df[glyph.y].values) width = canvas.plot_width height = canvas.plot_height From 430a1b5ef103b7a6a3d32faeffb5b4c436d03940 Mon Sep 17 00:00:00 2001 From: gbrener Date: Fri, 5 May 2017 19:51:38 -0500 Subject: [PATCH 07/11] Update test Update unit test so that it passes a numpy array to numba function instead of dataframe (as it was before). --- datashader/tests/test_glyphs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datashader/tests/test_glyphs.py b/datashader/tests/test_glyphs.py index 8d5256e5c..f6bd0e707 100644 --- a/datashader/tests/test_glyphs.py +++ b/datashader/tests/test_glyphs.py @@ -10,8 +10,8 @@ def test_point_bounds_check(): df = pd.DataFrame({'x': [1, 2, 3], 'y': [5, 6, 7]}) p = Point('x', 'y') - assert p._compute_x_bounds(df) == (1, 3) - assert p._compute_y_bounds(df) == (5, 7) + assert p._compute_x_bounds(df['x'].values) == (1, 3) + assert p._compute_y_bounds(df['y'].values) == (5, 7) def test_point_validate(): From 54425e2379eb68cc0836a96ab94d11afd657270e Mon Sep 17 00:00:00 2001 From: Greg Brener Date: Mon, 8 May 2017 00:11:14 -0500 Subject: [PATCH 08/11] Update glyphs.py Add checks to bounds computations for cases where NaN(s) occur at the beginning of the x/y arrays, or the arrays are all NaNs, or arrays are empty. --- datashader/glyphs.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/datashader/glyphs.py b/datashader/glyphs.py index a819b15dc..7f191b82f 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -1,7 +1,6 @@ from __future__ import absolute_import, division from toolz import memoize -from dask.dataframe.hashing import hash_pandas_object import numpy as np from .core import Expr @@ -32,25 +31,33 @@ def validate(self, in_dshape): @staticmethod @ngjit def _compute_x_bounds(xs): + if len(xs) == 0: + raise ValueError('x coordinate array is empty.') minval = maxval = xs[0] for x in xs: if not np.isnan(x): - if x < minval: + if np.isnan(minval) or x < minval: minval = x - elif x > maxval: + elif np.isnan(maxval) x > maxval: maxval = x + if np.isnan(minval) or np.isnan(maxval): + raise ValueError('All x coordinates are NaN.') return minval, maxval @staticmethod @ngjit def _compute_y_bounds(ys): + if len(ys) == 0: + raise ValueError('y coordinate array is empty.') minval = maxval = ys[0] for y in ys: if not np.isnan(y): - if y < minval: + if np.isnan(minval) or y < minval: minval = y - elif y > maxval: + elif np.isnan(maxval) or y > maxval: maxval = y + if np.isnan(minval) or np.isnan(maxval): + raise ValueError('All y coordinates are NaN.') return minval, maxval @memoize From a08193f430c063556a059a6c82b2297adb5bdde4 Mon Sep 17 00:00:00 2001 From: Greg Brener Date: Mon, 8 May 2017 00:13:57 -0500 Subject: [PATCH 09/11] Fix typo --- datashader/glyphs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datashader/glyphs.py b/datashader/glyphs.py index 7f191b82f..e579a0524 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -38,7 +38,7 @@ def _compute_x_bounds(xs): if not np.isnan(x): if np.isnan(minval) or x < minval: minval = x - elif np.isnan(maxval) x > maxval: + elif np.isnan(maxval) or x > maxval: maxval = x if np.isnan(minval) or np.isnan(maxval): raise ValueError('All x coordinates are NaN.') From 75897e14f4897eedcaa3bcf23727561e281f68cc Mon Sep 17 00:00:00 2001 From: Greg Brener Date: Mon, 8 May 2017 00:55:19 -0500 Subject: [PATCH 10/11] Tweak bounds computations to be 2 loops Although the former code was shorter, separating bounds computations into two loops should yield a slight speed improvement for the common case. --- datashader/glyphs.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/datashader/glyphs.py b/datashader/glyphs.py index e579a0524..e9313dcce 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -34,12 +34,20 @@ def _compute_x_bounds(xs): if len(xs) == 0: raise ValueError('x coordinate array is empty.') minval = maxval = xs[0] - for x in xs: + idx = 1 + while idx < len(xs) and np.isnan(minval) and np.isnan(maxval): + x = xs[idx] if not np.isnan(x): - if np.isnan(minval) or x < minval: + minval = maxval = x + idx += 1 + while idx < len(xs): + x = xs[idx] + if not np.isnan(x): + if x < minval: minval = x - elif np.isnan(maxval) or x > maxval: + elif x > maxval: maxval = x + idx += 1 if np.isnan(minval) or np.isnan(maxval): raise ValueError('All x coordinates are NaN.') return minval, maxval @@ -50,12 +58,20 @@ def _compute_y_bounds(ys): if len(ys) == 0: raise ValueError('y coordinate array is empty.') minval = maxval = ys[0] - for y in ys: + idx = 1 + while idx < len(ys) and np.isnan(minval) and np.isnan(maxval): + y = ys[idx] + if not np.isnan(y): + minval = maxval = y + idx += 1 + while idx < len(ys): + y = ys[idx] if not np.isnan(y): - if np.isnan(minval) or y < minval: + if y < minval: minval = y - elif np.isnan(maxval) or y > maxval: + elif y > maxval: maxval = y + idx += 1 if np.isnan(minval) or np.isnan(maxval): raise ValueError('All y coordinates are NaN.') return minval, maxval From eed1d220f3aae3f0f29bf89a0286ef08be50645e Mon Sep 17 00:00:00 2001 From: Greg Brener Date: Mon, 8 May 2017 14:56:40 -0500 Subject: [PATCH 11/11] Revert back to 1 loop implementation Instead of doing 2 loops as suggested, revert back to 1 loop (to avoid computing the len on a dask dataframe). --- datashader/glyphs.py | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/datashader/glyphs.py b/datashader/glyphs.py index e9313dcce..ad0c596fc 100644 --- a/datashader/glyphs.py +++ b/datashader/glyphs.py @@ -31,23 +31,13 @@ def validate(self, in_dshape): @staticmethod @ngjit def _compute_x_bounds(xs): - if len(xs) == 0: - raise ValueError('x coordinate array is empty.') minval = maxval = xs[0] - idx = 1 - while idx < len(xs) and np.isnan(minval) and np.isnan(maxval): - x = xs[idx] + for x in xs: if not np.isnan(x): - minval = maxval = x - idx += 1 - while idx < len(xs): - x = xs[idx] - if not np.isnan(x): - if x < minval: + if np.isnan(minval) or x < minval: minval = x - elif x > maxval: + if np.isnan(maxval) or x > maxval: maxval = x - idx += 1 if np.isnan(minval) or np.isnan(maxval): raise ValueError('All x coordinates are NaN.') return minval, maxval @@ -55,23 +45,13 @@ def _compute_x_bounds(xs): @staticmethod @ngjit def _compute_y_bounds(ys): - if len(ys) == 0: - raise ValueError('y coordinate array is empty.') minval = maxval = ys[0] - idx = 1 - while idx < len(ys) and np.isnan(minval) and np.isnan(maxval): - y = ys[idx] - if not np.isnan(y): - minval = maxval = y - idx += 1 - while idx < len(ys): - y = ys[idx] + for y in ys: if not np.isnan(y): - if y < minval: + if np.isnan(minval) or y < minval: minval = y - elif y > maxval: + if np.isnan(maxval) or y > maxval: maxval = y - idx += 1 if np.isnan(minval) or np.isnan(maxval): raise ValueError('All y coordinates are NaN.') return minval, maxval