Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pandas ExtensionArray for storing homogeneous ragged arrays #687

Merged
merged 48 commits into from
Mar 1, 2019
Merged
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
fc148de
Fix for pandas 0.24.0rc1
jonmmease Jan 12, 2019
864a235
Initial RaggedArray implementation
jonmmease Jan 13, 2019
440e207
Add the extension test suite provided by pandas and fix tests.
jonmmease Jan 13, 2019
2f18587
Import register_extension_dtype from pandas public location
jonmmease Jan 13, 2019
5f46b8e
Fix copy/paste error
jonmmease Jan 14, 2019
a6b3c27
KeyError -> IndexError
jonmmease Jan 14, 2019
fbc5065
Document, validate, and test fast-path RaggedArray construction
jonmmease Jan 14, 2019
527e9d6
Support indexing RaggedArray with a list
jonmmease Jan 14, 2019
8d1c34b
Create single RaggedDtype() instance per RaggedArray
jonmmease Jan 14, 2019
dad6cc2
Allow astype() to cast RaggedArray to other extension array types
jonmmease Jan 14, 2019
fff0c3e
Allow RaggedArray constructor to accept a RaggedArray to copy
jonmmease Jan 14, 2019
478b655
Remove mask property and consider missing to be equivalent to empty
jonmmease Jan 14, 2019
9d84b3c
More test fixes for `[]` being null
jonmmease Jan 15, 2019
d71f866
Update datashader/datatypes.py
jbednar Jan 15, 2019
4cd7b4c
Add RaggedElement wrapper class for internal pandas operations
jonmmease Jan 16, 2019
16aff67
Override fillna is RaggedArray and enable test
jonmmease Jan 17, 2019
5772ade
Add vectorized equality operators
jonmmease Jan 17, 2019
939405b
pass start_indices and flat_array arrays as args to _validate_ragged_…
jonmmease Jan 17, 2019
7f355d2
Add copy arg to RaggedArray constructor
jonmmease Jan 17, 2019
9e44946
+=
jonmmease Jan 17, 2019
a52728a
Fix missing return
jonmmease Jan 17, 2019
75f914d
Parameterize RaggedDtype by element type
jonmmease Jan 17, 2019
32f4a3c
Remove tuple conversions in RaggedElement
jonmmease Jan 17, 2019
27403a7
Designate _RaggedElement as an internal class
jonmmease Jan 17, 2019
e93c24d
numba jit utility functions
jonmmease Jan 18, 2019
3fda786
Don't auto-import RaggedArray unless pandas is at least version 0.24.0
jonmmease Jan 18, 2019
04453ce
wrap _compute_*_bounds static methods with compute_*_bounds methods
jonmmease Jan 20, 2019
642a858
Small refactor to remove the need for a specialized _PolygonLike glyp…
jonmmease Jan 20, 2019
97bccf5
Refactor to extract required_columns glyph method
jonmmease Jan 20, 2019
2860511
Initial cvs.lines and LinesXY glyph
jonmmease Jan 20, 2019
d7cf092
WIP of LinesRagged type
jonmmease Jan 20, 2019
e781a0f
Merge branch 'master' into enh_ragged
jonmmease Feb 7, 2019
ea08fd1
Remove unused canvas.lines method
jonmmease Feb 7, 2019
1b02b0d
Add RaggedArray line aggregation support for pandas
jonmmease Feb 8, 2019
2314311
Dask ragged array support
jonmmease Feb 8, 2019
2078aad
Merge branch 'master' into enh_ragged
jonmmease Feb 8, 2019
f4a40eb
flake8
jonmmease Feb 8, 2019
59b0b3a
Add validation for LinesAxis1Ragged
jonmmease Feb 8, 2019
c48429e
Exception handling on import for pandas < 0.24
jonmmease Feb 8, 2019
cdecd85
Add pandas >=0.24.1 as testing dependency so that we can test RaggedA…
jonmmease Feb 8, 2019
7c8b953
absolute import
jonmmease Feb 8, 2019
c846f0c
specify that int lists should cast to int64 numpy arrays
jonmmease Feb 9, 2019
4145fb9
Merge branch 'master' into enh_ragged
jonmmease Feb 23, 2019
cad7d0a
Remove parameterized args from skipped tests
jonmmease Feb 24, 2019
89d1d51
Add Dask optimized bounds calculations for ragged list glyph
jonmmease Feb 24, 2019
92eaab2
Apply suggestions from code review
jbednar Feb 28, 2019
1538909
Refer to parent docstrings rather than duplicate
jonmmease Feb 28, 2019
c42f0df
Remove docstring references
jonmmease Mar 1, 2019
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions datashader/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import absolute_import

from distutils.version import LooseVersion

import param
__version__ = str(param.version.Version(fpath=__file__, archive_commit="$Format:%h$",reponame="datashader"))

@@ -15,6 +17,12 @@
except ImportError:
pass

# Make the RaggedArray pandas extension array available when
# pandas >= 0.24.0 is installed
from pandas import __version__ as pandas_version
if LooseVersion(pandas_version) >= LooseVersion('0.24.0'):
from . import datatypes # noqa (API import)

# make pyct's example/data commands available if possible
from functools import partial
try:
39 changes: 33 additions & 6 deletions datashader/core.py
Original file line number Diff line number Diff line change
@@ -210,6 +210,7 @@ def line(self, source, x, y, agg=None, axis=0):
Define a canvas and a pandas DataFrame with 6 rows
>>> import pandas as pd # doctest: +SKIP
... import numpy as np
... import datashader as ds
... from datashader import Canvas
... import datashader.transfer_functions as tf
... cvs = Canvas()
@@ -222,23 +223,23 @@ def line(self, source, x, y, agg=None, axis=0):

Aggregate one line across all rows, with coordinates df.A1 by df.B1
>>> agg = cvs.line(df, x='A1', y='B1', axis=0) # doctest: +SKIP
... tf.shade(agg)
... tf.spread(tf.shade(agg))

Aggregate two lines across all rows. The first with coordinates
df.A1 by df.B1 and the second with coordinates df.A2 by df.B2
>>> agg = cvs.line(df, x=['A1', 'A2'], y=['B1', 'B2'], axis=0) # doctest: +SKIP
... tf.shade(agg)
... tf.spread(tf.shade(agg))

Aggregate two lines across all rows where the lines share the same
x coordinates. The first line will have coordinates df.A1 by df.B1
and the second will have coordinates df.A1 by df.B2
>>> agg = cvs.line(df, x='A1', y=['B1', 'B2'], axis=0) # doctest: +SKIP
... tf.shade(agg)
... tf.spread(tf.shade(agg))

Aggregate 6 length-2 lines, one per row, where the ith line has
coordinates [df.A1[i], df.A2[i]] by [df.B1[i], df.B2[i]]
>>> agg = cvs.line(df, x=['A1', 'A2'], y=['B1', 'B2'], axis=1) # doctest: +SKIP
... tf.shade(agg)
... tf.spread(tf.shade(agg))

Aggregate 6 length-4 lines, one per row, where the x coordinates
of every line are [0, 1, 2, 3] and the y coordinates of the ith line
@@ -247,10 +248,32 @@ def line(self, source, x, y, agg=None, axis=0):
... x=np.arange(4),
... y=['A1', 'A2', 'B1', 'B2'],
... axis=1)
... tf.shade(agg)
... tf.spread(tf.shade(agg))

Aggregate RaggedArrays of variable-length lines, one per row
(requires pandas >= 0.24.0)
>>> df_ragged = pd.DataFrame({ # doctest: +SKIP
... 'A1': pd.array([
... [1, 1.5], [2, 2.5, 3], [1.5, 2, 3, 4], [3.2, 4, 5]],
... dtype='Ragged[float32]'),
... 'B1': pd.array([
... [10, 12], [11, 14, 13], [10, 7, 9, 10], [7, 8, 12]],
... dtype='Ragged[float32]'),
... 'group': pd.Categorical([0, 1, 2, 1])
... })
...
... agg = cvs.line(df_ragged, x='A1', y='B1', axis=1)
... tf.spread(tf.shade(agg))

Aggregate RaggedArrays of variable-length lines by group column,
one per row (requires pandas >= 0.24.0)
>>> agg = cvs.line(df_ragged, x='A1', y='B1', # doctest: +SKIP
... agg=ds.count_cat('group'), axis=1)
... tf.spread(tf.shade(agg))
"""
from .glyphs import (LineAxis0, LinesAxis1, LinesAxis1XConstant,
LinesAxis1YConstant, LineAxis0Multi)
LinesAxis1YConstant, LineAxis0Multi,
LinesAxis1Ragged)
from .reductions import any as any_rdn
if agg is None:
agg = any_rdn()
@@ -286,6 +309,9 @@ def line(self, source, x, y, agg=None, axis=0):
elif (isinstance(x, (list, tuple)) and
isinstance(y, np.ndarray)):
glyph = LinesAxis1YConstant(tuple(x), y)
elif (isinstance(x, (Number, string_types)) and
isinstance(y, (Number, string_types))):
glyph = LinesAxis1Ragged(x, y)
else:
raise ValueError("""
Invalid combination of x and y arguments to Canvas.line when axis=1.
@@ -302,6 +328,7 @@ def line(self, source, x, y, agg=None, axis=0):

return bypixel(source, self, glyph, agg)


# TODO re 'untested', below: Consider replacing with e.g. a 3x3
# array in the call to Canvas (plot_height=3,plot_width=3), then
# show the output as a numpy array that has a compact
Loading