diff --git a/doc/assets/diagram.png b/doc/assets/diagram.png index 28d55d592..255d5f47a 100644 Binary files a/doc/assets/diagram.png and b/doc/assets/diagram.png differ diff --git a/doc/assets/diagram.svg b/doc/assets/diagram.svg index 392f5c5a8..a0825a141 100644 --- a/doc/assets/diagram.svg +++ b/doc/assets/diagram.svg @@ -10,6 +10,7 @@ width="875.71826mm" sodipodi:docname="diagram.svg" inkscape:export-filename="diagram.png" + xml:space="preserve" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:xlink="http://www.w3.org/1999/xlink" @@ -17,108 +18,85 @@ xmlns:svg="http://www.w3.org/2000/svg" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:cc="http://creativecommons.org/ns#" - xmlns:dc="http://purl.org/dc/elements/1.1/"> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Ibis - - Ibis - - - Data libraries - Data libraries - - - - - - .plot() API - .plot() APIRepresentation - RepresentationPlotting output - Plotting output - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - NetworkX - NetworkX - - - - - - - - - - - - - - - - - - - - - - - - - - + 
style="fill:none;fill-opacity:1;fill-rule:evenodd;stroke:#2c7fb8;stroke-width:3.48217;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" /> diff --git a/doc/index.md b/doc/index.md index 0a3d1171f..1f9396819 100644 --- a/doc/index.md +++ b/doc/index.md @@ -101,6 +101,7 @@ alt: Works with GeoPandas align: center --- ::: + :::{tab-item} Polars ```python import polars @@ -116,6 +117,24 @@ align: center --- ::: +:::{tab-item} DuckDB +```python +import duckdb +import hvplot.duckdb +from bokeh.sampledata.autompg import autompg_clean as df + +df_duckdb = duckdb.from_df(df) +table = df_duckdb.groupby(['origin', 'mfr'])['mpg'].mean().sort_values().tail(5) +table.hvplot.barh('mfr', 'mpg', by='origin', stacked=True) +``` +```{image} ./_static/home/pandas.gif +--- +alt: Works with DuckDB +align: center +--- +``` + +::: :::{tab-item} Intake ```python import hvplot.intake diff --git a/doc/user_guide/Integrations.ipynb b/doc/user_guide/Integrations.ipynb index 39ce9f3f1..76676d013 100644 --- a/doc/user_guide/Integrations.ipynb +++ b/doc/user_guide/Integrations.ipynb @@ -254,19 +254,13 @@ }, { "cell_type": "markdown", - "id": "a46e377e-729a-4f99-b5d3-83b0736cb8a3", + "id": "7474a792-2cfd-4139-a1cd-872f913fa07b", "metadata": {}, "source": [ ":::{note}\n", "Added in version `0.9.0`.\n", - ":::" - ] - }, - { - "cell_type": "markdown", - "id": "7474a792-2cfd-4139-a1cd-872f913fa07b", - "metadata": {}, - "source": [ + ":::\n", + "\n", ":::{important}\n", "While other data sources like `Pandas` or `Dask` have built-in support in HoloViews, as of version 1.17.1 this is not yet the case for `Polars`. You can track this [issue](https://github.com/holoviz/holoviews/issues/5939) to follow the evolution of this feature in HoloViews. 
Internally hvPlot simply selects the columns that contribute to the plot and casts them to a Pandas object using Polars' `.to_pandas()` method.\n", ":::" @@ -327,6 +321,111 @@ "df_polars['A'].hvplot.line(height=150)" ] }, + { + "cell_type": "markdown", + "id": "efc2f45e", + "metadata": {}, + "source": [ + "#### DuckDB" + ] + }, + { + "cell_type": "markdown", + "id": "db91860c", + "metadata": {}, + "source": [ + ":::{note}\n", + "Added in version `0.11.0`.\n", + ":::" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d6460d0", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "df_pandas = pd.DataFrame(np.random.randn(1000, 4), columns=list('ABCD')).cumsum()\n", + "df_pandas.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21638d45", + "metadata": {}, + "outputs": [], + "source": [ + "import hvplot.duckdb # noqa \n", + "import duckdb\n", + "\n", + "connection = duckdb.connect(':memory:')\n", + "relation = duckdb.from_df(df_pandas, connection=connection)\n", + "relation.to_view(\"example_view\");" + ] + }, + { + "cell_type": "markdown", + "id": "40b56f16", + "metadata": {}, + "source": [ + "`.hvplot()` supports [DuckDB](https://duckdb.org/docs/api/python/overview.html) `DuckDBPyRelation` and `DuckDBPyConnection` objects." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f588e3fe", + "metadata": {}, + "outputs": [], + "source": [ + "relation.hvplot.line(y=['A', 'B', 'C', 'D'], height=150)" + ] + }, + { + "cell_type": "markdown", + "id": "68a47856", + "metadata": {}, + "source": [ + "`DuckDBPyRelation` is a bit more optimized because it handles column subsetting directly within DuckDB before the data is converted to a `pd.DataFrame`.\n", + "\n", + "So, it's a good idea to use the `connection.sql()` method when possible, which gives you a `DuckDBPyRelation`, instead of `connection.execute()`, which returns a `DuckDBPyConnection`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "214c60ee", + "metadata": {}, + "outputs": [], + "source": [ + "sql_expr = \"SELECT * FROM example_view WHERE A > 0 AND B > 0\"\n", + "connection.sql(sql_expr).hvplot.line(y=['A', 'B'], hover_cols=[\"C\"], height=150) # subsets A, B, C" + ] + }, + { + "cell_type": "markdown", + "id": "2a2f61d4", + "metadata": {}, + "source": [ + "Alternatively, you can directly subset the desired columns in the SQL expression." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ce25c3d", + "metadata": {}, + "outputs": [], + "source": [ + "sql_expr = \"SELECT A, B, C FROM example_view WHERE A > 0 AND B > 0\"\n", + "connection.execute(sql_expr).hvplot.line(y=['A', 'B'], hover_cols=[\"C\"], height=150)" + ] + }, { "cell_type": "markdown", "id": "25a6e724-6a84-4bff-9108-ac71dcfa9116", diff --git a/doc/user_guide/Introduction.ipynb b/doc/user_guide/Introduction.ipynb index 0a25dec8b..a15e91a94 100644 --- a/doc/user_guide/Introduction.ipynb +++ b/doc/user_guide/Introduction.ipynb @@ -15,6 +15,7 @@ "\n", "* [Pandas](https://pandas.pydata.org): DataFrame, Series (columnar/tabular data)\n", "* [Rapids cuDF](https://docs.rapids.ai/api/cudf/stable/): GPU DataFrame, Series (columnar/tabular data)\n", + "* [DuckDB](https://www.duckdb.org/): DuckDB is a fast in-process analytical database\n", "* [Polars](https://www.pola.rs/): Polars is a fast DataFrame library/in-memory query engine (columnar/tabular data)\n", "* [Dask](https://www.dask.org): DataFrame, Series (distributed/out of core arrays and columnar data)\n", "* [XArray](https://xarray.pydata.org): Dataset, DataArray (labelled multidimensional arrays)\n", diff --git a/envs/py3.10-tests.yaml b/envs/py3.10-tests.yaml index 34c9b6d0c..6d17eab82 100644 --- a/envs/py3.10-tests.yaml +++ b/envs/py3.10-tests.yaml @@ -21,6 +21,7 @@ dependencies: - dask - dask>=2021.3.0 - datashader>=0.6.5 + - duckdb - fiona - fugue - fugue-sql-antlr>=0.2.0 diff --git 
a/envs/py3.11-docs.yaml b/envs/py3.11-docs.yaml index f8c29d248..8df704288 100644 --- a/envs/py3.11-docs.yaml +++ b/envs/py3.11-docs.yaml @@ -20,6 +20,7 @@ dependencies: - colorcet>=2 - dask>=2021.3.0 - datashader>=0.6.5 + - duckdb - fiona - fugue - fugue-sql-antlr>=0.2.0 diff --git a/envs/py3.11-tests.yaml b/envs/py3.11-tests.yaml index a13f4d400..292decce8 100644 --- a/envs/py3.11-tests.yaml +++ b/envs/py3.11-tests.yaml @@ -21,6 +21,7 @@ dependencies: - dask - dask>=2021.3.0 - datashader>=0.6.5 + - duckdb - fiona - fugue - fugue-sql-antlr>=0.2.0 diff --git a/envs/py3.12-tests.yaml b/envs/py3.12-tests.yaml index 2f9e4d653..a429d17eb 100644 --- a/envs/py3.12-tests.yaml +++ b/envs/py3.12-tests.yaml @@ -21,6 +21,7 @@ dependencies: - dask - dask>=2021.3.0 - datashader>=0.6.5 + - duckdb - fiona - fugue - fugue-sql-antlr>=0.2.0 diff --git a/envs/py3.9-tests.yaml b/envs/py3.9-tests.yaml index 45bfee438..5b354ff7f 100644 --- a/envs/py3.9-tests.yaml +++ b/envs/py3.9-tests.yaml @@ -20,6 +20,7 @@ dependencies: - dask - dask>=2021.3.0 - datashader>=0.6.5 + - duckdb - fiona - fugue - fugue-sql-antlr>=0.2.0 diff --git a/hvplot/converter.py b/hvplot/converter.py index 57baa2f04..f8d34800f 100644 --- a/hvplot/converter.py +++ b/hvplot/converter.py @@ -55,6 +55,7 @@ is_tabular, is_series, is_dask, + is_duckdb, is_intake, is_cudf, is_streamz, @@ -1088,6 +1089,9 @@ def _process_data( elif is_dask(data): datatype = 'dask' self.data = data.persist() if persist else data + elif is_duckdb(data): + datatype = 'duckdb' + self.data = data elif is_cudf(data): datatype = 'cudf' self.data = data diff --git a/hvplot/duckdb.py b/hvplot/duckdb.py new file mode 100644 index 000000000..3f53bb630 --- /dev/null +++ b/hvplot/duckdb.py @@ -0,0 +1,27 @@ +"""Adds the `.hvplot` method to duckdb.DuckDBPyRelation and duckdb.DuckDBPyConnection""" + + +def patch(name='hvplot', interactive='interactive', extension='bokeh', logo=False): + from hvplot.plotting.core import hvPlotTabularDuckDB + from . 
import post_patch, _module_extensions + + if 'hvplot.duckdb' not in _module_extensions: + try: + import duckdb + except ImportError: + raise ImportError( + 'Could not patch plotting API onto DuckDB. DuckDB could not be imported.' + ) + + # Patching for DuckDBPyRelation and DuckDBPyConnection + _patch_duckdb_plot = lambda self: hvPlotTabularDuckDB(self) # noqa: E731 + _patch_duckdb_plot.__doc__ = hvPlotTabularDuckDB.__call__.__doc__ + plot_prop_duckdb = property(_patch_duckdb_plot) + setattr(duckdb.DuckDBPyRelation, name, plot_prop_duckdb) + setattr(duckdb.DuckDBPyConnection, name, plot_prop_duckdb) + _module_extensions.add('hvplot.duckdb') + + post_patch(extension, logo) + + +patch() diff --git a/hvplot/plotting/__init__.py b/hvplot/plotting/__init__.py index d58ad1ace..e5038abd7 100644 --- a/hvplot/plotting/__init__.py +++ b/hvplot/plotting/__init__.py @@ -1,5 +1,5 @@ import holoviews as hv -from ..util import with_hv_extension, is_polars +from ..util import with_hv_extension, is_duckdb, is_polars from .core import hvPlot, hvPlotTabular # noqa @@ -34,6 +34,11 @@ def plot(data, kind, **kwargs): from .core import hvPlotTabularPolars return hvPlotTabularPolars(data)(kind=kind, **no_none_kwargs) + + elif is_duckdb(data): + from .core import hvPlotTabularDuckDB + + return hvPlotTabularDuckDB(data)(kind=kind, **no_none_kwargs) return hvPlotTabular(data)(kind=kind, **no_none_kwargs) diff --git a/hvplot/plotting/core.py b/hvplot/plotting/core.py index 25defa6e9..bd445ca5f 100644 --- a/hvplot/plotting/core.py +++ b/hvplot/plotting/core.py @@ -1864,6 +1864,89 @@ def labels(self, x=None, y=None, text=None, **kwds): return self(x, y, text=text, kind='labels', **kwds) +class hvPlotTabularDuckDB(hvPlotTabular): + def _get_converter(self, x=None, y=None, kind=None, **kwds): + import duckdb + from duckdb.typing import ( + BIGINT, + FLOAT, + DOUBLE, + INTEGER, + SMALLINT, + TINYINT, + UBIGINT, + UINTEGER, + USMALLINT, + UTINYINT, + HUGEINT, + ) + + params = dict(self._metadata, 
**kwds) + x = x or params.pop('x', None) + y = y or params.pop('y', None) + kind = kind or params.pop('kind', None) + + # Handle DuckDB Relation and Connection objects + if isinstance(self._data, (duckdb.DuckDBPyConnection, duckdb.DuckDBPyRelation)): + if isinstance(self._data, duckdb.DuckDBPyConnection): + data = self._data.df() + else: + data = self._data + + if params.get('hover_cols') != 'all': + data_columns = data.columns + possible_columns = [ + [v] if isinstance(v, str) else v + for v in params.values() + if isinstance(v, (str, list)) + ] + + columns = (set(data_columns) & set(itertools.chain(*possible_columns))) or { + data_columns[0] + } + if y is None: + # When y is not specified HoloViewsConverter finds all the numeric + # columns and use them as y values (see _process_chart_y). We need + # to include these columns too. + + if isinstance(data, duckdb.DuckDBPyRelation): + numeric_columns = data.select_types( + [ + BIGINT, + FLOAT, + DOUBLE, + INTEGER, + SMALLINT, + TINYINT, + UBIGINT, + UINTEGER, + USMALLINT, + UTINYINT, + HUGEINT, + ] + ).columns + else: + numeric_columns = data.select_dtypes(include='number').columns + columns |= set(numeric_columns) + xs = x if is_list_like(x) else (x,) + ys = y if is_list_like(y) else (y,) + columns |= {*xs, *ys} + columns.discard(None) + + if isinstance(data, duckdb.DuckDBPyRelation): + columns = sorted(columns, key=lambda c: data_columns.index(c)) + data = data.select(*columns).to_df() + else: + columns = sorted(columns, key=lambda c: data.columns.get_loc(c)) + data = data[list(columns)] + else: + raise ValueError( + 'Only duckdb.DuckDBPyConnection and duckdb.DuckDBPyRelation are supported' + ) + + return HoloViewsConverter(data, x, y, kind=kind, **params) + + class hvPlotTabularPolars(hvPlotTabular): def _get_converter(self, x=None, y=None, kind=None, **kwds): import polars as pl diff --git a/hvplot/tests/testpatch.py b/hvplot/tests/testpatch.py index ae05e488d..9ed865ae6 100644 --- a/hvplot/tests/testpatch.py +++ 
b/hvplot/tests/testpatch.py @@ -128,3 +128,30 @@ def test_polars_lazyframe_patched(self): pldf = pl.LazyFrame({'x': [1, 3, 5], 'y': [2, 4, 6]}) self.assertIsInstance(pldf.hvplot, hvPlotTabular) + + +class TestPatchDuckDB(TestCase): + def setUp(self): + try: + import duckdb # noqa + except ImportError: + raise SkipTest('DuckDB not available') + import hvplot.duckdb # noqa + + def test_duckdb_relation_patched(self): + import duckdb + + df = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 2, 3]}) + connection = duckdb.connect(':memory:') + relation = duckdb.from_df(df, connection=connection) + self.assertIsInstance(relation.hvplot, hvPlotTabular) + + def test_duckdb_connection_patched(self): + import duckdb + + df = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 2, 3]}) + connection = duckdb.connect(':memory:') + duckdb.from_df(df, connection=connection).to_view('test_connection') + self.assertIsInstance( + connection.execute('SELECT * FROM test_connection').hvplot, hvPlotTabular + ) diff --git a/hvplot/tests/testplotting.py b/hvplot/tests/testplotting.py index e7c747e0b..bdf54b139 100644 --- a/hvplot/tests/testplotting.py +++ b/hvplot/tests/testplotting.py @@ -4,10 +4,14 @@ from unittest import TestCase +import pytest import pandas as pd from parameterized import parameterized +import holoviews as hv +from hvplot.plotting import plot +from hvplot.tests.util import makeDataFrame from hvplot.converter import HoloViewsConverter no_args = ['line', 'area', 'hist', 'box', 'kde', 'density', 'bar', 'barh'] @@ -50,3 +54,20 @@ def test_pandas_dataframe_plot_does_not_implement_pie(self): class TestPandasHvplotPlotting(TestPandasHoloviewsPlotting): def setUp(self): pd.options.plotting.backend = 'hvplot' + + +def test_plot_supports_duckdb_relation(): + duckdb = pytest.importorskip('duckdb') + connection = duckdb.connect(':memory:') + relation = duckdb.from_df(makeDataFrame(), connection=connection) + out = plot(relation, 'line') + assert isinstance(out, hv.NdOverlay) + + +def 
test_plot_supports_duckdb_connection(): + duckdb = pytest.importorskip('duckdb') + connection = duckdb.connect(':memory:') + relation = duckdb.from_df(makeDataFrame(), connection=connection) + relation.to_view('test') + out = plot(connection.execute('SELECT * FROM test'), 'line') + assert isinstance(out, hv.NdOverlay) diff --git a/hvplot/util.py b/hvplot/util.py index a0ffe3361..8a8c1079c 100644 --- a/hvplot/util.py +++ b/hvplot/util.py @@ -398,6 +398,14 @@ def is_dask(data): return isinstance(data, (dd.DataFrame, dd.Series)) +def is_duckdb(data): + if not check_library(data, 'duckdb'): + return False + import duckdb + + return isinstance(data, (duckdb.DuckDBPyRelation, duckdb.DuckDBPyConnection)) + + def is_polars(data): if not check_library(data, 'polars'): return False diff --git a/pyproject.toml b/pyproject.toml index fb8d936fd..5f8fe5a8c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,6 +80,7 @@ tests = [ "polars", "dask", "spatialpandas", + "duckdb", ] # In 0.9 fugue added the sql extra but didn't add a fugue-sql package, removing the sql deps from fugue # Adding them manually here @@ -107,6 +108,7 @@ graphviz = [ examples = [ "dask[dataframe] >=2021.3.0", "datashader >=0.6.5", + "duckdb", "fugue[sql]", "hvplot[fugue-sql]", "ibis-framework[duckdb]", # ibis-duckdb on conda