From e1045a85af0d1751fe0a6b4278db577ab01c2b50 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 3 Jan 2025 15:32:30 +0100 Subject: [PATCH] Remove dask-expr files --- README.md | 64 -------- demo.ipynb | 423 ----------------------------------------------------- setup.cfg | 10 -- 3 files changed, 497 deletions(-) delete mode 100644 README.md delete mode 100644 demo.ipynb delete mode 100644 setup.cfg diff --git a/README.md b/README.md deleted file mode 100644 index 38937b045b2..00000000000 --- a/README.md +++ /dev/null @@ -1,64 +0,0 @@ -Dask Expressions -================ - -Dask DataFrames with query optimization. - -This is a rewrite of Dask DataFrame that includes query -optimization and generally improved organization. - -More in our blog posts: -- [Dask Expressions overview](https://blog.dask.org/2023/08/25/dask-expr-introduction) -- [TPC-H benchmark results vs. Dask DataFrame](https://docs.coiled.io/blog/tpch.html) - -Example -------- - -```python -import dask_expr as dx - -df = dx.datasets.timeseries() -df.head() - -df.groupby("name").x.mean().compute() -``` - -Query Representation --------------------- - -Dask-expr encodes user code in an expression tree: - -```python ->>> df.x.mean().pprint() - -Mean: - Projection: columns='x' - Timeseries: seed=1896674884 -``` - -This expression tree will be optimized and modified before execution: - -```python ->>> df.x.mean().optimize().pprint() - -Div: - Sum: - Fused(375f9): - | Projection: columns='x' - | Timeseries: dtypes={'x': } seed=1896674884 - Count: - Fused(375f9): - | Projection: columns='x' - | Timeseries: dtypes={'x': } seed=1896674884 -``` - -Stability ---------- - -This is the default backend for dask.DataFrame since version 2024.3.0. - -API Coverage ------------- - -Dask-Expr covers almost everything of the Dask DataFrame API. The only missing features are: - -- named GroupBy Aggregations diff --git a/demo.ipynb b/demo.ipynb deleted file mode 100644 index 7ed28e327f1..00000000000 --- a/demo.ipynb +++ /dev/null @@ -1,423 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "9bbe3374-c6d7-4b2c-a55f-1290067c6ae5", - "metadata": {}, - "source": [ - "# Dask Match Demonstration" - ] - }, - { - "cell_type": "markdown", - "id": "3efeda20-6f1d-4ae2-996a-d9f4fe09328d", - "metadata": {}, - "source": [ - "We make a Dask client, mostly so we can use the dashboard." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "16afe6ff-446c-4ccc-8b1f-87fedcc5e66b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from dask.distributed import Client\n", - "client = Client()" - ] - }, - { - "cell_type": "markdown", - "id": "304c473e-0a18-4255-a6b1-2611bdb2d8ca", - "metadata": {}, - "source": [ - "## Example DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2403e22e-8da0-47d6-b323-f1812c258039", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import dask_expr as dx\n", - "\n", - "df = dx.datasets.timeseries(\n", - " start=\"2000-01-01\", \n", - " end=\"2000-12-30\", \n", - " freq=\"100ms\",\n", - ")\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5cd1714-d542-4a97-b27b-0471a4283103", - "metadata": {}, - "outputs": [], - "source": [ - "df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "988abc5b-e068-496f-b72b-6f11e7d9afc8", - "metadata": {}, - "source": [ - "## We can do high level query optimization" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17a0ac65-1e28-4200-b944-05f40cbceabc", - "metadata": {}, - "outputs": [], - "source": [ - "out = df[df.id == 1000].sum()[\"x\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7665e2d-ab78-4fe0-b20a-5ee7790c15ad", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%%time\n", - "out.compute()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f96fd6be-3be3-4520-8c9c-d96c52506e95", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "out.optimize().compute()" - ] - }, - { - "cell_type": "markdown", - "id": "77ebc25f-7386-47b1-a337-4df5b9fd4d2f", - "metadata": {}, - "source": [ - "## Let's inspect what's going on\n", - "\n", - "In database terms: \n", - "\n", - "- Historically Dask DataFrame/Array never had a logical plan.\n", - "- Instead we wrote everything immediately as a physical plan.\n", - "\n", - " This was great for flexibility and expressivity, which is what early users craved, but holds us back now as we target less sophisticated users.\n", - "\n", - "Now we're adding a logical plan around these high level collections." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ccb03266-1eaf-4786-8834-ef69355d6cc5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "out.pprint()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d66995e-3042-428a-95ca-a55c1b126cb6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "out.optimize(fuse=False).pprint()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e770d156-9c1d-4550-b754-49a606d99494", - "metadata": {}, - "outputs": [], - "source": [ - "out.optimize().pprint()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "240b241a-4c8e-4941-a3f8-d49fad40e264", - "metadata": {}, - "outputs": [], - "source": [ - "df.x + 1 # good" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "965c122d-bb5e-4bff-9be7-0ec0d58ff535", - "metadata": {}, - "outputs": [], - "source": [ - "(df + 1).x # bad" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "117f2bcf-246d-44ff-a233-ea65a16c3e2f", - "metadata": {}, - "outputs": [], - "source": [ - "out.expr.visualize()" - ] - }, - { - "cell_type": "markdown", - "id": "e10082f7-7044-459f-b845-f258f181958c", - "metadata": {}, - "source": [ - "## Let's look at expressions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ffd9a54-0a43-474a-8946-3b1a60e09f47", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# Dask dataframe/series class to match user expected API\n", - "\n", - "out = (df.x + 1)\n", - "out" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0ac9d05-a2a2-4372-a0de-e43b706e6acd", - "metadata": {}, - "outputs": [], - "source": [ - "# Holds an expression object, which captures user intent\n", - "\n", - "out.expr" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e552c770-a368-4574-8b26-0dafd97ed6d2", - "metadata": {}, - "outputs": [], - "source": [ - "# Operation is stored in the type\n", - "\n", - "type(out.expr)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "562e8a1c-f29c-4978-b593-c5594b9ae6b0", - "metadata": {}, - "outputs": [], - "source": [ - "# Follows a standard class hierarchy\n", - "\n", - "type(out.expr).mro()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "953aa941-cbd3-4be2-a9dc-4b5a21ac6431", - "metadata": {}, - "outputs": [], - "source": [ - "# Most optimizations written on the opertions themselves\n", - "\n", - "out.expr._simplify_down??" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6536b18-2d4a-49cb-9098-8d40d9e3eba2", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# State managed as parameters (names)\n", - "\n", - "out.expr._parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "81f2e534-ad51-4342-af45-9fb43385405a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# ... and operands (values) which are often other expressions\n", - "\n", - "out.expr.operands" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbd05d9f-58c0-4f7b-995a-8c97f349c428", - "metadata": {}, - "outputs": [], - "source": [ - "out.expr.left" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62e41fd3-acb9-43a3-910a-1326983ea356", - "metadata": {}, - "outputs": [], - "source": [ - "out.expr.right" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9498546d-7d32-49aa-8cc7-f881334a379e", - "metadata": {}, - "outputs": [], - "source": [ - "type(out.expr.left)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ee8a172-0222-43cd-a38a-6ccbf5867b32", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "dict(\n", - " zip(\n", - " out.left._parameters, \n", - " out.left.operands,\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5fdb28f2-2a36-429a-a77d-028ff9ced87a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "dict(\n", - " zip(\n", - " out.left.frame._parameters, \n", - " out.left.frame.operands,\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f078cdc9-f994-4f94-99b4-d3aeeddc8e03", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "id": "0f114a9f-6280-44cb-982d-9582c92c1c77", - "metadata": {}, - "source": [ - "## What works today and what doesn't" - ] - }, - { - "cell_type": "markdown", - "id": "2fc3e3ba-b948-4b95-b877-bc3a02e9a9ef", - "metadata": {}, - "source": [ - "#### Works\n", - "\n", - "- Native Dask collection\n", - "- Standard optimizations (column projection, predicate pushdown, ...)\n", - "- POC on most operation types\n", - " - blockwise\n", - " - reductions\n", - " - groupby aggregations\n", - " - sorts/shuffling\n", - " - data ingestion (like parquet)\n", - "\n", - "It also feels pretty clean and easy to work on from a maintainability perspective\n", - "\n", - "#### Doesn't work\n", - "\n", - "- API completeness (lots of fill-in to do)\n", - "- Data Writing (but this is easy)\n", - "- Task Annotations / priorities / worker restrictions\n", - "\n", - "#### Future plans\n", - "\n", - "- Adding new protocols, like parquet-style metadata (working on this now)\n", - "- Keep the expressions on the Scheduler (can probably make better decisions)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 68c17823c1a..00000000000 --- a/setup.cfg +++ /dev/null @@ -1,10 +0,0 @@ -[isort] -sections = FUTURE,STDLIB,THIRDPARTY,DISTRIBUTED,FIRSTPARTY,LOCALFOLDER -profile = black -skip_gitignore = true -force_to_top = true -default_section = THIRDPARTY - - -[aliases] -test = pytest