Merge main into datafusion-sql-planner #654

Merged
Changes from all commits (45 commits)
- 23603de  Add basic predicate-pushdown optimization (#433)  (rjzamora, Mar 25, 2022)
- 09c7bdf  Add workflow to keep datafusion dev branch up to date (#440)  (charlesbluca, Mar 25, 2022)
- 1b0b6f7  Update gpuCI `RAPIDS_VER` to `22.06` (#434)  (github-actions[bot], Apr 1, 2022)
- a05138d  Bump black to 22.3.0 (#443)  (charlesbluca, Apr 4, 2022)
- ab2aa5a  Check for ucx-py nightlies when updating gpuCI (#441)  (charlesbluca, Apr 5, 2022)
- a28f757  Add handling for newer `prompt_toolkit` versions in cmd tests (#447)  (charlesbluca, Apr 6, 2022)
- 486fc66  Fix version for gha-find-replace (#446)  (charlesbluca, Apr 6, 2022)
- ce176e0  Update versions of Java dependencies (#445)  (ayushdg, Apr 7, 2022)
- 50d95d2  Update jackson databind version (#449)  (ayushdg, Apr 7, 2022)
- 37a3a61  Disable SQL server functionality (#448)  (charlesbluca, Apr 7, 2022)
- ffdc42f  Update dask pinnings for release (#450)  (charlesbluca, Apr 7, 2022)
- fa74aef  Add Java source code to source distribution (#451)  (charlesbluca, Apr 7, 2022)
- 37ea6b6  Bump `httpclient` dependency (#453)  (charlesbluca, Apr 8, 2022)
- f19ee4d  Unpin Dask/distributed versions (#452)  (charlesbluca, Apr 11, 2022)
- 1eb30c1  Add jsonschema to ci testing (#454)  (ayushdg, Apr 11, 2022)
- 2bd1d18  Switch tests from `pd.testing.assert_frame_equal` to `dd.assert_eq` (…  (charlesbluca, Apr 11, 2022)
- 95b0dd0  Set max pin on antlr4-python-runtime (#456)  (ayushdg, Apr 12, 2022)
- 031c04c  Move / minimize number of cudf / dask-cudf imports (#480)  (charlesbluca, Apr 19, 2022)
- 48eb983  Use `map_partitions` to compute LIMIT / OFFSET (#517)  (charlesbluca, May 13, 2022)
- 7b4bc55  Use `dev` images for independent cluster testing (#518)  (charlesbluca, May 16, 2022)
- b58989f  Add documentation for FugueSQL integrations (#523)  (charlesbluca, May 16, 2022)
- cb3d903  Timestampdiff support (#495)  (ayushdg, May 17, 2022)
- 8ec3ed5  Relax jsonschema testing dependency (#546)  (charlesbluca, May 20, 2022)
- ff4a8a5  Update upstream testing workflows (#536)  (charlesbluca, May 23, 2022)
- cb55c07  Fix pyarrow / cloudpickle failures in cluster testing (#553)  (charlesbluca, May 24, 2022)
- d8302e9  Use bash -l as default entrypoint for all jobs (#552)  (charlesbluca, May 24, 2022)
- 0d0394a  Constrain dask/distributed for release (#563)  (charlesbluca, Jun 3, 2022)
- 1e881ee  Unpin dask/distributed for development (#564)  (charlesbluca, Jun 3, 2022)
- 243c809  update dask-sphinx-theme (#567)  (scharlottej13, Jun 6, 2022)
- ec3d5da  Make sure scheduler has Dask nightlies in upstream cluster testing (#…  (charlesbluca, Jun 7, 2022)
- c19315a  Update gpuCI `RAPIDS_VER` to `22.08` (#565)  (github-actions[bot], Jun 7, 2022)
- bc1cadc  Modify test environment pinnings to cover minimum versions (#555)  (charlesbluca, Jun 15, 2022)
- 0db4506  Don't move jar to local mvn repo (#579)  (ksonj, Jun 15, 2022)
- ddc26ee  Add max version constraint for `fugue` (#639)  (charlesbluca, Jul 22, 2022)
- 8a73309  Add environment file & documentation for GPU tests (#633)  (charlesbluca, Jul 25, 2022)
- 20daf89  Validate UDF metadata (#641)  (brandon-b-miller, Jul 26, 2022)
- 712a2af  Set Dask-sql as the default Fugue Dask engine when installed (#640)  (Jul 26, 2022)
- 15ed09e  Merge remote-tracking branch 'upstream/main' into merge-upstream-main  (charlesbluca, Aug 1, 2022)
- a9be03a  Add Rust setup to upstream testing workflow  (charlesbluca, Aug 1, 2022)
- 138e70e  Resolve style failures  (charlesbluca, Aug 1, 2022)
- a0c6344  Bump fugue version in CI envs  (charlesbluca, Aug 1, 2022)
- 73f5228  Add back scalar case for cast operation  (charlesbluca, Aug 1, 2022)
- 71403fe  Resolve UDF failures  (charlesbluca, Aug 1, 2022)
- c55de4c  Resolve UDF failures for windows  (charlesbluca, Aug 1, 2022)
- 75a2449  Remove calcite-specific reinterpret  (charlesbluca, Aug 1, 2022)
3 changes: 0 additions & 3 deletions .github/cluster.yml
@@ -5,9 +5,6 @@ services:
container_name: dask-scheduler
image: daskdev/dask:dev-py3.9
command: dask-scheduler
environment:
USE_MAMBA: "true"
EXTRA_CONDA_PACKAGES: "cloudpickle>=1.5.0" # match client cloudpickle version
ports:
- "8786:8786"
dask-worker:
46 changes: 27 additions & 19 deletions .github/workflows/test-upstream.yml
@@ -13,6 +13,7 @@ jobs:
test-dev:
name: "Test upstream dev (${{ matrix.os }}, python: ${{ matrix.python }})"
runs-on: ${{ matrix.os }}
if: github.repository == 'dask-contrib/dask-sql'
env:
CONDA_FILE: continuous_integration/environment-${{ matrix.python }}-dev.yaml
defaults:
@@ -37,6 +38,15 @@
channels: dask/label/dev,conda-forge,nodefaults
activate-environment: dask-sql
environment-file: ${{ env.CONDA_FILE }}
- name: Setup Rust Toolchain
uses: actions-rs/toolchain@v1
id: rust-toolchain
with:
toolchain: stable
override: true
- name: Build the Rust DataFusion bindings
run: |
python setup.py build install
- name: Install hive testing dependencies for Linux
if: matrix.os == 'ubuntu-latest'
run: |
@@ -57,11 +67,6 @@
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Cache local Maven repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-v1-jdk11-${{ hashFiles('**/pom.xml') }}
- name: Set up Python
uses: conda-incubator/setup-miniconda@v2
with:
@@ -71,12 +76,16 @@
channel-priority: strict
channels: dask/label/dev,conda-forge,nodefaults
activate-environment: dask-sql
environment-file: continuous_integration/environment-3.9-jdk11-dev.yaml
- name: Download the pre-build jar
uses: actions/download-artifact@v1
environment-file: continuous_integration/environment-3.9-dev.yaml
- name: Setup Rust Toolchain
uses: actions-rs/toolchain@v1
id: rust-toolchain
with:
name: jar
path: dask_sql/jar/
toolchain: stable
override: true
- name: Build the Rust DataFusion bindings
run: |
python setup.py build install
- name: Install cluster dependencies
run: |
mamba install python-blosc lz4 -c conda-forge
@@ -107,23 +116,22 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Cache local Maven repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-v1-jdk11-${{ hashFiles('**/pom.xml') }}
- name: Set up Python
uses: conda-incubator/setup-miniconda@v2
with:
python-version: "3.8"
mamba-version: "*"
channels: dask/label/dev,conda-forge,nodefaults
channel-priority: strict
- name: Download the pre-build jar
uses: actions/download-artifact@v1
- name: Setup Rust Toolchain
uses: actions-rs/toolchain@v1
id: rust-toolchain
with:
name: jar
path: dask_sql/jar/
toolchain: stable
override: true
- name: Build the Rust DataFusion bindings
run: |
python setup.py build install
- name: Install upstream dev Dask / dask-ml
if: needs.detect-ci-trigger.outputs.triggered == 'true'
run: |
13 changes: 13 additions & 0 deletions README.md
@@ -124,6 +124,19 @@ You can run the tests (after installation) with

pytest tests

GPU-specific tests require additional dependencies specified in `continuous_integration/gpuci/environment.yaml`.
These can be added to the development environment by running

```
conda env update -n dask-sql -f continuous_integration/gpuci/environment.yaml
```

And GPU-specific tests can be run with

```
pytest tests -m gpu --rungpu
```

## SQL Server

`dask-sql` comes with a small test implementation for a SQL server.
11 changes: 1 addition & 10 deletions continuous_integration/environment-3.10-dev.yaml
@@ -6,6 +6,7 @@ dependencies:
- dask-ml>=2022.1.22
- dask>=2022.3.0
- fastapi>=0.69.0
- fugue>=0.7.0
- intake>=0.6.0
- jsonschema
- lightgbm
@@ -31,13 +32,3 @@
- tpot
- tzlocal>=2.1
- uvicorn>=0.11.3
# fugue dependencies; remove when we conda install fugue
- adagio
- antlr4-python3-runtime<4.10
- ciso8601
- fs
- pip
- qpd
- triad
- pip:
- fugue[sql]>=0.5.3
11 changes: 1 addition & 10 deletions continuous_integration/environment-3.8-dev.yaml
@@ -6,6 +6,7 @@ dependencies:
- dask-ml=2022.1.22
- dask=2022.3.0
- fastapi=0.69.0
- fugue=0.7.0
- intake=0.6.0
- jsonschema
- lightgbm
@@ -31,13 +32,3 @@
- tpot
- tzlocal=2.1
- uvicorn=0.11.3
# fugue dependencies; remove when we conda install fugue
- adagio
- antlr4-python3-runtime<4.10
- ciso8601
- fs
- pip
- qpd
- triad
- pip:
- fugue[sql]==0.5.3
11 changes: 1 addition & 10 deletions continuous_integration/environment-3.9-dev.yaml
@@ -6,6 +6,7 @@ dependencies:
- dask-ml>=2022.1.22
- dask>=2022.3.0
- fastapi>=0.69.0
- fugue>=0.7.0
- intake>=0.6.0
- jsonschema
- lightgbm
@@ -31,13 +32,3 @@
- tpot
- tzlocal>=2.1
- uvicorn>=0.11.3
# fugue dependencies; remove when we conda install fugue
- adagio
- antlr4-python3-runtime<4.10
- ciso8601
- fs
- pip
- qpd
- triad
- pip:
- fugue[sql]>=0.5.3
17 changes: 17 additions & 0 deletions continuous_integration/gpuci/environment.yaml
@@ -0,0 +1,17 @@
name: gpuci
channels:
- rapidsai
- rapidsai-nightly
- nvidia
dependencies:
- rust>=1.60.0
- setuptools-rust>=1.2.0
- cudatoolkit=11.5
- cudf=22.08
- cuml=22.08
- dask-cudf=22.08
- dask-cuda=22.08
- numpy>=1.20.0
- ucx-proc=*=gpu
- ucx-py=0.27
- xgboost=*=cuda_*
22 changes: 15 additions & 7 deletions dask_sql/context.py
@@ -780,7 +780,7 @@ def _prepare_schemas(self):
logger.debug("No custom functions defined.")
for function_description in schema.function_lists:
name = function_description.name
sql_return_type = python_to_sql_type(function_description.return_type)
sql_return_type = function_description.return_type
if function_description.aggregation:
logger.debug(f"Adding function '{name}' to schema as aggregation.")
# TODO: Not yet implemented
@@ -800,10 +800,7 @@ def _prepare_schemas(self):
@staticmethod
def _add_parameters_from_description(function_description, dask_function):
for parameter in function_description.parameters:
param_name, param_type = parameter
sql_param_type = python_to_sql_type(param_type)

dask_function.addParameter(param_name, sql_param_type, False)
dask_function.addParameter(*parameter, False)

return dask_function

@@ -884,9 +881,16 @@ def _register_callable(
row_udf: bool = False,
):
"""Helper function to do the function or aggregation registration"""

schema_name = schema_name or self.schema_name
schema = self.schema[schema_name]

# validate and cache UDF metadata
sql_parameters = [
(name, python_to_sql_type(param_type)) for name, param_type in parameters
]
sql_return_type = python_to_sql_type(return_type)

if not aggregation:
f = UDF(f, row_udf, parameters, return_type)
lower_name = name.lower()
Expand All @@ -906,10 +910,14 @@ def _register_callable(
)

schema.function_lists.append(
FunctionDescription(name.upper(), parameters, return_type, aggregation)
FunctionDescription(
name.upper(), sql_parameters, sql_return_type, aggregation
)
)
schema.function_lists.append(
FunctionDescription(name.lower(), parameters, return_type, aggregation)
FunctionDescription(
name.lower(), sql_parameters, sql_return_type, aggregation
)
)
schema.functions[lower_name] = f

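Note: with this change the Python-to-SQL type conversion happens once inside `_register_callable`, so invalid UDF type metadata surfaces at registration time rather than during schema preparation. Below is a minimal sketch of a registration that exercises this path, assuming the public `Context.register_function` / `Context.sql` API; the table and function names are illustrative only.

```python
import numpy as np
import pandas as pd
import dask.dataframe as dd

from dask_sql import Context

c = Context()
c.create_table("t", dd.from_pandas(pd.DataFrame({"x": [1, 2, 3]}), npartitions=1))

def double(x):
    return x * 2

# Parameter and return types are converted to SQL types (and validated)
# here, at registration time.
c.register_function(double, "double", [("x", np.int64)], np.int64)

print(c.sql("SELECT double(x) AS y FROM t").compute())
```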
6 changes: 0 additions & 6 deletions dask_sql/datacontainer.py
@@ -229,11 +229,6 @@ def __init__(self, func, row_udf: bool, params, return_type=None):

self.names = [param[0] for param in params]

if return_type is None:
# These UDFs go through apply and without providing
# a return type, dask will attempt to guess it, and
# dask might be wrong.
raise ValueError("Return type must be provided")
self.meta = (None, return_type)

def __call__(self, *args, **kwargs):
@@ -249,7 +244,6 @@ def __call__(self, *args, **kwargs):
df = column_args[0].to_frame(self.names[0])
for name, col in zip(self.names[1:], column_args[1:]):
df[name] = col

result = df.apply(
self.func, axis=1, args=tuple(scalar_args), meta=self.meta
).astype(self.meta[1])
24 changes: 21 additions & 3 deletions dask_sql/integrations/fugue.py
@@ -1,8 +1,10 @@
try:
import fugue
import fugue_dask
from dask.distributed import Client
from fugue import WorkflowDataFrame, register_execution_engine
from fugue_sql import FugueSQLWorkflow
from triad import run_at_def
from triad.utils.convert import get_caller_global_local_vars
except ImportError: # pragma: no cover
raise ImportError(
@@ -15,9 +17,25 @@

from dask_sql.context import Context

register_execution_engine(
"dask", lambda conf: DaskSQLExecutionEngine(conf), on_dup="overwrite"
)

@run_at_def
def _register_engines() -> None:
"""Register (overwrite) the default Dask execution engine of Fugue. This
function is invoked as an entrypoint, users don't need to call it explicitly.
"""
register_execution_engine(
"dask",
lambda conf, **kwargs: DaskSQLExecutionEngine(conf=conf),
on_dup="overwrite",
)

register_execution_engine(
Client,
lambda engine, conf, **kwargs: DaskSQLExecutionEngine(
dask_client=engine, conf=conf
),
on_dup="overwrite",
)


class DaskSQLEngine(fugue.execution.execution_engine.SQLEngine):
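Note: once these engines are registered (the `run_at_def` entrypoint fires when the fugue integration module is loaded), FugueSQL can be pointed at dask-sql either by the `"dask"` engine name or by passing a live `dask.distributed.Client`. A rough usage sketch follows, assuming `fugue_sql.fsql` is available; exact engine resolution may vary by Fugue version.

```python
import pandas as pd
from dask.distributed import Client
from fugue_sql import fsql

import dask_sql.integrations.fugue  # ensures the engines above are registered

client = Client(processes=False)  # small local cluster, for illustration only
df = pd.DataFrame({"a": [1, 2, 3]})

# Passing the Client as the engine should now resolve to DaskSQLExecutionEngine
# rather than Fugue's built-in Dask engine.
fsql(
    """
    SELECT a, a + 1 AS b FROM df
    PRINT
    """,
    df=df,
).run(client)
```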
26 changes: 17 additions & 9 deletions dask_sql/mappings.py
@@ -86,6 +86,12 @@

def python_to_sql_type(python_type) -> "DaskTypeMap":
"""Mapping between python and SQL types."""

if python_type in (int, float):
python_type = np.dtype(python_type)
elif python_type is str:
python_type = np.dtype("object")

if isinstance(python_type, np.dtype):
python_type = python_type.type

@@ -286,15 +292,17 @@ def cast_column_to_type(col: dd.Series, expected_type: str):
logger.debug("...not converting.")
return None

current_float = pd.api.types.is_float_dtype(current_type)
expected_integer = pd.api.types.is_integer_dtype(expected_type)
if current_float and expected_integer:
logger.debug("...truncating...")
# Currently "trunc" can not be applied to NA (the pandas missing value type),
# because NA is a different type. It works with np.NaN though.
# For our use case, that does not matter, as the conversion to integer later
# will convert both NA and np.NaN to NA.
col = da.trunc(col.fillna(value=np.NaN))
if pd.api.types.is_integer_dtype(expected_type):
if pd.api.types.is_float_dtype(current_type):
logger.debug("...truncating...")
# Currently "trunc" can not be applied to NA (the pandas missing value type),
# because NA is a different type. It works with np.NaN though.
# For our use case, that does not matter, as the conversion to integer later
# will convert both NA and np.NaN to NA.
col = da.trunc(col.fillna(value=np.NaN))
elif pd.api.types.is_timedelta64_dtype(current_type):
logger.debug(f"Explicitly casting from {current_type} to np.int64")
return col.astype(np.int64)

logger.debug(f"Need to cast from {current_type} to {expected_type}")
return col.astype(expected_type)
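Note: the first hunk lets `python_to_sql_type` accept builtin Python annotations (`int`, `float`, `str`) by normalizing them to numpy dtypes before the lookup, and the second adds an explicit timedelta-to-`int64` cast path. A self-contained sketch of just the normalization step (an illustration of the added branch, not the library function itself):

```python
import numpy as np

def normalize_python_type(python_type):
    # Mirror of the branch added above: builtin Python types are mapped
    # onto numpy dtypes before the usual dtype -> SQL type lookup runs.
    if python_type in (int, float):
        python_type = np.dtype(python_type)
    elif python_type is str:
        python_type = np.dtype("object")
    if isinstance(python_type, np.dtype):
        python_type = python_type.type
    return python_type

assert normalize_python_type(float) is np.float64
assert normalize_python_type(str) is np.object_
print(normalize_python_type(int))  # platform default integer, e.g. np.int64
```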
8 changes: 1 addition & 7 deletions dask_sql/physical/rel/logical/aggregate.py
@@ -8,13 +8,6 @@
import pandas as pd
from dask import config as dask_config

try:
import dask_cudf

from dask_planner.rust import LogicalPlan
except ImportError:
dask_cudf = None

from dask_sql.datacontainer import ColumnContainer, DataContainer
from dask_sql.physical.rel.base import BaseRelPlugin
from dask_sql.physical.rex.core.call import IsNullOperation
@@ -23,6 +16,7 @@

if TYPE_CHECKING:
import dask_sql
from dask_planner.rust import LogicalPlan

logger = logging.getLogger(__name__)

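Note: the module-level `dask_cudf` / `LogicalPlan` imports are dropped here; `LogicalPlan` is now imported only under `TYPE_CHECKING`, so importing this plugin no longer requires the Rust planner at runtime. A generic sketch of that pattern (the helper function is hypothetical, not part of dask-sql):

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by static type checkers; no runtime import cost and no
    # hard dependency when the module is merely imported.
    from dask_planner.rust import LogicalPlan

def plan_summary(plan: "LogicalPlan") -> str:
    # Hypothetical helper, just to show the string-annotation usage.
    return repr(plan)
```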
5 changes: 2 additions & 3 deletions dask_sql/physical/rex/core/call.py
@@ -186,7 +186,7 @@ def div(self, lhs, rhs):
# of this function.
if isinstance(result, (datetime.timedelta, np.timedelta64)):
return result
else: # pragma: no cover
else:
return da.trunc(result).astype(np.int64)


@@ -960,8 +960,7 @@ def convert(
]

# Now use the operator name in the mapping
# TODO: obviously this needs to not be hardcoded but not sure of the best place to pull the value from currently???
schema_name = "root"
schema_name = context.schema_name
operator_name = expr.getOperatorName().lower()

try: