diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index 3d79aab64..12411fe06 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -22,21 +22,21 @@ jobs:
       - name: Check out the repo
         uses: actions/checkout@v3
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@v1
+        uses: docker/setup-qemu-action@v2
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v1
+        uses: docker/setup-buildx-action@v2
       - name: Login to DockerHub
-        uses: docker/login-action@v1
+        uses: docker/login-action@v2
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
       - name: Docker meta for main image
         id: docker_meta_main
-        uses: crazy-max/ghaction-docker-meta@v1
+        uses: crazy-max/ghaction-docker-meta@v4
         with:
           images: nbraun/dask-sql
       - name: Build and push main image
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v3
         with:
           context: .
           file: ./docker/main.dockerfile
@@ -47,11 +47,11 @@ jobs:
         push: true
       - name: Docker meta for cloud image
         id: docker_meta_cloud
-        uses: crazy-max/ghaction-docker-meta@v1
+        uses: crazy-max/ghaction-docker-meta@v4
         with:
           images: nbraun/dask-sql-cloud
       - name: Build and push cloud image
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v3
         with:
           context: .
           file: ./docker/cloud.dockerfile
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index 0cbdabab5..440776f79 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -26,7 +26,7 @@ jobs:
       - uses: actions/checkout@v3
         with:
           fetch-depth: 2
-      - uses: xarray-contrib/ci-trigger@v1.1
+      - uses: xarray-contrib/ci-trigger@v1.2
         id: detect-trigger
         with:
           keyword: "[test-df-upstream]"
diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml
index abd407fac..6696faad2 100644
--- a/.github/workflows/style.yml
+++ b/.github/workflows/style.yml
@@ -14,9 +14,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
-      - uses: actions/setup-python@v2
+      - uses: actions/setup-python@v4
       - uses: actions-rs/toolchain@v1
         with:
           toolchain: nightly
           components: rustfmt
-      - uses: pre-commit/action@v2.0.0
+      - uses: pre-commit/action@v3.0.0
diff --git a/.github/workflows/test-upstream.yml b/.github/workflows/test-upstream.yml
index bd682c114..874d3e62a 100644
--- a/.github/workflows/test-upstream.yml
+++ b/.github/workflows/test-upstream.yml
@@ -73,11 +73,10 @@ jobs:
           mamba install -c conda-forge "sasl>=0.3.1"
           docker pull bde2020/hive:2.3.2-postgresql-metastore
           docker pull bde2020/hive-metastore-postgresql:2.3.0
-      - name: Install upstream dev Dask / dask-ml
+      - name: Install upstream dev Dask
        if: env.which_upstream == 'Dask'
        run: |
          mamba update dask
-          python -m pip install --no-deps git+https://github.com/dask/dask-ml
      - name: Test with pytest
        run: |
          pytest --junitxml=junit/test-results.xml --cov-report=xml -n auto tests --dist loadfile
@@ -112,11 +111,10 @@ jobs:
           which python
           pip list
           mamba list
-      - name: Install upstream dev dask-ml
+      - name: Install upstream dev Dask
        if: env.which_upstream == 'Dask'
        run: |
          mamba update dask
-          python -m pip install --no-deps git+https://github.com/dask/dask-ml
      - name: run a dask cluster
        run: |
          if [[ $which_upstream == "Dask" ]]; then
@@ -161,12 +159,11 @@ jobs:
           which python
           pip list
           mamba list
-      - name: Install upstream dev Dask / dask-ml
+      - name: Install upstream dev Dask
        if: env.which_upstream == 'Dask'
        run: |
          python -m pip install --no-deps git+https://github.com/dask/dask
          python -m pip install --no-deps git+https://github.com/dask/distributed
-          python -m pip install --no-deps git+https://github.com/dask/dask-ml
      - name: Try to import dask-sql
        run: |
          python -c "import dask_sql; print('ok')"
@@ -183,7 +180,7 @@ jobs:
     steps:
       - uses: actions/checkout@v3
       - name: Report failures
-        uses: actions/github-script@v3
+        uses: actions/github-script@v6
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 624ec0022..da437653f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -27,7 +27,7 @@ jobs:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 2
-      - uses: xarray-contrib/ci-trigger@v1.1
+      - uses: xarray-contrib/ci-trigger@v1.2
        id: detect-trigger
        with:
          keyword: "[test-upstream]"
@@ -64,11 +64,10 @@ jobs:
           mamba install -c conda-forge "sasl>=0.3.1"
           docker pull bde2020/hive:2.3.2-postgresql-metastore
           docker pull bde2020/hive-metastore-postgresql:2.3.0
-      - name: Optionally install upstream dev Dask / dask-ml
+      - name: Optionally install upstream dev Dask
        if: needs.detect-ci-trigger.outputs.triggered == 'true'
        run: |
          mamba update dask
-          python -m pip install --no-deps git+https://github.com/dask/dask-ml
      - name: Test with pytest
        run: |
          pytest --junitxml=junit/test-results.xml --cov-report=xml -n auto tests --dist loadfile
@@ -108,11 +107,10 @@ jobs:
           which python
           pip list
           mamba list
-      - name: Optionally install upstream dev dask-ml
+      - name: Optionally install upstream dev Dask
        if: needs.detect-ci-trigger.outputs.triggered == 'true'
        run: |
          mamba update dask
-          python -m pip install --no-deps git+https://github.com/dask/dask-ml
      - name: run a dask cluster
        env:
          UPSTREAM: ${{ needs.detect-ci-trigger.outputs.triggered }}
@@ -153,12 +151,11 @@ jobs:
           which python
           pip list
           mamba list
-      - name: Optionally install upstream dev Dask / dask-ml
+      - name: Optionally install upstream dev Dask
        if: needs.detect-ci-trigger.outputs.triggered == 'true'
        run: |
          python -m pip install --no-deps git+https://github.com/dask/dask
          python -m pip install --no-deps git+https://github.com/dask/distributed
-          python -m pip install --no-deps git+https://github.com/dask/dask-ml
      - name: Try to import dask-sql
        run: |
          python -c "import dask_sql; print('ok')"
diff --git a/.github/workflows/update-gpuci.yml b/.github/workflows/update-gpuci.yml
index 62634c987..b85dfdb13 100644
--- a/.github/workflows/update-gpuci.yml
+++ b/.github/workflows/update-gpuci.yml
@@ -14,7 +14,7 @@ jobs:
      - uses: actions/checkout@v3
 
      - name: Parse current axis YAML
-        uses: the-coding-turtle/ga-yaml-parser@v0.1.1
+        uses: the-coding-turtle/ga-yaml-parser@v0.1.2
        with:
          file: continuous_integration/gpuci/axis.yaml
 
@@ -71,7 +71,7 @@ jobs:
          regex: true
 
      - name: Create Pull Request
-        uses: peter-evans/create-pull-request@v3
+        uses: peter-evans/create-pull-request@v4
        # make sure ucx-py nightlies are available and that cuDF/cuML nightly versions match up
        if: |
          env.UCX_PY_VER != env.NEW_UCX_PY_VER &&
diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml
index e86d4e62f..d5ffa777a 100644
--- a/continuous_integration/environment-3.10-dev.yaml
+++ b/continuous_integration/environment-3.10-dev.yaml
@@ -3,7 +3,6 @@ channels:
 - conda-forge
 - nodefaults
 dependencies:
-- dask-ml>=2022.1.22
 - dask>=2022.3.0
 - fastapi>=0.69.0
 - fugue>=0.7.0
diff --git a/continuous_integration/environment-3.8-dev.yaml b/continuous_integration/environment-3.8-dev.yaml
index 33b6492db..93bab825c 100644
--- a/continuous_integration/environment-3.8-dev.yaml
+++ b/continuous_integration/environment-3.8-dev.yaml
@@ -3,7 +3,6 @@ channels:
 - conda-forge
 - nodefaults
 dependencies:
-- dask-ml=2022.1.22
 - dask=2022.3.0
 - fastapi=0.69.0
 - fugue=0.7.0
diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml
index 8a2a2bcb0..4c5c23511 100644
--- a/continuous_integration/environment-3.9-dev.yaml
+++ b/continuous_integration/environment-3.9-dev.yaml
@@ -3,7 +3,6 @@ channels:
 - conda-forge
 - nodefaults
 dependencies:
-- dask-ml>=2022.1.22
 - dask>=2022.3.0
 - fastapi>=0.69.0
 - fugue>=0.7.0
diff --git a/continuous_integration/gpuci/build.sh b/continuous_integration/gpuci/build.sh
index f8492f0c0..683735196 100644
--- a/continuous_integration/gpuci/build.sh
+++ b/continuous_integration/gpuci/build.sh
@@ -52,6 +52,9 @@ python -m pip install git+https://github.com/dask/dask
 gpuci_logger "Install distributed"
 python -m pip install git+https://github.com/dask/distributed
 
+gpuci_logger "Install latest dask-cuda"
+gpuci_mamba_retry update -y -c rapidsai-nightly dask-cuda
+
 gpuci_logger "Install dask-sql"
 pip install -e ".[dev]"
 
diff --git a/continuous_integration/gpuci/environment.yaml b/continuous_integration/gpuci/environment.yaml
index c839083e6..ba248a2ff 100644
--- a/continuous_integration/gpuci/environment.yaml
+++ b/continuous_integration/gpuci/environment.yaml
@@ -6,7 +6,6 @@ channels:
 - conda-forge
 - nodefaults
 dependencies:
-- dask-ml>=2022.1.22
 - dask>=2022.3.0
 - fastapi>=0.69.0
 - fugue>=0.7.0
diff --git a/dask_planner/Cargo.lock b/dask_planner/Cargo.lock
index ecce581d2..c48a9faf8 100644
--- a/dask_planner/Cargo.lock
+++ b/dask_planner/Cargo.lock
@@ -144,26 +144,15 @@ dependencies = [
 
 [[package]]
 name = "async-trait"
-version = "0.1.58"
+version = "0.1.59"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e805d94e6b5001b651426cf4cd446b1ab5f319d27bab5c644f61de0a804360c"
+checksum = "31e6e93155431f3931513b243d371981bb2770112b370c82745a1d19d2f99364"
 dependencies = [
  "proc-macro2",
  "quote",
  "syn",
 ]
 
-[[package]]
-name = "atty"
-version = "0.2.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
-dependencies = [
- "hermit-abi",
- "libc",
- "winapi",
-]
-
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -531,17 +520,38 @@ checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
 
 [[package]]
 name = "env_logger"
-version = "0.9.3"
+version = "0.10.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a12e6657c4c97ebab115a42dcee77225f7f482cdd841cf7088c657a42e9e00e7"
+checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0"
 dependencies = [
- "atty",
  "humantime",
+ "is-terminal",
  "log",
  "regex",
  "termcolor",
 ]
 
+[[package]]
+name = "errno"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
+dependencies = [
+ "errno-dragonfly",
+ "libc",
+ "winapi",
+]
+
+[[package]]
+name = "errno-dragonfly"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
+dependencies = [
+ "cc",
+ "libc",
+]
+
 [[package]]
 name = "flatbuffers"
 version = "22.9.29"
@@ -609,6 +619,15 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "hermit-abi" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +dependencies = [ + "libc", +] + [[package]] name = "humantime" version = "2.1.0" @@ -655,6 +674,28 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "adab1eaa3408fb7f0c777a73e7465fd5656136fc93b670eb6df3c88c2c1344e3" +[[package]] +name = "io-lifetimes" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7d367024b3f3414d8e01f437f704f41a9f64ab36f9067fa73e526ad4c763c87" +dependencies = [ + "libc", + "windows-sys", +] + +[[package]] +name = "is-terminal" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aae5bc6e2eb41c9def29a3e0f1306382807764b9b53112030eff57435667352d" +dependencies = [ + "hermit-abi 0.2.6", + "io-lifetimes", + "rustix", + "windows-sys", +] + [[package]] name = "itertools" version = "0.10.5" @@ -769,11 +810,12 @@ checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565" [[package]] name = "libmimalloc-sys" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c37567b180c1af25924b303ddf1ee4467653783440c62360beb2b322a4d93361" +checksum = "04d1c67deb83e6b75fa4fe3309e09cfeade12e7721d95322af500d3814ea60c9" dependencies = [ "cc", + "libc", ] [[package]] @@ -785,6 +827,12 @@ dependencies = [ "cc", ] +[[package]] +name = "linux-raw-sys" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f9f08d8963a6c613f4b1a78f4f4a4dbfadf8e6545b2d72861731e4858b8b47f" + [[package]] name = "lock_api" version = "0.4.9" @@ -830,9 +878,9 @@ dependencies = [ [[package]] name = "mimalloc" -version = "0.1.31" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b32d6a9ac92d0239d7bfa31137fb47634ac7272a3c11bcee91379ac100781670" +checksum = "9b2374e2999959a7b583e1811a1ddbf1d3a4b9496eceb9746f1192a59d871eca" dependencies = [ "libmimalloc-sys", ] @@ -940,7 +988,7 @@ version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6058e64324c71e02bc2b150e4f3bc8286db6c83092132ffa3f6b1eab0f9def5" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.19", "libc", ] @@ -1146,6 +1194,20 @@ version = "0.6.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" +[[package]] +name = "rustix" +version = "0.36.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b1fbb4dfc4eb1d390c02df47760bb19a84bb80b301ecc947ab5406394d8223e" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys", +] + [[package]] name = "rustversion" version = "1.0.9" @@ -1301,9 +1363,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.21.2" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9e03c497dc955702ba729190dc4aac6f2a0ce97f913e5b1b5912fc5039d9099" +checksum = "d76ce4a75fb488c605c54bf610f221cea8b0dafb53333c1a67e8ee199dcd2ae3" dependencies = [ "autocfg", "num_cpus", diff --git a/dask_planner/Cargo.toml b/dask_planner/Cargo.toml index 9ac4e8a44..4562294ed 100644 --- a/dask_planner/Cargo.toml +++ 
b/dask_planner/Cargo.toml @@ -10,18 +10,18 @@ rust-version = "1.62" [dependencies] arrow = { version = "26.0.0", features = ["prettyprint"] } -async-trait = "0.1.58" +async-trait = "0.1.59" datafusion-common = "14.0.0" datafusion-expr = "14.0.0" datafusion-optimizer = "14.0.0" datafusion-sql = "14.0.0" -env_logger = "0.9" +env_logger = "0.10" log = "^0.4" mimalloc = { version = "*", default-features = false } parking_lot = "0.12" pyo3 = { version = "0.17.3", features = ["extension-module", "abi3", "abi3-py38"] } rand = "0.8" -tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] } +tokio = { version = "1.22", features = ["macros", "rt", "rt-multi-thread", "sync", "fs", "parking_lot"] } uuid = { version = "1.2", features = ["v4"] } [lib] diff --git a/dask_sql/physical/rel/custom/predict.py b/dask_sql/physical/rel/custom/predict.py index eb5e4b69f..1d1f2fd10 100644 --- a/dask_sql/physical/rel/custom/predict.py +++ b/dask_sql/physical/rel/custom/predict.py @@ -2,6 +2,9 @@ import uuid from typing import TYPE_CHECKING +import dask.dataframe as dd +import pandas as pd + from dask_sql.datacontainer import ColumnContainer, DataContainer from dask_sql.physical.rel.base import BaseRelPlugin @@ -30,8 +33,7 @@ class PredictModelPlugin(BaseRelPlugin): Please note however, that it will need to act on Dask dataframes. If you are using a model not optimized for this, it might be that you run out of memory if your data is larger than the RAM of a single machine. - To prevent this, have a look into the dask-ml package, - especially the [ParallelPostFit](https://ml.dask.org/meta-estimators.html) + To prevent this, have a look into the dask_sql.physical.rel.custom.wrappers.ParallelPostFit meta-estimator. If you are using a model trained with `CREATE MODEL` and the `wrap_predict` flag, this is done automatically. 
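The docstring above now points users at the vendored wrapper instead of dask-ml. A minimal sketch of how that wrapper is meant to be used, assuming `dask_sql.physical.rel.custom.wrappers.ParallelPostFit` mirrors dask-ml's `ParallelPostFit` API (fit eagerly on an in-memory sample, predict lazily partition-by-partition); the estimator and data here are illustrative:

```python
import dask.dataframe as dd
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Assumption: the vendored wrapper mirrors dask-ml's ParallelPostFit.
from dask_sql.physical.rel.custom.wrappers import ParallelPostFit

# Fit on a small in-memory sample; only predict() needs to scale out.
train = pd.DataFrame({"x": [0.0, 1.0, 2.0, 3.0], "y": [0, 0, 1, 1]})
clf = ParallelPostFit(estimator=LogisticRegression())
clf.fit(train[["x"]], train["y"])

# predict() maps over partitions, so no single worker holds the full dataset.
big = dd.from_pandas(pd.DataFrame({"x": range(1000)}, dtype=float), npartitions=4)
print(clf.predict(big[["x"]]).compute()[:5])
```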
@@ -59,8 +61,21 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai
         model, training_columns = context.schema[schema_name].models[model_name]
 
         df = context.sql(sql_select)
-        prediction = model.predict(df[training_columns])
-        predicted_df = df.assign(target=prediction)
+        try:
+            prediction = model.predict(df[training_columns])
+            predicted_df = df.assign(target=prediction)
+        except TypeError:
+            df = df.set_index(df.columns[0], drop=False)
+            prediction = model.predict(df[training_columns])
+            # Convert numpy.ndarray to Dask Series
+            prediction = dd.from_pandas(
+                pd.Series(prediction, index=df.index),
+                npartitions=df.npartitions,
+            )
+            predicted_df = df.assign(target=prediction)
+            # Need to drop first column to reset index
+            # because the first column is equal to the index
+            predicted_df = predicted_df.drop(columns=[df.columns[0]]).reset_index()
 
         # Create a temporary context, which includes the
         # new "table" so that we can use the normal
diff --git a/docker/conda.txt b/docker/conda.txt
index 32a08c7a9..3d57e18dc 100644
--- a/docker/conda.txt
+++ b/docker/conda.txt
@@ -16,7 +16,6 @@ uvicorn>=0.13.4
 pyarrow>=6.0.1
 prompt_toolkit>=3.0.8
 pygments>=2.7.1
-dask-ml>=2022.1.22
 scikit-learn>=1.0.0
 intake>=0.6.0
 pre-commit>=2.11.1
diff --git a/docker/main.dockerfile b/docker/main.dockerfile
index 824f8ce27..8c908ce19 100644
--- a/docker/main.dockerfile
+++ b/docker/main.dockerfile
@@ -12,7 +12,7 @@ ENV PATH="/root/.cargo/bin:${PATH}"
 
 # Install conda dependencies for dask-sql
 COPY docker/conda.txt /opt/dask_sql/
-RUN mamba install --freeze-installed -y \
+RUN mamba install -y \
     # build requirements
     "setuptools-rust>=1.4.1" \
     # core dependencies
@@ -27,7 +27,6 @@ RUN mamba install --freeze-installed -y \
     nest-asyncio \
     # additional dependencies
     "pyarrow>=6.0.1" \
-    "dask-ml>=2022.1.22" \
     "scikit-learn>=1.0.0" \
     "intake>=0.6.0" \
     && conda clean -ay
diff --git a/docs/source/machine_learning.rst b/docs/source/machine_learning.rst
index fac0daacb..3dd301863 100644
--- a/docs/source/machine_learning.rst
+++ b/docs/source/machine_learning.rst
@@ -125,8 +125,7 @@ following sql statements
 
 Want to increase the performance of your model by tuning the
 parameters? Use the hyperparameter tuning directly
 in SQL using below SQL syntax, choose different tuners
-from the dask_ml package based on memory and compute constraints and
-for more details refer to the `dask ml documentation `_
+based on memory and compute constraints.
 
 .. TODO - add a GPU section to these examples once we have working CREATE EXPERIMENT tests for GPU
@@ -135,7 +134,7 @@ for more details refer to the `dask ml documentation `_
     CREATE EXPERIMENT my_exp WITH (
         model_class = 'sklearn.ensemble.GradientBoostingClassifier',
-        experiment_class = 'dask_ml.model_selection.GridSearchCV',
+        experiment_class = 'sklearn.model_selection.GridSearchCV',
         tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
                            max_depth = ARRAY [3,4,5,10]),
         target_column = 'species'
     ) AS (
         SELECT * FROM iris
     )
@@ -162,8 +161,7 @@ machine learning model.
 Please note however, that it will need to act on Dask dataframes. If you
 are using a model not optimized for this, it might be that you run out of memory if
 your data is larger than the RAM of a single machine.
-To prevent this, have a look into the dask-ml package,
-especially the `ParallelPostFit `_
+To prevent this, have a look into the `dask_sql.physical.rel.custom.wrappers.ParallelPostFit`
 meta-estimator. If you are using a model trained with ``CREATE MODEL``
 and the ``wrap_predict`` flag set to true, this is done automatically.
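The `convert` hunk above wraps `model.predict` in a try/except so estimators that return a plain `numpy.ndarray` (rather than a Dask collection) still yield an assignable column. A standalone, eagerly-computed sketch of that alignment trick; the dataframe and the fake prediction are illustrative, and `df.index.compute()` stands in for the plugin's lazy index handling:

```python
import dask.dataframe as dd
import numpy as np
import pandas as pd

df = dd.from_pandas(
    pd.DataFrame({"id": range(8), "x": np.arange(8.0)}), npartitions=2
)

# Key the rows on the first column so the ndarray can be re-aligned to them.
df = df.set_index(df.columns[0], drop=False)

prediction = np.arange(8) % 2  # stand-in for model.predict(...)

# Wrap the ndarray in a pandas Series carrying the same index, then lift it
# into a Dask Series with a matching partition count before assigning.
target = dd.from_pandas(
    pd.Series(prediction, index=df.index.compute()), npartitions=df.npartitions
)
predicted_df = df.assign(target=target)

# Drop the duplicated first column and restore a default index, as the plugin does.
predicted_df = predicted_df.drop(columns=[df.columns[0]]).reset_index()
print(predicted_df.compute())
```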
diff --git a/notebooks/Feature Overview.ipynb b/notebooks/Feature Overview.ipynb
index 28538ab64..ac23a9777 100644
--- a/notebooks/Feature Overview.ipynb
+++ b/notebooks/Feature Overview.ipynb
@@ -590,7 +590,7 @@
    "metadata": {},
    "source": [
     "- Tune single model with different Hyperparameters \n",
-    "    - install **dask_ml** for tunning\n",
+    "    - install **sklearn** for tuning\n",
     "- Tune multiple model with different Hyperparameters\n",
     "    - install **tpot** for Automl"
    ]
@@ -604,7 +604,7 @@
    "source": [
     "%%sql\n",
     "CREATE EXPERIMENT my_exp WITH (\n",
     "    model_class = 'sklearn.ensemble.GradientBoostingClassifier',\n",
-    "    experiment_class = 'dask_ml.model_selection.GridSearchCV',\n",
+    "    experiment_class = 'sklearn.model_selection.GridSearchCV',\n",
     "    tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],\n",
     "                       max_depth = ARRAY [3,4,5,10]),\n",
     "    target_column = 'species'\n",
diff --git a/setup.py b/setup.py
index c982e40a0..0f8520de9 100644
--- a/setup.py
+++ b/setup.py
@@ -59,7 +59,6 @@
         "mock>=4.0.3",
         "sphinx>=3.2.1",
         "pyarrow>=6.0.1",
-        "dask-ml>=2022.1.22",
         "scikit-learn>=1.0.0",
         "intake>=0.6.0",
         "pre-commit",
diff --git a/tests/integration/test_model.py b/tests/integration/test_model.py
index ad48e5b44..d1d89248f 100644
--- a/tests/integration/test_model.py
+++ b/tests/integration/test_model.py
@@ -18,8 +18,6 @@
     xgboost = None
     dask_cudf = None
 
-pytest.importorskip("dask_ml")
-
 
 def check_trained_model(c, model_name=None):
     if model_name is None:
@@ -157,7 +155,24 @@ def test_clustering_and_prediction(c, training_df):
     c.sql(
         """
         CREATE MODEL my_model WITH (
-            model_class = 'dask_ml.cluster.KMeans'
+            model_class = 'sklearn.cluster.KMeans'
+        ) AS (
+            SELECT x, y
+            FROM timeseries
+            LIMIT 100
+        )
+        """
+    )
+
+    check_trained_model(c)
+
+
+@pytest.mark.gpu
+def test_gpu_clustering_and_prediction(c, gpu_training_df, gpu_client):
+    c.sql(
+        """
+        CREATE MODEL my_model WITH (
+            model_class = 'cuml.dask.cluster.KMeans'
         ) AS (
             SELECT x, y
             FROM timeseries
             LIMIT 100
         )
         """
     )
@@ -244,7 +259,7 @@ def test_show_models(c, training_df):
     c.sql(
         """
         CREATE MODEL my_model2 WITH (
-            model_class = 'dask_ml.cluster.KMeans'
+            model_class = 'sklearn.cluster.KMeans'
         ) AS (
             SELECT x, y
             FROM timeseries
@@ -691,7 +706,7 @@ def test_ml_experiment(c, client, training_df):
     c.sql(
         """
         CREATE EXPERIMENT my_exp WITH (
-            experiment_class = 'dask_ml.model_selection.GridSearchCV',
+            experiment_class = 'sklearn.model_selection.GridSearchCV',
             tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
                                max_depth = ARRAY [3,4,5,10]),
             target_column = 'target'
@@ -731,7 +746,7 @@ def test_ml_experiment(c, client, training_df):
         """
         CREATE EXPERIMENT IF NOT EXISTS my_exp WITH (
             model_class = 'that.is.not.a.python.class',
-            experiment_class = 'dask_ml.model_selection.GridSearchCV',
+            experiment_class = 'sklearn.model_selection.GridSearchCV',
             tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
                                max_depth = ARRAY [3,4,5,10]),
             target_column = 'target'
@@ -794,7 +809,7 @@ def test_ml_experiment(c, client, training_df):
         """
         CREATE EXPERIMENT my_exp WITH (
             model_class = 'sklearn.ensemble.GradientBoostingClassifier',
-            experiment_class = 'dask_ml.model_selection.GridSearchCV',
+            experiment_class = 'sklearn.model_selection.GridSearchCV',
             tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
                                max_depth = ARRAY [3,4,5,10]),
             target_column = 'target'
@@ -816,7 +831,7 @@ def test_ml_experiment(c, client, training_df):
         """
         CREATE EXPERIMENT my_exp WITH (
             model_class = 'sklearn.ensemble.GradientBoostingClassifier',
-            experiment_class = 'dask_ml.model_selection.GridSearchCV',
+            experiment_class = 'sklearn.model_selection.GridSearchCV',
             tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
                                max_depth = ARRAY [3,4,5,10]),
             target_column = 'target'
@@ -831,7 +846,7 @@ def test_ml_experiment(c, client, training_df):
         """
         CREATE EXPERIMENT IF NOT EXISTS my_exp WITH (
             model_class = 'sklearn.ensemble.GradientBoostingClassifier',
-            experiment_class = 'dask_ml.model_selection.GridSearchCV',
+            experiment_class = 'sklearn.model_selection.GridSearchCV',
             tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
                                max_depth = ARRAY [3,4,5,10]),
             target_column = 'target'
@@ -847,7 +862,7 @@ def test_ml_experiment(c, client, training_df):
         """
         CREATE OR REPLACE EXPERIMENT my_exp WITH (
             model_class = 'sklearn.ensemble.GradientBoostingClassifier',
-            experiment_class = 'dask_ml.model_selection.GridSearchCV',
+            experiment_class = 'sklearn.model_selection.GridSearchCV',
             tune_parameters = (n_estimators = ARRAY [16, 32, 2],learning_rate = ARRAY [0.1,0.01,0.001],
                                max_depth = ARRAY [3,4,5,10]),
             target_column = 'target'
@@ -867,8 +882,8 @@ def test_ml_experiment(c, client, training_df):
     c.sql(
         """
         CREATE EXPERIMENT my_exp1 WITH (
-            model_class = 'dask_ml.cluster.KMeans',
-            experiment_class = 'dask_ml.model_selection.RandomizedSearchCV',
+            model_class = 'sklearn.cluster.KMeans',
+            experiment_class = 'sklearn.model_selection.RandomizedSearchCV',
             tune_parameters = (n_clusters = ARRAY [3,4,16],tol = ARRAY [0.1,0.01,0.001],
                                max_iter = ARRAY [3,4,5,10])
         ) AS (
@@ -889,7 +904,7 @@ def test_experiment_automl_classifier(c, client, training_df):
         """
         CREATE EXPERIMENT my_automl_exp1 WITH (
             automl_class = 'tpot.TPOTClassifier',
-            automl_kwargs = (population_size = 2 ,generations=2,cv=2,n_jobs=-1,use_dask=True),
+            automl_kwargs = (population_size=2, generations=2, cv=2, n_jobs=-1),
             target_column = 'target'
         ) AS (
             SELECT x, y, x*y > 0 AS target
@@ -914,11 +929,10 @@ def test_experiment_automl_regressor(c, client, training_df):
         """
         CREATE EXPERIMENT my_automl_exp2 WITH (
             automl_class = 'tpot.TPOTRegressor',
-            automl_kwargs = (population_size = 2,
+            automl_kwargs = (population_size=2,
             generations=2,
             cv=2,
             n_jobs=-1,
-            use_dask=True,
             max_eval_time_mins=1),
 
             target_column = 'target'
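To exercise the updated code path locally, the same statement the rewritten test uses can be run through a `Context`; this sketch assumes the `timeseries` demo table from `dask.datasets` and dask-sql's documented `CREATE MODEL` / `PREDICT` syntax:

```python
import dask.datasets
from dask_sql import Context

c = Context()
c.create_table("timeseries", dask.datasets.timeseries())

# sklearn's KMeans replaces dask_ml.cluster.KMeans, as in the updated test.
c.sql(
    """
    CREATE MODEL my_model WITH (
        model_class = 'sklearn.cluster.KMeans'
    ) AS (
        SELECT x, y
        FROM timeseries
        LIMIT 100
    )
    """
)

# Mirrors check_trained_model(): predict with the stored model.
result = c.sql(
    "SELECT * FROM PREDICT(MODEL my_model, SELECT x, y FROM timeseries LIMIT 100)"
)
print(result.compute().head())
```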