diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 5a77c6749fe..edcc140b191 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -116,7 +116,7 @@ jobs: build_type: pull-request script: ci/test_wheel_cudf.sh wheel-build-dask-cudf: - needs: wheel-tests-cudf + needs: wheel-build-cudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.02 with: diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index ae1d9c3fb1a..9c674518810 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. set -euo pipefail @@ -23,7 +23,7 @@ pyproject_file="${package_dir}/pyproject.toml" sed -i "s/^name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file} echo "${version}" > VERSION -sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name}/_version.py" +sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name//-/_}/_version.py" # For nightlies we want to ensure that we're pulling in alphas as well. The # easiest way to do so is to augment the spec with a constraint containing a @@ -34,7 +34,7 @@ if ! rapids-is-release-build; then alpha_spec=',>=0.0.0a0' fi -if [[ ${package_name} == "dask_cudf" ]]; then +if [[ ${package_name} == "dask-cudf" ]]; then sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file} sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file} diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh index 47e35c46004..b09c1e51271 100755 --- a/ci/build_wheel_dask_cudf.sh +++ b/ci/build_wheel_dask_cudf.sh @@ -1,11 +1,11 @@ #!/bin/bash -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. set -euo pipefail package_dir="python/dask_cudf" -./ci/build_wheel.sh dask_cudf ${package_dir} +./ci/build_wheel.sh dask-cudf ${package_dir} RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/dist diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 2c723146f35..21b540e24ab 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -51,7 +51,6 @@ dependencies: - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.2.* - make -- mimesis>=4.1.0 - moto>=4.0.8 - msgpack-python - myst-nb diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 52ef95c335a..c109dcca625 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -50,7 +50,6 @@ dependencies: - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.2.* - make -- mimesis>=4.1.0 - moto>=4.0.8 - msgpack-python - myst-nb diff --git a/dependencies.yaml b/dependencies.yaml index bc51372623a..94f31240797 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -603,7 +603,6 @@ dependencies: - cramjam - fastavro>=0.22.9 - hypothesis - - mimesis>=4.1.0 - pytest-benchmark - pytest-cases - python-snappy>=0.6.0 diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 1ba205275f3..13c194d6be0 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # This module is for generating "synthetic" datasets. It was originally # designed for testing filtered reading. Generally, it should be useful @@ -11,11 +11,9 @@ import uuid from multiprocessing import Pool -import mimesis import numpy as np import pandas as pd import pyarrow as pa -from mimesis import Generic from pyarrow import parquet as pq import cudf @@ -35,8 +33,7 @@ class ColumnParameters: null_frequency : 0.1 Probability of a generated value being null generator : Callable - Function for generating random data. It is passed a Mimesis Generic - provider and returns an Iterable that generates data. + Function for generating random data. is_sorted : bool Sort this column. Columns are sorted in same order as ColumnParameters instances stored in column_params of Parameters. If there are one or @@ -51,7 +48,10 @@ def __init__( self, cardinality=100, null_frequency=0.1, - generator=lambda g: [g.address.country for _ in range(100)], + generator=lambda: [ + _generate_string(string.ascii_letters, random.randint(4, 8)) + for _ in range(100) + ], is_sorted=True, dtype=None, ): @@ -235,15 +235,9 @@ def get_dataframe(parameters, use_threads): if parameters.seed is not None: np.random.seed(parameters.seed) - # For each column, use a generic Mimesis producer to create an Iterable - # for generating data - for i, column_params in enumerate(parameters.column_parameters): - if column_params.dtype is None: - column_params.generator = column_params.generator( - Generic("en", seed=parameters.seed) - ) - else: - column_params.generator = column_params.generator() + # For each column, invoke the data generator + for column_params in parameters.column_parameters: + column_params.generator = column_params.generator() # Get schema for each column table_fields = [] @@ -343,7 +337,6 @@ def rand_dataframe( # Apply seed random.seed(seed) np.random.seed(seed) - mimesis.random.random.seed(seed) column_params = [] for meta in dtypes_meta: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 5c9e3aa3d9f..007349ab551 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import datetime import glob @@ -6,6 +6,7 @@ import os import pathlib import random +import string from contextlib import contextmanager from io import BytesIO from string import ascii_letters @@ -432,13 +433,20 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): dg.ColumnParameters( cardinality=40, null_frequency=0.05, - generator=lambda g: [g.address.city() for _ in range(40)], + generator=lambda: [ + "".join( + random.sample( + string.ascii_letters, random.randint(4, 8) + ) + ) + for _ in range(40) + ], is_sorted=False, ), dg.ColumnParameters( 40, 0.2, - lambda g: [g.person.age() for _ in range(40)], + lambda: np.random.default_rng().integers(0, 100, size=40), True, ), ], diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 17c8ba02d3a..7c3f4a97a5e 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -56,7 +56,6 @@ test = [ "cramjam", "fastavro>=0.22.9", "hypothesis", - "mimesis>=4.1.0", "msgpack", "pytest", "pytest-benchmark", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index e5237d206d4..33065da6e8d 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -8,7 +8,7 @@ requires = [ ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project] -name = "dask_cudf" +name = "dask-cudf" dynamic = ["version"] description = "Utilities for Dask and cuDF interactions" readme = { file = "README.md", content-type = "text/markdown" }