diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 2c723146f35..21b540e24ab 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -51,7 +51,6 @@ dependencies: - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.2.* - make -- mimesis>=4.1.0 - moto>=4.0.8 - msgpack-python - myst-nb diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 52ef95c335a..c109dcca625 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -50,7 +50,6 @@ dependencies: - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.2.* - make -- mimesis>=4.1.0 - moto>=4.0.8 - msgpack-python - myst-nb diff --git a/dependencies.yaml b/dependencies.yaml index 91ac8371308..30c21b42b21 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -603,7 +603,6 @@ dependencies: - cramjam - fastavro>=0.22.9 - hypothesis - - mimesis>=4.1.0 - pytest-benchmark - pytest-cases - python-snappy>=0.6.0 diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 1ba205275f3..13c194d6be0 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # This module is for generating "synthetic" datasets. It was originally # designed for testing filtered reading. Generally, it should be useful @@ -11,11 +11,9 @@ import uuid from multiprocessing import Pool -import mimesis import numpy as np import pandas as pd import pyarrow as pa -from mimesis import Generic from pyarrow import parquet as pq import cudf @@ -35,8 +33,7 @@ class ColumnParameters: null_frequency : 0.1 Probability of a generated value being null generator : Callable - Function for generating random data. It is passed a Mimesis Generic - provider and returns an Iterable that generates data. + Function for generating random data. is_sorted : bool Sort this column. Columns are sorted in same order as ColumnParameters instances stored in column_params of Parameters. If there are one or @@ -51,7 +48,10 @@ def __init__( self, cardinality=100, null_frequency=0.1, - generator=lambda g: [g.address.country for _ in range(100)], + generator=lambda: [ + _generate_string(string.ascii_letters, random.randint(4, 8)) + for _ in range(100) + ], is_sorted=True, dtype=None, ): @@ -235,15 +235,9 @@ def get_dataframe(parameters, use_threads): if parameters.seed is not None: np.random.seed(parameters.seed) - # For each column, use a generic Mimesis producer to create an Iterable - # for generating data - for i, column_params in enumerate(parameters.column_parameters): - if column_params.dtype is None: - column_params.generator = column_params.generator( - Generic("en", seed=parameters.seed) - ) - else: - column_params.generator = column_params.generator() + # For each column, invoke the data generator + for column_params in parameters.column_parameters: + column_params.generator = column_params.generator() # Get schema for each column table_fields = [] @@ -343,7 +337,6 @@ def rand_dataframe( # Apply seed random.seed(seed) np.random.seed(seed) - mimesis.random.random.seed(seed) column_params = [] for meta in dtypes_meta: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 5c9e3aa3d9f..007349ab551 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import datetime import glob @@ -6,6 +6,7 @@ import os import pathlib import random +import string from contextlib import contextmanager from io import BytesIO from string import ascii_letters @@ -432,13 +433,20 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): dg.ColumnParameters( cardinality=40, null_frequency=0.05, - generator=lambda g: [g.address.city() for _ in range(40)], + generator=lambda: [ + "".join( + random.sample( + string.ascii_letters, random.randint(4, 8) + ) + ) + for _ in range(40) + ], is_sorted=False, ), dg.ColumnParameters( 40, 0.2, - lambda g: [g.person.age() for _ in range(40)], + lambda: np.random.default_rng().integers(0, 100, size=40), True, ), ], diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 17c8ba02d3a..7c3f4a97a5e 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -56,7 +56,6 @@ test = [ "cramjam", "fastavro>=0.22.9", "hypothesis", - "mimesis>=4.1.0", "msgpack", "pytest", "pytest-benchmark",