Remove mimesis as a testing dependency (#14723)

It looks like mimesis is only used to generate random ages/cities/countries. Since cudf should work generically with arbitrary ints/strings, I don't think it's critical that the test data has real-world meaning.

Authors:
- Matthew Roeschke (https://github.com/mroeschke)

Approvers:
- Bradley Dice (https://github.com/bdice)
- AJ Schmidt (https://github.com/ajschmidt8)
- Ashwin Srinath (https://github.com/shwina)

URL: #14723
rapidsai · Jan 8, 2024 · 79d5070 · 79d5070
1 parent fa8db7a
commit 79d5070
Show file tree

Hide file tree

Showing 6 changed files with 20 additions and 23 deletions.
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -51,7 +51,6 @@ dependencies:
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.2.*
 - make
-- mimesis>=4.1.0
 - moto>=4.0.8
 - msgpack-python
 - myst-nb

diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -50,7 +50,6 @@ dependencies:
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.2.*
 - make
-- mimesis>=4.1.0
 - moto>=4.0.8
 - msgpack-python
 - myst-nb

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -603,7 +603,6 @@ dependencies:
           - cramjam
           - fastavro>=0.22.9
           - hypothesis
-          - mimesis>=4.1.0
           - pytest-benchmark
           - pytest-cases
           - python-snappy>=0.6.0

diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 # This module is for generating "synthetic" datasets. It was originally
 # designed for testing filtered reading. Generally, it should be useful
@@ -11,11 +11,9 @@
 import uuid
 from multiprocessing import Pool
 
-import mimesis
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from mimesis import Generic
 from pyarrow import parquet as pq
 
 import cudf
@@ -35,8 +33,7 @@ class ColumnParameters:
     null_frequency : 0.1
         Probability of a generated value being null
     generator : Callable
-        Function for generating random data. It is passed a Mimesis Generic
-        provider and returns an Iterable that generates data.
+        Function for generating random data.
     is_sorted : bool
         Sort this column. Columns are sorted in same order as ColumnParameters
         instances stored in column_params of Parameters. If there are one or
@@ -51,7 +48,10 @@ def __init__(
         self,
         cardinality=100,
         null_frequency=0.1,
-        generator=lambda g: [g.address.country for _ in range(100)],
+        generator=lambda: [
+            _generate_string(string.ascii_letters, random.randint(4, 8))
+            for _ in range(100)
+        ],
         is_sorted=True,
         dtype=None,
     ):
@@ -235,15 +235,9 @@ def get_dataframe(parameters, use_threads):
     if parameters.seed is not None:
         np.random.seed(parameters.seed)
 
-    # For each column, use a generic Mimesis producer to create an Iterable
-    # for generating data
-    for i, column_params in enumerate(parameters.column_parameters):
-        if column_params.dtype is None:
-            column_params.generator = column_params.generator(
-                Generic("en", seed=parameters.seed)
-            )
-        else:
-            column_params.generator = column_params.generator()
+    # For each column, invoke the data generator
+    for column_params in parameters.column_parameters:
+        column_params.generator = column_params.generator()
 
     # Get schema for each column
     table_fields = []
@@ -343,7 +337,6 @@ def rand_dataframe(
     # Apply seed
     random.seed(seed)
     np.random.seed(seed)
-    mimesis.random.random.seed(seed)
 
     column_params = []
     for meta in dtypes_meta:

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
@@ -1,11 +1,12 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import datetime
 import glob
 import math
 import os
 import pathlib
 import random
+import string
 from contextlib import contextmanager
 from io import BytesIO
 from string import ascii_letters
@@ -432,13 +433,20 @@ def test_parquet_read_filtered(tmpdir, rdg_seed):
                 dg.ColumnParameters(
                     cardinality=40,
                     null_frequency=0.05,
-                    generator=lambda g: [g.address.city() for _ in range(40)],
+                    generator=lambda: [
+                        "".join(
+                            random.sample(
+                                string.ascii_letters, random.randint(4, 8)
+                            )
+                        )
+                        for _ in range(40)
+                    ],
                     is_sorted=False,
                 ),
                 dg.ColumnParameters(
                     40,
                     0.2,
-                    lambda g: [g.person.age() for _ in range(40)],
+                    lambda: np.random.default_rng().integers(0, 100, size=40),
                     True,
                 ),
             ],

diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
@@ -56,7 +56,6 @@ test = [
     "cramjam",
     "fastavro>=0.22.9",
     "hypothesis",
-    "mimesis>=4.1.0",
     "msgpack",
     "pytest",
     "pytest-benchmark",