From 392d09b4717f954eb3eb203c887a4c959692f02e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Mon, 1 May 2023 15:42:40 -0400 Subject: [PATCH] Roll our own generate_string() because mimesis' has gone away (#13257) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starting with `mimesis==9.0.0`, the `generate_string` function has become private: ```python In [1]: import mimesis In [2]: mimesis.__version__ Out[2]: '9.0.0' In [3]: mimesis.random.random.generate_string --------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[3], line 1 ----> 1 mimesis.random.random.generate_string AttributeError: 'Random' object has no attribute 'generate_string' In [4]: mimesis.random.random._generate_string Out[4]: <bound method Random._generate_string of <mimesis.random.Random object at 0x...>> ``` This PR replaces all uses of the function with a homespun one. Note that the implementation is about as fast (perhaps identical?): ```python In [6]: %timeit "".join(random.choices(string.printable, k=100)) 9.25 µs ± 98.3 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each) In [7]: %timeit mimesis.random.random._generate_string(string.printable, 100) 9.62 µs ± 103 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each) ``` Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13257 --- python/cudf/cudf/testing/dataset_generator.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 2867c4d10eb..1ba205275f3 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. # This module is for generating "synthetic" datasets. 
It was originally # designed for testing filtered reading. Generally, it should be useful @@ -8,6 +8,7 @@ import copy import random import string +import uuid from multiprocessing import Pool import mimesis @@ -457,8 +458,7 @@ def rand_dataframe( cardinality=cardinality, null_frequency=null_frequency, generator=lambda cardinality=cardinality: [ - mimesis.random.random.randstr(unique=True, length=2000) - for _ in range(cardinality) + _unique_string() for _ in range(cardinality) ], is_sorted=False, dtype="category", @@ -502,7 +502,7 @@ def rand_dataframe( cardinality=cardinality, null_frequency=null_frequency, generator=lambda cardinality=cardinality: [ - mimesis.random.random.generate_string( + _generate_string( string.printable, np.random.randint( low=0, @@ -684,7 +684,7 @@ def get_values_for_nested_data(dtype, lists_max_length=None, size=None): values = float_generator(dtype=dtype, size=cardinality)() elif dtype.kind in ("U", "O"): values = [ - mimesis.random.random.generate_string( + _generate_string( string.printable, 100, ) @@ -847,3 +847,11 @@ def create_nested_struct_type(max_types_at_each_level, nesting_level): else: type_dict[str(name)] = cudf.dtype(type_) return cudf.StructDtype(type_dict) + + +def _generate_string(str_seq: str, length: int = 10) -> str: + return "".join(random.choices(str_seq, k=length)) + + +def _unique_string() -> str: + return str(uuid.uuid4()).replace("-", "")