Merge branch 'master' into 3714-pdarray.shape-should-be-a-tuple

Bears-R-Us · Oct 1, 2024 · a191247 · a191247
2 parents 7e41aac + b90155f
commit a191247
Show file tree

Hide file tree

Showing 5 changed files with 143 additions and 42 deletions.
diff --git a/Makefile b/Makefile
@@ -568,9 +568,11 @@ CLEAN_TARGETS += test-clean
 test-clean:
 	$(RM) $(TEST_TARGETS) $(addsuffix _real,$(TEST_TARGETS))
 
+# Problem size forwarded to pytest as --size (conftest.py eval()s it); override with: make benchmark size='10**6'
+size = 10**8
 .PHONY: benchmark
 benchmark:
-	python3 -m pytest -c benchmark.ini --benchmark-autosave --benchmark-storage=file://benchmark_v2/.benchmarks
+	python3 -m pytest -c benchmark.ini --benchmark-autosave --benchmark-storage=file://benchmark_v2/.benchmarks --size='$(size)'
 
 version:
 	@echo $(VERSION);

diff --git a/benchmark_v2/array_create_benchmark.py b/benchmark_v2/array_create_benchmark.py
@@ -1,6 +1,6 @@
 import numpy as np
 import pytest
-from context import arkouda as ak
+import arkouda as ak
 
 OPS = ("zeros", "ones", "randint")
 TYPES = ("int64", "float64", "uint64")

diff --git a/benchmark_v2/conftest.py b/benchmark_v2/conftest.py
@@ -16,6 +16,119 @@
 default_compression = [None, "snappy", "gzip", "brotli", "zstd", "lz4"]
 
 
+def pytest_addoption(parser):
+    """Register the benchmark command-line options; values are read back in pytest_configure."""
+    parser.addoption(
+        "--optional-parquet", action="store_true", default=False, help="run optional parquet tests"
+    )
+    parser.addoption(
+        "--size",
+        action="store",
+        default="10**8",
+        help="Benchmark only option. Problem size: length of array to use for benchmarks.",
+    )
+    parser.addoption(
+        "--trials",
+        action="store",
+        default="5",
+        help="Benchmark only option. Number of trials to run for each benchmark. For tests that run "
+        "as many trials as possible in a given time, will be treated as number of seconds to run for.",
+    )
+    parser.addoption(
+        "--seed",
+        action="store",
+        default="",
+        help="Benchmark only option. Value to initialize random number generator.",
+    )
+    parser.addoption(
+        "--dtype",
+        action="store",
+        default="",
+        help="Benchmark only option. Dtypes to run benchmarks against. Comma separated list "
+        "(NO SPACES) allowing for multiple. Accepted values: int64, uint64, bigint, float64, bool, str and mixed."
+        " Mixed is used to generate sets of multiple types.",
+    )
+    parser.addoption(
+        "--numpy",
+        action="store_true",
+        default=False,
+        help="Benchmark only option. When set, runs numpy comparison benchmarks.",
+    )
+    parser.addoption(
+        "--maxbits",
+        action="store",
+        default="-1",
+        help="Benchmark only option. Only applies to bigint testing. "
+        "Maximum number of bits, so values > 2**max_bits will wraparound. -1 is interpreted as no maximum.",
+    )
+    parser.addoption(
+        "--alpha", action="store", default="1.0", help="Benchmark only option. Scalar multiple"
+    )
+    parser.addoption(
+        "--randomize",
+        action="store_true",
+        default=False,
+        help="Benchmark only option. Fill arrays with random values instead of ones",
+    )
+    parser.addoption(
+        "--index_size",
+        action="store",
+        default="",
+        help="Benchmark only option. Length of index array (number of gathers to perform)",
+    )
+    parser.addoption(
+        "--value_size",
+        action="store",
+        default="",
+        help="Benchmark only option. Length of array from which values are gathered",
+    )
+    parser.addoption(
+        "--encoding",
+        action="store",
+        default="",
+        help="Benchmark only option. Only applies to encoding benchmarks. "
+        "Comma separated list (NO SPACES) allowing for multiple "
+        "Encoding to be used. Accepted values: idna, ascii",
+    )
+    parser.addoption(
+        "--io_only_write",
+        action="store_true",
+        default=False,
+        help="Benchmark only option. Only write the files; files will not be removed",
+    )
+    parser.addoption(
+        "--io_only_read",
+        action="store_true",
+        default=False,
+        help="Benchmark only option. Only read the files; files will not be removed",
+    )
+    parser.addoption(
+        "--io_only_delete",
+        action="store_true",
+        default=False,
+        help="Benchmark only option. Only delete files created from writing with this benchmark",
+    )
+    parser.addoption(
+        "--io_files_per_loc",
+        action="store",
+        default="1",
+        help="Benchmark only option. Number of files to create per locale",
+    )
+    parser.addoption(
+        "--io_compression",
+        action="store",
+        default="",
+        help="Benchmark only option. Compression types to run IO benchmarks against. Comma delimited list "
+        "(NO SPACES) allowing for multiple. Accepted values: none, snappy, gzip, brotli, zstd, and lz4",
+    )
+    parser.addoption(
+        "--io_path",
+        action="store",
+        default=os.path.join(os.getcwd(), "ak_io_benchmark"),
+        help="Benchmark only option. Target path for measuring read/write rates",
+    )
+
+
 def pytest_configure(config):
     pytest.prob_size = eval(config.getoption("size"))
     pytest.trials = eval(config.getoption("trials"))
@@ -28,8 +141,12 @@ def pytest_configure(config):
     pytest.numpy = config.getoption("numpy")
     encode_str = config.getoption("encoding")
     pytest.encoding = default_encoding if encode_str == "" else encode_str.split(",")
-    pytest.idx_size = None if config.getoption("index_size") == "" else eval(config.getoption("index_size"))
-    pytest.val_size = None if config.getoption("value_size") == "" else eval(config.getoption("value_size"))
+    pytest.idx_size = (
+        None if config.getoption("index_size") == "" else eval(config.getoption("index_size"))
+    )
+    pytest.val_size = (
+        None if config.getoption("value_size") == "" else eval(config.getoption("value_size"))
+    )
 
     # IO settings
     comp_str = config.getoption("io_compression")
@@ -64,11 +181,7 @@ def startup_teardown():
                 e,
             )
     else:
-        print(
-            "in client stack test mode with host: {} port: {}".format(
-                server, port
-            )
-        )
+        print("in client stack test mode with host: {} port: {}".format(server, port))
 
     yield
 
@@ -85,9 +198,7 @@ def manage_connection():
     server = os.getenv("ARKOUDA_SERVER_HOST", "localhost")
     timeout = int(os.getenv("ARKOUDA_CLIENT_TIMEOUT", 5))
     try:
-        ak.connect(
-            server=server, port=port, timeout=timeout
-        )
+        ak.connect(server=server, port=port, timeout=timeout)
     except Exception as e:
         raise ConnectionError(e)
 
@@ -96,4 +207,4 @@ def manage_connection():
     try:
         ak.disconnect()
     except Exception as e:
-        raise ConnectionError(e)
+        raise ConnectionError(e)
diff --git a/benchmark_v2/sort_cases_benchmark.py b/benchmark_v2/sort_cases_benchmark.py
@@ -4,7 +4,6 @@
 from arkouda.sorting import SortingAlgorithm
 
 TYPES = ("int64", "float64")
-POWERLAW_DATA = None
 
 
 def get_nbytes(data):
@@ -63,15 +62,11 @@ def bench_random_uniform(benchmark, algo, dtype, bits):
 
 
 def _generate_power_law_data():
-    global POWERLAW_DATA
+    """Return a fresh power-law array built from ``ak.uniform(pytest.prob_size)`` on every call."""
+    exponent, upper = -2.5, 2**32  # power law exponent (between -2 and -3); upper bound
+    samples = ak.uniform(pytest.prob_size)
 
-    if POWERLAW_DATA is None:
-        y = ak.uniform(pytest.prob_size)
-        a = -2.5  # power law exponent, between -2 and -3
-        ub = 2 ** 32  # upper bound
-        POWERLAW_DATA = ((ub ** (a + 1) - 1) * y + 1) ** (1 / (a + 1))
-
-    return POWERLAW_DATA
+    return ((upper ** (exponent + 1) - 1) * samples + 1) ** (1 / (exponent + 1))
 
 
 @pytest.mark.benchmark(group="AK_Sort_Cases")

diff --git a/benchmark_v2/str_locality_benchmark.py b/benchmark_v2/str_locality_benchmark.py
@@ -5,45 +5,36 @@
     "Hashing": lambda x: x.hash(),
     "Regex_Search": lambda x: x.contains(r"\d{3,5}\.\d{5,8}", regex=True),
     "Casting": lambda x: ak.cast(x, ak.float64),
-    "Scalar_Compare": lambda x: (x == "5.5")
+    "Scalar_Compare": lambda x: (x == "5.5"),
 }
 
 # Good - generates random Strings object with "good" locality
 # poor - generates a sorted Strings object with "poor" locality
 LOCALITY = {"Good", "Poor"}
 
 
-RAND_DATA = None
-SORT_DATA = None
-
-
 def _generate_data(loc):
     """
-    Generate the test data. In an interest to leverage the same data for the benchmark
-    The data is all created at once.
+    Build the Strings for ``loc``: the random ones for "Good", otherwise the same
+    strings ordered by length. Both are regenerated on every call (nothing is cached).
     """
-    global RAND_DATA, SORT_DATA
-
-    # early out if already set
-    if loc == "Good" and RAND_DATA is not None:
-        return RAND_DATA
-    if loc == "Poor" and SORT_DATA is not None:
-        return SORT_DATA
 
-    # otherwise set both and return the desired one.
+    # build both variants and return the desired one.
     N = pytest.prob_size * ak.get_config()["numLocales"]
-    prefix = ak.random_strings_uniform(minlen=1, maxlen=16, size=N, seed=pytest.seed, characters="numeric")
+    prefix = ak.random_strings_uniform(
+        minlen=1, maxlen=16, size=N, seed=pytest.seed, characters="numeric"
+    )
     if pytest.seed is not None:
         pytest.seed += 1
-    suffix = ak.random_strings_uniform(minlen=1, maxlen=16, size=N, seed=pytest.seed, characters="numeric")
+    suffix = ak.random_strings_uniform(
+        minlen=1, maxlen=16, size=N, seed=pytest.seed, characters="numeric"
+    )
     random_strings = prefix.stick(suffix, delimiter=".")
-    RAND_DATA = random_strings
 
     perm = ak.argsort(random_strings.get_lengths())
     sorted_strings = random_strings[perm]
-    SORT_DATA = sorted_strings
 
-    return RAND_DATA if loc == "Good" else SORT_DATA
+    return random_strings if loc == "Good" else sorted_strings
 
 
 @pytest.mark.benchmark(group="String_Locality")
@@ -53,8 +44,11 @@ def bench_str_locality(benchmark, op, loc):
     data = _generate_data(loc)
     benchmark.pedantic(OPS[op], args=[data], rounds=pytest.trials)
 
-    benchmark.extra_info["description"] = "Measure the performance of various string operations on " \
-                                          "strings with good locality (random) and poor locality (sorted)."
+    benchmark.extra_info["description"] = (
+        "Measure the performance of various string operations on "
+        "strings with good locality (random) and poor locality (sorted)."
+    )
     benchmark.extra_info["problem_size"] = pytest.prob_size
     benchmark.extra_info["transfer_rate"] = "{:.4f} GiB/sec".format(
-        (data.nbytes / benchmark.stats["mean"]) / 2 ** 30)
+        (data.nbytes / benchmark.stats["mean"]) / 2**30
+    )