Skip to content

Commit

Permalink
Closes Bears-R-Us#3827: rename flatten to split
Browse files Browse the repository at this point in the history
  • Loading branch information
ajpotts committed Oct 9, 2024
1 parent 7c0599a commit 57852b6
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 45 deletions.
6 changes: 3 additions & 3 deletions arkouda/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -1224,7 +1224,7 @@ def fullmatch(self, pattern: Union[bytes, str_scalars]) -> Match:
return self._get_matcher(pattern).get_match(MatchType.FULLMATCH, self)

@typechecked()
def split(
def regex_split(
self, pattern: Union[bytes, str_scalars], maxsplit: int = 0, return_segments: bool = False
) -> Union[Strings, Tuple]:
"""
Expand Down Expand Up @@ -1560,7 +1560,7 @@ def endswith(self, substr: Union[bytes, str_scalars], regex: bool = False) -> pd
self._empty_pattern_verification(substr)
return self.contains(substr + "$", regex=True)

def flatten(
def split(
self, delimiter: str, return_segments: bool = False, regex: bool = False
) -> Union[Strings, Tuple]:
"""Unpack delimiter-joined substrings into a flat array.
Expand Down Expand Up @@ -1609,7 +1609,7 @@ def flatten(
re.compile(delimiter)
except Exception as e:
raise ValueError(e)
return self.split(delimiter, return_segments=return_segments)
return self.regex_split(delimiter, return_segments=return_segments)
else:
cmd = "segmentedFlatten"
repMsg = cast(
Expand Down
2 changes: 1 addition & 1 deletion benchmark.ini
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ testpaths =
benchmark_v2/array_create_benchmark.py
benchmark_v2/groupby_benchmark.py
benchmark_v2/coargsort_benchmark.py
benchmark_v2/flatten_benchmark.py
benchmark_v2/split_benchmark.py
benchmark_v2/encoding_benchmark.py
benchmark_v2/reduce_benchmark.py
benchmark_v2/gather_benchmark.py
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,35 +14,35 @@ def _generate_test_data():


@pytest.mark.benchmark(group="AK_Flatten")
def bench_flatten_nonregex(benchmark):
def bench_split_nonregex(benchmark):
thickrange, nbytes = _generate_test_data()

benchmark.pedantic(thickrange.flatten, args=["_"], rounds=pytest.trials)
benchmark.extra_info["description"] = "Measures the performance of Strings.flatten"
benchmark.pedantic(thickrange.split, args=["_"], rounds=pytest.trials)
benchmark.extra_info["description"] = "Measures the performance of Strings.split"
benchmark.extra_info["problem_size"] = pytest.prob_size
benchmark.extra_info["transfer_rate"] = "{:.4f} GiB/sec".format(
(nbytes / benchmark.stats["mean"]) / 2**30
)


@pytest.mark.benchmark(group="AK_Flatten")
def bench_flatten_regexliteral(benchmark):
def bench_split_regexliteral(benchmark):
thickrange, nbytes = _generate_test_data()

benchmark.pedantic(thickrange.flatten, args=["_"], kwargs={"regex": True}, rounds=pytest.trials)
benchmark.extra_info["description"] = "Measures the performance of Strings.flatten"
benchmark.pedantic(thickrange.split, args=["_"], kwargs={"regex": True}, rounds=pytest.trials)
benchmark.extra_info["description"] = "Measures the performance of Strings.split"
benchmark.extra_info["problem_size"] = pytest.prob_size
benchmark.extra_info["transfer_rate"] = "{:.4f} GiB/sec".format(
(nbytes / benchmark.stats["mean"]) / 2**30
)


@pytest.mark.benchmark(group="AK_Flatten")
def bench_flatten_regexpattern(benchmark):
def bench_split_regexpattern(benchmark):
thickrange, nbytes = _generate_test_data()

benchmark.pedantic(thickrange.flatten, args=["_+"], kwargs={"regex": True}, rounds=pytest.trials)
benchmark.extra_info["description"] = "Measures the performance of Strings.flatten"
benchmark.pedantic(thickrange.split, args=["_+"], kwargs={"regex": True}, rounds=pytest.trials)
benchmark.extra_info["description"] = "Measures the performance of Strings.split"
benchmark.extra_info["problem_size"] = pytest.prob_size
benchmark.extra_info["transfer_rate"] = "{:.4f} GiB/sec".format(
(nbytes / benchmark.stats["mean"]) / 2**30
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/run_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
"str-gather",
"str-in1d",
"substring_search",
"flatten",
"split",
"sort-cases",
"multiIO",
"str-locality",
Expand Down
34 changes: 17 additions & 17 deletions benchmarks/flatten.py → benchmarks/split.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import arkouda as ak


def time_flatten(N_per_locale, trials):
print(">>> arkouda flatten")
def time_split(N_per_locale, trials):
print(">>> arkouda split")
cfg = ak.get_config()
N = N_per_locale * cfg["numLocales"]
print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
Expand All @@ -21,17 +21,17 @@ def time_flatten(N_per_locale, trials):
regex_pattern_times = []
for i in range(trials):
start = time.time()
non_regex = thickrange.flatten("_")
non_regex = thickrange.split("_")
end = time.time()
non_regex_times.append(end - start)

start = time.time()
regex_literal = thickrange.flatten("_", regex=True)
regex_literal = thickrange.split("_", regex=True)
end = time.time()
regex_literal_times.append(end - start)

start = time.time()
regex_pattern = thickrange.flatten("_+", regex=True)
regex_pattern = thickrange.split("_+", regex=True)
end = time.time()
regex_pattern_times.append(end - start)

Expand All @@ -44,22 +44,22 @@ def time_flatten(N_per_locale, trials):
assert (regex_literal == answer).all()
assert (regex_pattern == answer).all()

print("non-regex flatten with literal delimiter Average time = {:.4f} sec".format(avg_non_regex))
print("regex flatten with literal delimiter Average time = {:.4f} sec".format(avg_regex_literal))
print("regex flatten with pattern delimiter Average time = {:.4f} sec".format(avg_regex_pattern))
print("non-regex split with literal delimiter Average time = {:.4f} sec".format(avg_non_regex))
print("regex split with literal delimiter Average time = {:.4f} sec".format(avg_regex_literal))
print("regex split with pattern delimiter Average time = {:.4f} sec".format(avg_regex_pattern))

print(
"non-regex flatten with literal delimiter Average rate = {:.4f} GiB/sec".format(
"non-regex split with literal delimiter Average rate = {:.4f} GiB/sec".format(
nbytes / 2**30 / avg_non_regex
)
)
print(
"regex flatten with literal delimiter Average rate = {:.4f} GiB/sec".format(
"regex split with literal delimiter Average rate = {:.4f} GiB/sec".format(
nbytes / 2**30 / avg_regex_literal
)
)
print(
"regex flatten with pattern delimiter Average rate = {:.4f} GiB/sec".format(
"regex split with pattern delimiter Average rate = {:.4f} GiB/sec".format(
nbytes / 2**30 / avg_regex_pattern
)
)
Expand All @@ -72,19 +72,19 @@ def check_correctness():
thickrange = thirds[0].stick(thirds[1], delimiter="_").stick(thirds[2], delimiter="_")

answer = ak.cast(ak.arange(N * 3), "str")
assert (thickrange.flatten("_") == answer).all()
assert (thickrange.flatten("_", regex=True) == answer).all()
assert (thickrange.flatten("_+", regex=True) == answer).all()
assert (thickrange.split("_") == answer).all()
assert (thickrange.split("_", regex=True) == answer).all()
assert (thickrange.split("_+", regex=True) == answer).all()


def create_parser():
parser = argparse.ArgumentParser(
description="Measure the performance of regex and non-regex flatten on Strings."
description="Measure the performance of regex and non-regex split on Strings."
)
parser.add_argument("hostname", help="Hostname of arkouda server")
parser.add_argument("port", type=int, help="Port of arkouda server")
parser.add_argument(
"-n", "--size", type=int, default=10**5, help="Problem size: Number of Strings to flatten"
"-n", "--size", type=int, default=10**5, help="Problem size: Number of Strings to split"
)
parser.add_argument(
"-t", "--trials", type=int, default=1, help="Number of times to run the benchmark"
Expand Down Expand Up @@ -112,5 +112,5 @@ def create_parser():

print("array size = {:,}".format(args.size))
print("number of trials = ", args.trials)
time_flatten(args.size, args.trials)
time_split(args.size, args.trials)
sys.exit(0)
4 changes: 2 additions & 2 deletions tests/io_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,7 @@ def test_segarray_read(self, par_test_base_tmp, comp):

def test_segarray_string(self, par_test_base_tmp):
words = ak.array(["one,two,three", "uno,dos,tres"])
strs, segs = words.split(",", return_segments=True)
strs, segs = words.regex_split(",", return_segments=True)
x = ak.SegArray(segs, strs)

with tempfile.TemporaryDirectory(dir=par_test_base_tmp) as tmp_dirname:
Expand Down Expand Up @@ -1865,7 +1865,7 @@ def test_dataframe_segarr(self, hdf_test_base_tmp):

def test_segarray_str_hdf5(self, hdf_test_base_tmp):
words = ak.array(["one,two,three", "uno,dos,tres"])
strs, segs = words.split(",", return_segments=True)
strs, segs = words.regex_split(",", return_segments=True)

x = ak.SegArray(segs, strs)
with tempfile.TemporaryDirectory(dir=hdf_test_base_tmp) as tmp_dirname:
Expand Down
18 changes: 9 additions & 9 deletions tests/regex_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def sub_helper(
def test_empty_string_patterns(self):
lit_str = ["0 String 0", "^", " "]
ak_str = ak.array(lit_str)
has_regex_arg = ["contains", "startswith", "endswith", "peel", "flatten"]
has_regex_arg = ["contains", "startswith", "endswith", "peel", "split"]

for pattern in "", "|", "^":
TestRegex.match_objects_helper(pattern, lit_str)
Expand All @@ -74,7 +74,7 @@ def test_empty_string_patterns(self):
# peel is broken on one char strings with patterns that match empty string
# str split and non-regex split (formerly flatten) don't work with empty separator, so
# it makes sense for the regex versions to return a value error
for fn in "peel", "split", "flatten":
for fn in "peel", "regex_split", "split":
func = getattr(ak_str, fn)
with pytest.raises(ValueError):
func(pattern, regex=True) if fn in has_regex_arg else func(pattern)
Expand All @@ -92,8 +92,8 @@ def test_empty_string_patterns(self):
"endswith",
"findall",
"peel",
"regex_split",
"split",
"flatten",
):
for s, pat in zip([ak.array([""]), ak.array(["0 String 0"])], ["", "$"]):
func = getattr(s, fn)
Expand Down Expand Up @@ -133,13 +133,13 @@ def test_caputure_groups(self):
# verify fluid programming with Match object doesn't raise a RuntimeError
ak.array(["1_2___", "____", "3", "__4___5____6___7", ""]).search("_+").find_matches()

def test_split(self):
def test_regex_split(self):
strings = ak.array(
["", "____", "_1_2____", "3___4___", "5", "__6__", "___7", "__8___9____10____11"]
)
pattern = "_+"
maxsplit = 3
split, split_map = strings.split(pattern, maxsplit, return_segments=True)
split, split_map = strings.regex_split(pattern, maxsplit, return_segments=True)
for i in range(strings.size):
re_split = re.split(pattern, strings[i], maxsplit)
ak_split = (
Expand Down Expand Up @@ -251,7 +251,7 @@ def test_regex_peel(self):
assert ["", "", "1f2g"] == d_right.to_list()
assert ["", "", "__f____g"] == u_right.to_list()

def test_regex_flatten(self):
def test_regex_regex_split(self):
orig = ak.array(["one|two", "three|four|five", "six", "seven|eight|nine|ten|", "eleven"])
digit = ak.array(["one1two", "three2four3five", "six", "seven4eight5nine6ten7", "eleven"])
under = ak.array(
Expand All @@ -274,7 +274,7 @@ def test_regex_flatten(self):
]
answer_map = [0, 2, 5, 6, 11]
for pattern, strings in zip(["|", "\\d", "_+"], [orig, digit, under]):
ak_flat, ak_map = strings.flatten(pattern, return_segments=True, regex=pattern != "|")
ak_flat, ak_map = strings.split(pattern, return_segments=True, regex=pattern != "|")
assert answer_flat == ak_flat.to_list()
assert answer_map == ak_map.to_list()

Expand All @@ -285,8 +285,8 @@ def test_regex_flatten(self):
answer_flat = ["", "", "", "", "1", "2", "3", "4", "", "5"]
answer_map = [0, 1, 3, 6, 9]

orig_flat, orig_map = orig.flatten("|", return_segments=True)
regex_flat, regex_map = regex.flatten("_+", return_segments=True, regex=True)
orig_flat, orig_map = orig.split("|", return_segments=True)
regex_flat, regex_map = regex.split("_+", return_segments=True, regex=True)

assert answer_flat == orig_flat.to_list()
assert answer_flat == regex_flat.to_list()
Expand Down
6 changes: 3 additions & 3 deletions tests/string_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,14 +451,14 @@ def test_str_output(self):
str_ans = "['string 0', 'string 1', 'string 2', ... , 'string 98', 'string 99', 'string 100']"
assert str_ans == str(strings)

def test_flatten(self):
def test_split(self):
orig = ak.array(["one|two", "three|four|five", "six"])
flat, mapping = orig.flatten("|", return_segments=True)
flat, mapping = orig.split("|", return_segments=True)
assert flat.to_list() == ["one", "two", "three", "four", "five", "six"]
assert mapping.to_list() == [0, 2, 5]
thirds = [ak.cast(ak.arange(i, 99, 3), "str") for i in range(3)]
thickrange = thirds[0].stick(thirds[1], delimiter=", ").stick(thirds[2], delimiter=", ")
flatrange = thickrange.flatten(", ")
flatrange = thickrange.split(", ")
assert ak.cast(flatrange, "int64").to_list(), np.arange(99).tolist()

def test_get_lengths(self):
Expand Down

0 comments on commit 57852b6

Please sign in to comment.