From 2962954b7449556efe2e98a533ecc6d9fdc2c6fd Mon Sep 17 00:00:00 2001 From: Ben McDonald <46734217+bmcdonald3@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:03:44 -0700 Subject: [PATCH 1/2] Add Parquet fixed length string benchmark Add a Parquet string read benchmark that compares the performance of reading Parquet files with a fixed number of characters with and without the fixed length optimization. This will test (1) single file reads (2) multi-file reads with larger files (3) multi-file reads with smaller files and (4) all of the previous with the fixed length optimization. Here are results demonstrating ~2x speedup with using the fixed length string optimization collected on a Cray XC with a Lustre file system: | test | sec | |:------------------|-------:| | single-file | 39.388 | | fixed-single | 20.664 | | scaled-five | 17.283 | | fixed-scaled-five | 8.814 | | five | 85.332 | | fixed-five | 43.743 | | scaled-ten | 8.998 | | fixed-scaled-ten | 4.794 | | ten | 86.761 | | fixed-ten | 45.498 | --- benchmarks/parquet-fixed-strings.py | 167 ++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 benchmarks/parquet-fixed-strings.py diff --git a/benchmarks/parquet-fixed-strings.py b/benchmarks/parquet-fixed-strings.py new file mode 100644 index 0000000000..23374b71ce --- /dev/null +++ b/benchmarks/parquet-fixed-strings.py @@ -0,0 +1,167 @@ +import time +import arkouda as ak +import os +import argparse +import shutil +import pandas as pd + +str_length = 2 +test_dir = '' +test_results = { + 'single-file':0, + 'fixed-single':0, + 'scaled-five':0, + 'fixed-scaled-five':0, + 'five':0, + 'fixed-five':0, + 'scaled-ten':0, + 'fixed-scaled-ten':0, + 'ten':0, + 'fixed-ten':0 +} + +correctness_test = False + +def generate_arr(num_files, scaling): + if scaling: + return ak.random_strings_uniform(str_length, str_length+1, int(size/num_files)) + else: + return ak.random_strings_uniform(str_length, str_length+1, size) + +def compare_arrs(a,b): + for i in range(len(a)): + if a[i] != b[i]: + print("FAIL!") + print(a[i], '!=', b[i], " at ", i) + +def write_files(): + # write 1 file with size + a = generate_arr(1, False) + a.to_parquet(test_dir+"single-file") + # write 5 files with size/5 + for i in range(5): + a = generate_arr(5, True) + a.to_parquet(test_dir+"scaled-five"+str(i)) + # write 5 files with size + for i in range(5): + a = generate_arr(5, False) + a.to_parquet(test_dir+"five"+str(i)) + # write 10 files with size/10 + for i in range(10): + a = generate_arr(10, True) + a.to_parquet(test_dir+"scaled-ten"+str(i)) + # write 10 files with size + for i in range(10): + a = generate_arr(10,False) + a.to_parquet(test_dir+"ten"+str(i)) + +def read_files_fixed(): + start = time.time() + a = ak.read(test_dir +"single-file*", fixed_len=str_length) + stop = time.time() + test_results["fixed-single"] += stop-start + + start = time.time() + a = ak.read(test_dir +"scaled-five*", fixed_len=str_length) + stop = time.time() + test_results["fixed-scaled-five"] += stop-start + + start = time.time() + a = ak.read(test_dir +"five*", fixed_len=str_length) + stop = time.time() + test_results["fixed-five"] += stop-start + + start = time.time() + a = ak.read(test_dir +"scaled-ten*", fixed_len=str_length) + stop = time.time() + test_results["fixed-scaled-ten"] += stop-start + + start = time.time() + a = ak.read(test_dir +"ten*", fixed_len=str_length) + stop = time.time() + test_results["fixed-ten"] += stop-start + +def read_files(): + start = time.time() + a = ak.read(test_dir +"single-file*") + stop = time.time() + test_results["single-file"] += stop-start + + start = time.time() + a = ak.read(test_dir +"scaled-five*") + stop = time.time() + test_results["scaled-five"] += stop-start + + start = time.time() + a = ak.read(test_dir +"five*") + stop = time.time() + test_results["five"] += stop-start + + start = time.time() + a = ak.read(test_dir +"scaled-ten*") + stop = time.time() + test_results["scaled-ten"] += stop-start + + start = time.time() + a = ak.read(test_dir +"ten*") + stop = time.time() + test_results["ten"] += stop-start + +def delete_folder_contents(folder_path): + for root, _, files in os.walk(folder_path): + for file in files: + file_path = os.path.join(root, file) + try: + os.remove(file_path) + except OSError as e: + print(f"Error deleting file {file_path}: {e}") + +def print_performance_table(test_results): + data = [(test, time) for test, time in test_results.items()] + df = pd.DataFrame(data, columns=["test", "sec"]) + df["sec"] = df["sec"].apply(lambda x: f"{x:.3f}") + print(df.to_markdown(index=False)) + +def create_parser(): + parser = argparse.ArgumentParser( + description="Measure performance of writing and reading random arrays from disk." + ) + parser.add_argument("hostname", help="Hostname of arkouda server") + parser.add_argument("port", type=int, help="Port of arkouda server") + parser.add_argument( + "-n", "--size", type=int, default=10**7, help="Problem size: length of array to write/read" + ) + parser.add_argument( + "-w", "--write", default=False, action="store_true", + ) + parser.add_argument( + "-f", "--fixed", default=False, action="store_true", + ) + parser.add_argument( + "-p", + "--path", + default=os.path.join(os.getcwd(), "ak-io-test/"), + help="Target path for measuring read/write rates", + ) + return parser + +if __name__ == "__main__": + import sys + parser = create_parser() + args = parser.parse_args() + ak.connect(args.hostname, args.port) + if not os.path.exists(args.path): + os.makedirs(args.path) + test_dir = args.path + size = args.size + + write_files() + + read_files_fixed() + read_files() + + delete_folder_contents(test_dir) + + print_performance_table(test_results) + +import pandas as pd From b475c13a43a1f496c8aaff453733ffccca32a58b Mon Sep 17 00:00:00 2001 From: Ben McDonald <46734217+bmcdonald3@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:29:31 -0700 Subject: [PATCH 2/2] Change default size --- benchmarks/parquet-fixed-strings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/parquet-fixed-strings.py b/benchmarks/parquet-fixed-strings.py index 23374b71ce..a8b7d0383d 100644 --- a/benchmarks/parquet-fixed-strings.py +++ b/benchmarks/parquet-fixed-strings.py @@ -129,7 +129,7 @@ def create_parser(): parser.add_argument("hostname", help="Hostname of arkouda server") parser.add_argument("port", type=int, help="Port of arkouda server") parser.add_argument( - "-n", "--size", type=int, default=10**7, help="Problem size: length of array to write/read" + "-n", "--size", type=int, default=10**8, help="Problem size: length of array to write/read" ) parser.add_argument( "-w", "--write", default=False, action="store_true",