-
Notifications
You must be signed in to change notification settings - Fork 93
/
Copy pathdataframe.py
146 lines (119 loc) · 4.63 KB
/
dataframe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python3
import argparse
import time
import numpy as np
import pandas as pd
import arkouda as ak
# Names of the DataFrame methods this benchmark times; each one is looked up
# on the generated DataFrame with getattr() and called with no arguments.
OPS = ["_get_head_tail_server", "_get_head_tail"]

# Values accepted for the --dtype command-line option.
TYPES = ("int64", "uint64")
def generate_dataframe(N, seed):
    """Build a 20-column arkouda DataFrame with columns of length N.

    Columns are named ``c_0`` .. ``c_19`` and cycle through four kinds in
    this order: Categorical, pdarray, Strings, SegArray.

    Parameters
    ----------
    N
        Size passed to every column generator.
    seed
        Seeds NumPy's global RNG once, and is also passed unchanged to every
        ``ak.random_strings_uniform`` call.

    Returns
    -------
    ak.DataFrame
        DataFrame built from the generated column dictionary.
    """

    def random_strings():
        return ak.random_strings_uniform(minlen=5, maxlen=6, size=N, seed=seed)

    def random_ints():
        # Values come from NumPy's global RNG and are then wrapped by ak.array.
        return ak.array(np.random.randint(0, 2**32, N))

    np.random.seed(seed)

    columns = {}
    for idx in range(20):
        kind = idx % 4
        if kind == 0:
            column = ak.Categorical(random_strings())
        elif kind == 1:
            column = random_ints()
        elif kind == 2:
            column = random_strings()
        else:
            # Segments are ak.arange(0, N); values are random integers.
            column = ak.segarray(ak.arange(0, N), random_ints())
        columns[f"c_{idx}"] = column

    return ak.DataFrame(columns)
def time_ak_df_display(N_per_locale, trials, seed):
    """Time each DataFrame method named in ``OPS`` and print average time and rate.

    The problem size is ``N_per_locale`` multiplied by the server's
    ``numLocales`` configuration value. Every method in ``OPS`` is called
    once per trial on the same generated DataFrame.

    Parameters
    ----------
    N_per_locale
        Column length per locale.
    trials
        Number of timing repetitions; must be at least 1.
    seed
        Forwarded to ``generate_dataframe``.

    Raises
    ------
    ValueError
        If ``trials`` is less than 1, since an average over zero trials is
        undefined (previously this surfaced as a ZeroDivisionError).
    """
    if trials < 1:
        raise ValueError("Number of trials must be at least 1, not {}".format(trials))

    print(">>> arkouda dataframe display")
    cfg = ak.get_config()
    num_locales = cfg["numLocales"]
    N = N_per_locale * num_locales
    print("numLocales = {}, N = {:,}".format(num_locales, N))

    pd.set_option("display.max_rows", 100)
    pd.set_option("display.min_rows", 10)
    pd.set_option("display.max_columns", 20)

    df = generate_dataframe(N, seed)

    timings = {op: [] for op in OPS}
    for _ in range(trials):
        for op, samples in timings.items():
            fxn = getattr(df, op)
            # perf_counter() is the monotonic, highest-resolution clock meant
            # for measuring short durations; time.time() can jump or be coarse.
            start = time.perf_counter()
            fxn()
            samples.append(time.perf_counter() - start)
    tavg = {op: sum(samples) / trials for op, samples in timings.items()}

    # Estimate the number of bytes held by the columns. Columns whose type
    # matches none of the branches contribute nothing to the total.
    # NOTE(review): assumes df[col] yields the underlying column object
    # (pdarray / Categorical / Strings / SegArray) — confirm for the arkouda
    # version in use, otherwise nbytes stays 0.
    nbytes = 0
    for col in df.columns:
        col_obj = df[col]
        if isinstance(col_obj, ak.pdarray):
            nbytes += col_obj.size * col_obj.itemsize
        elif isinstance(col_obj, ak.Categorical):
            nbytes += col_obj.codes.size * col_obj.codes.itemsize
        elif isinstance(col_obj, ak.Strings):
            nbytes += col_obj.nbytes * col_obj.entry.itemsize
        elif isinstance(col_obj, ak.SegArray):
            nbytes += (
                col_obj.values.size * col_obj.values.itemsize
                + col_obj.segments.size * col_obj.segments.itemsize
            )

    for op, t in tavg.items():
        print(" {} Average time = {:.4f} sec".format(op, t))
        # A zero average (elapsed time below clock resolution) would divide
        # by zero; report an infinite rate instead of crashing.
        bytes_per_sec = nbytes / t if t > 0 else float("inf")
        print(" {} Average rate = {:.2f} GiB/sec".format(op, bytes_per_sec / 2**30))
def check_correctness(N_per_locale, seed):
    """Check the shape of the pandas frame returned by ``_get_head_tail_server``.

    Generates a DataFrame of size ``N_per_locale * numLocales``, applies the
    same pandas display options as the timing run, and asserts on the shape
    of the result.

    Parameters
    ----------
    N_per_locale
        Column length per locale.
    seed
        Forwarded to ``generate_dataframe``.

    Raises
    ------
    AssertionError
        If the result does not have 101 rows and 20 columns.
    """
    num_locales = ak.get_config()["numLocales"]
    df = generate_dataframe(N_per_locale * num_locales, seed)

    display_options = (
        ("display.max_rows", 100),
        ("display.min_rows", 10),
        ("display.max_columns", 20),
    )
    for option, value in display_options:
        pd.set_option(option, value)

    printdf = df._get_head_tail_server()

    # Only the shape of the printable frame is verified, not its contents.
    # NOTE(review): 101 presumably follows from display.max_rows = 100 and
    # only holds when the frame has more rows than that — confirm.
    row_count = printdf.shape[0]
    assert row_count == 101
    column_count = printdf.shape[1]
    assert column_count == 20
def create_parser():
    """Build the command-line parser for the dataframe display benchmark.

    Positional arguments are the arkouda server ``hostname`` and ``port``.
    Options cover the problem size, trial count, dtype, a correctness-only
    switch, and the RNG seed.

    Returns
    -------
    argparse.ArgumentParser
        The configured parser; nothing is parsed here.
    """
    parser = argparse.ArgumentParser(
        description="Run the dataframe display benchmarks: _get_head_tail, _get_head_tail_server"
    )
    add = parser.add_argument

    # Server location.
    add("hostname", help="Hostname of arkouda server")
    add("port", type=int, help="Port of arkouda server")

    # Benchmark controls.
    add(
        "-n",
        "--size",
        type=int,
        default=10**4,
        help="Problem size: length of columns in dataframe.",
    )
    add(
        "-t",
        "--trials",
        type=int,
        default=1,
        help="Number of times to run the benchmark",
    )
    add(
        "-d",
        "--dtype",
        default="int64",
        help=f"Dtype of array ({', '.join(TYPES)})",
    )
    add(
        "--correctness-only",
        default=False,
        action="store_true",
        help="Only check correctness, not performance.",
    )
    add(
        "-s",
        "--seed",
        default=None,
        type=int,
        help="Value to initialize random number generator",
    )
    return parser
if __name__ == "__main__":
    import sys

    args = create_parser().parse_args()

    # --dtype is validated here but is not passed on to either path below.
    if args.dtype not in TYPES:
        raise ValueError(f"Dtype must be {'/'.join(TYPES)}, not {args.dtype}")

    ak.verbose = False
    ak.connect(args.hostname, args.port)

    if not args.correctness_only:
        # Performance run: echo the parameters, then time the display methods.
        print(f"array size = {args.size:,}")
        print("number of trials = ", args.trials)
        time_ak_df_display(args.size, args.trials, args.seed)
    else:
        check_correctness(args.size, args.seed)

    # Both paths finish with a successful exit status.
    sys.exit(0)