Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closes #1300 - Improve Performance of DataFrame Display #1334

Merged
1 change: 1 addition & 0 deletions ServerModules.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ ArraySetopsMsg
KExtremeMsg
ArgSortMsg
SegmentedMsg
DataFrameIndexingMsg
OperatorMsg
RandMsg
IndexingMsg
Expand Down
60 changes: 56 additions & 4 deletions arkouda/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,22 @@
from warnings import warn
import pandas as pd # type: ignore
import random
import json
from typing import cast

from arkouda.segarray import SegArray
from arkouda.pdarrayclass import pdarray
from arkouda.categorical import Categorical
from arkouda.strings import Strings
from arkouda.pdarraycreation import arange, array
from arkouda.pdarraycreation import arange, array, create_pdarray
from arkouda.groupbyclass import GroupBy as akGroupBy
from arkouda.pdarraysetops import concatenate, unique, intersect1d, in1d
from arkouda.pdarrayIO import save_all, load_all
from arkouda.dtypes import int64 as akint64
from arkouda.dtypes import float64 as akfloat64
from arkouda.sorting import argsort, coargsort
from arkouda.numeric import where
from arkouda.client import maxTransferBytes
from arkouda.client import maxTransferBytes, generic_msg
from arkouda.row import Row
from arkouda.alignment import in1dmulti
from arkouda.series import Series
Expand Down Expand Up @@ -422,6 +424,55 @@ def _get_head_tail(self):
newdf._set_index(idx)
return newdf.to_pandas(retain_index=True)

def _get_head_tail_server(self):
    """
    Build the pandas DataFrame that is used to display this DataFrame,
    letting the server gather the head and tail rows in one request.

    When the frame has no more rows than pandas' ``display.max_rows``
    option (or that option is ``None``, pandas' "no limit" setting), every
    row is converted. Otherwise only the first ``maxrows // 2 + 1`` and the
    last ``maxrows // 2`` rows are requested from the server through the
    ``dataframe_idx`` command and converted.

    Returns
    -------
    pandas.DataFrame
        An empty frame when this DataFrame is empty; otherwise the selected
        rows, indexed by their original positions.
    """
    if self._empty:
        return pd.DataFrame()
    self.update_size()
    maxrows = pd.get_option('display.max_rows')
    # pandas uses None for "no row limit"; comparing an int against None
    # would raise TypeError, so treat it as "show every row".
    if maxrows is None or self._size <= maxrows:
        newdf = DataFrame()
        for col in self._columns:
            values = self[col]
            if isinstance(values, Categorical):
                # Expand the codes into their category labels for display
                newdf[col] = values.categories[values.codes]
            else:
                newdf[col] = values
        newdf._set_index(self.index)
        return newdf.to_pandas(retain_index=True)

    half = maxrows // 2
    # Being 1 above the threshold causes the PANDAS formatter to split the data frame vertically
    idx = array(list(range(half + 1)) + list(range(self._size - half, self._size)))

    # Describe every column to the server as "<type>+<column>+<server-side name(s)>"
    msg_list = []
    for col in self._columns:
        values = self[col]
        if isinstance(values, Categorical):
            msg_list.append(f"Categorical+{col}+{values.codes.name}+{values.categories.name}")
        elif isinstance(values, SegArray):
            msg_list.append(f"SegArray+{col}+{values.segments.name}+{values.values.name}")
        elif isinstance(values, Strings):
            msg_list.append(f"Strings+{col}+{values.name}")
        else:
            msg_list.append(f"pdarray+{col}+{values.name}")

    repMsg = cast(str, generic_msg(cmd="dataframe_idx",
                                   args="{} {} {}".format(len(msg_list), idx.name, json.dumps(msg_list))))
    msgList = json.loads(repMsg)

    df_dict = {}
    for m in msgList:
        # Split to [datatype, column, create]; maxsplit=2 keeps any further
        # "+" separators inside the create payload.
        t, col, create = m.split("+", 2)
        if t == "Strings":
            df_dict[col] = Strings.from_return_msg(create)
        elif t == "SegArray":
            # split creates for segments and values
            eles = create.split("+")
            df_dict[col] = SegArray(create_pdarray(eles[0]), create_pdarray(eles[1]))
        else:
            # Every other type tag in the reply is rebuilt as a plain pdarray
            df_dict[col] = create_pdarray(create)

    new_df = DataFrame(df_dict)
    new_df._set_index(idx)
    # Re-select by self._columns so the output keeps this frame's column order
    return new_df.to_pandas(retain_index=True)[self._columns]

def _shape_str(self):
return "{} rows x {} columns".format(self.size, self._ncols())

Expand All @@ -430,7 +481,7 @@ def __repr__(self):
Return ascii-formatted version of the dataframe.
"""

prt = self._get_head_tail()
prt = self._get_head_tail_server()
with pd.option_context("display.show_dimensions", False):
retval = prt.__repr__()
retval += " (" + self._shape_str() + ")"
Expand All @@ -440,8 +491,9 @@ def _repr_html_(self):
"""
Return html-formatted version of the dataframe.
"""
#
Ethan-DeBandi99 marked this conversation as resolved.
Show resolved Hide resolved
prt = self._get_head_tail_server()

prt = self._get_head_tail()
reuster986 marked this conversation as resolved.
Show resolved Hide resolved
with pd.option_context("display.show_dimensions", False):
retval = prt._repr_html_()
retval += "<p>" + self._shape_str() + "</p>"
Expand Down
112 changes: 112 additions & 0 deletions benchmarks/dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
import string
import time, argparse
import numpy as np
import pandas as pd

import arkouda as ak
import random

# Names of the DataFrame methods timed by the benchmark; each one is looked up
# with getattr on the generated DataFrame and called with no arguments.
OPS = ['_repr_html_', '_get_head_tail_server', '_get_head_tail']
# Values accepted for the --dtype command-line option.
TYPES = ('int64', 'uint64',)

def generate_dataframe(N):
    """
    Build an arkouda DataFrame with 20 columns named ``c_0`` .. ``c_19``.

    The type of each column is drawn at random from Categorical, pdarray,
    Strings and SegArray. String-based columns hold ``N`` random 5-character
    alphanumeric strings, pdarray columns hold ``N`` random integers in
    ``[0, 2**32)``, and SegArray columns hold ``N`` segments of 5 such
    integers each.
    """
    alphabet = string.ascii_letters + string.digits
    column_kinds = [ak.Categorical, ak.pdarray, ak.Strings, ak.SegArray]

    def random_strings():
        # N random 5-character strings, shipped to the server as a Strings array
        return ak.array(["".join(random.choices(alphabet, k=5)) for _ in range(N)])

    columns = {}
    for col_num in range(20):  # 20 randomly typed columns
        name = f"c_{col_num}"
        kind = column_kinds[random.randint(0, len(column_kinds) - 1)]
        if kind == ak.Categorical:
            columns[name] = ak.Categorical(random_strings())
        elif kind == ak.pdarray:
            columns[name] = ak.array(np.random.randint(0, 2 ** 32, N))
        elif kind == ak.Strings:
            columns[name] = random_strings()
        elif kind == ak.SegArray:
            segments = ak.arange(0, N * 5, 5)
            values = ak.array(np.random.randint(0, 2 ** 32, N * 5))
            columns[name] = ak.SegArray(segments, values)

    return ak.DataFrame(columns)

def time_ak_df_display(N_per_locale, trials):
    """
    Time every display operation listed in OPS on a generated DataFrame.

    Parameters
    ----------
    N_per_locale : int
        Rows per locale; the DataFrame gets ``N_per_locale * numLocales`` rows.
    trials : int
        Number of times each operation is run; the reported time is the
        mean over all trials.
    """
    print(">>> arkouda dataframe display")
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))

    pd.set_option("display.max_rows", 100)
    pd.set_option("display.min_rows", 10)
    pd.set_option("display.max_columns", 20)

    df = generate_dataframe(N)

    # Created once, outside the trial loop, so that every trial's measurement
    # is kept and the division by `trials` below yields a true average.
    timings = {op: [] for op in OPS}
    for _ in range(trials):
        for op in OPS:
            fxn = getattr(df, op)
            start = time.time()
            fxn()
            end = time.time()
            timings[op].append(end - start)

    tavg = {op: sum(t) / trials for op, t in timings.items()}

    for op, t in tavg.items():
        print(" {} Average time = {:.4f} sec".format(op, t))
        bytes_per_sec = (df.size * 64 * 2) / t
        print(" {} Average rate = {:.2f} GiB/sec".format(op, bytes_per_sec / 2 ** 30))

def check_correctness(N_per_locale):
    """
    Verify the shape of the pandas frame produced by ``_get_head_tail_server``.

    Parameters
    ----------
    N_per_locale : int
        Rows per locale; the DataFrame gets ``N_per_locale * numLocales`` rows.

    Raises
    ------
    AssertionError
        If the display frame does not have the expected number of rows or
        columns.
    """
    cfg = ak.get_config()
    N = N_per_locale * cfg["numLocales"]
    df = generate_dataframe(N)

    max_rows = 100
    pd.set_option("display.max_rows", max_rows)
    pd.set_option("display.min_rows", 10)
    pd.set_option("display.max_columns", 20)

    printdf = df._get_head_tail_server()  # measure the pandas df returned
    # Mainly want to verify shape for the print.
    # A frame that fits within max_rows is returned whole; a larger one is
    # reduced to (max_rows // 2 + 1) head rows plus (max_rows // 2) tail rows.
    expected_rows = N if N <= max_rows else 2 * (max_rows // 2) + 1
    assert printdf.shape[0] == expected_rows, \
        "expected {} rows, got {}".format(expected_rows, printdf.shape[0])
    assert printdf.shape[1] == 20, \
        "expected 20 columns, got {}".format(printdf.shape[1])


def create_parser():
    """
    Build the command-line parser for the dataframe display benchmark.

    Returns
    -------
    argparse.ArgumentParser
        Parser with the positional ``hostname`` and ``port`` arguments and
        the ``--size``, ``--trials``, ``--dtype``, ``--correctness-only``
        and ``--seed`` options.
    """
    # The description and --size help previously described the setops
    # benchmark; they now describe what this script actually measures.
    parser = argparse.ArgumentParser(
        description="Run the dataframe display benchmarks: {}".format(', '.join(OPS)))
    parser.add_argument('hostname', help='Hostname of arkouda server')
    parser.add_argument('port', type=int, help='Port of arkouda server')
    parser.add_argument('-n', '--size', type=int, default=10**4,
                        help='Problem size: number of rows per locale in the generated dataframe')
    parser.add_argument('-t', '--trials', type=int, default=1, help='Number of times to run the benchmark')
    parser.add_argument('-d', '--dtype', default='int64', help='Dtype of array ({})'.format(', '.join(TYPES)))
    parser.add_argument('--correctness-only', default=False, action='store_true',
                        help='Only check correctness, not performance.')
    parser.add_argument('-s', '--seed', default=None, type=int, help='Value to initialize random number generator')
    return parser


if __name__ == "__main__":
    # Script entry point: parse arguments, connect to the arkouda server,
    # then run either the correctness check or the timing benchmark.
    import sys

    parser = create_parser()
    args = parser.parse_args()
    if args.dtype not in TYPES:
        raise ValueError("Dtype must be {}, not {}".format('/'.join(TYPES), args.dtype))

    # Apply --seed to both generators used when building the dataframe, so
    # that runs are reproducible (the option was previously parsed but ignored).
    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)

    ak.verbose = False
    ak.connect(args.hostname, args.port)

    if args.correctness_only:
        # The check does not depend on dtype, so it is run once rather than
        # once per entry in TYPES.
        check_correctness(args.size)
        sys.exit(0)

    print("array size = {:,}".format(args.size))
    print("number of trials = ", args.trials)
    time_ak_df_display(args.size, args.trials)

    sys.exit(0)
Loading