Bears-R-Us · mhmerrill · Apr 29, 2022 · Apr 20, 2022 · Apr 22, 2022 · Apr 26, 2022
diff --git a/ServerModules.cfg b/ServerModules.cfg
@@ -5,6 +5,7 @@ ArraySetopsMsg
 KExtremeMsg
 ArgSortMsg
 SegmentedMsg
+DataFrameIndexingMsg
 OperatorMsg
 RandMsg
 IndexingMsg

diff --git a/arkouda/dataframe.py b/arkouda/dataframe.py
@@ -4,20 +4,22 @@
 from warnings import warn
 import pandas as pd  # type: ignore
 import random
+import json
+from typing import cast
 
 from arkouda.segarray import SegArray
 from arkouda.pdarrayclass import pdarray
 from arkouda.categorical import Categorical
 from arkouda.strings import Strings
-from arkouda.pdarraycreation import arange, array
+from arkouda.pdarraycreation import arange, array, create_pdarray
 from arkouda.groupbyclass import GroupBy as akGroupBy
 from arkouda.pdarraysetops import concatenate, unique, intersect1d, in1d
 from arkouda.pdarrayIO import save_all, load_all
 from arkouda.dtypes import int64 as akint64
 from arkouda.dtypes import float64 as akfloat64
 from arkouda.sorting import argsort, coargsort
 from arkouda.numeric import where
-from arkouda.client import maxTransferBytes
+from arkouda.client import maxTransferBytes, generic_msg
 from arkouda.row import Row
 from arkouda.alignment import in1dmulti
 from arkouda.series import Series
@@ -422,6 +424,55 @@ def _get_head_tail(self):
         newdf._set_index(idx)
         return newdf.to_pandas(retain_index=True)
 
+    def _get_head_tail_server(self):
+        """
+        Build the pandas DataFrame that is used to display this DataFrame.
+
+        If the frame has no more rows than the pandas ``display.max_rows``
+        option (or that option is ``None``), every row is converted.
+        Otherwise only the leading and trailing rows are kept: the row
+        selection for all columns is requested with a single
+        ``dataframe_idx`` message and the reply is rebuilt into a
+        DataFrame before conversion.
+
+        Returns
+        -------
+        pandas.DataFrame
+            An empty frame when ``self._empty`` is set, otherwise the rows
+            to display, converted with ``retain_index=True``.
+        """
+        if self._empty:
+            return pd.DataFrame()
+        self.update_size()
+        maxrows = pd.get_option('display.max_rows')
+        # pandas uses None for "no row limit"; comparing an int with None
+        # raises TypeError, so treat None as "show every row".
+        if maxrows is None or self._size <= maxrows:
+            newdf = DataFrame()
+            for col in self._columns:
+                column = self[col]
+                if isinstance(column, Categorical):
+                    newdf[col] = column.categories[column.codes]
+                else:
+                    newdf[col] = column
+            newdf._set_index(self.index)
+            return newdf.to_pandas(retain_index=True)
+        # Being 1 above the threshold causes the PANDAS formatter to split the data frame vertically
+        half = maxrows // 2
+        idx = array(list(range(half + 1)) + list(range(self._size - half, self._size)))
+
+        # One "<type>+<column>+<object name(s)>" entry per column.
+        # NOTE(review): '+' is the field delimiter here and when parsing the
+        # reply, so a column name containing '+' looks unsafe - confirm.
+        msg_list = []
+        for col in self._columns:
+            column = self[col]
+            if isinstance(column, Categorical):
+                msg_list.append(f"Categorical+{col}+{column.codes.name}+{column.categories.name}")
+            elif isinstance(column, SegArray):
+                msg_list.append(f"SegArray+{col}+{column.segments.name}+{column.values.name}")
+            elif isinstance(column, Strings):
+                msg_list.append(f"Strings+{col}+{column.name}")
+            else:
+                msg_list.append(f"pdarray+{col}+{column.name}")
+
+        repMsg = cast(str, generic_msg(cmd="dataframe_idx", args="{} {} {}".
+                                       format(len(msg_list), idx.name, json.dumps(msg_list))))
+        reply_entries = json.loads(repMsg)
+
+        df_dict = {}
+        for entry in reply_entries:
+            # Split to [datatype, column, create]
+            dtype_tag, col_name, create_msg = entry.split("+", 2)
+            if dtype_tag == "Strings":
+                df_dict[col_name] = Strings.from_return_msg(create_msg)
+            elif dtype_tag == "SegArray":
+                # split creates for segments and values
+                eles = create_msg.split("+")
+                df_dict[col_name] = SegArray(create_pdarray(eles[0]), create_pdarray(eles[1]))
+            else:
+                df_dict[col_name] = create_pdarray(create_msg)
+
+        new_df = DataFrame(df_dict)
+        new_df._set_index(idx)
+        # Re-select by self._columns so the original column order is used.
+        return new_df.to_pandas(retain_index=True)[self._columns]
+
     def _shape_str(self):
         return "{} rows x {} columns".format(self.size, self._ncols())
 
@@ -430,7 +481,7 @@ def __repr__(self):
         Return ascii-formatted version of the dataframe.
         """
 
-        prt = self._get_head_tail()
+        prt = self._get_head_tail_server()
         with pd.option_context("display.show_dimensions", False):
             retval = prt.__repr__()
         retval += " (" + self._shape_str() + ")"
@@ -440,8 +491,9 @@ def _repr_html_(self):
         """
         Return html-formatted version of the dataframe.
         """
+        #
+        prt = self._get_head_tail_server()
 
-        prt = self._get_head_tail()
         with pd.option_context("display.show_dimensions", False):
             retval = prt._repr_html_()
         retval += "<p>" + self._shape_str() + "</p>"

diff --git a/benchmarks/dataframe.py b/benchmarks/dataframe.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+import string
+import time, argparse
+import numpy as np
+import pandas as pd
+
+import arkouda as ak
+import random
+
+# Names of the DataFrame methods that the benchmark looks up and times.
+OPS = [
+    "_repr_html_",
+    "_get_head_tail_server",
+    "_get_head_tail",
+]
+# Values accepted for the --dtype command line option.
+TYPES = ("int64", "uint64")
+
+def generate_dataframe(N):
+    """
+    Build an arkouda DataFrame with 20 randomly typed columns of N rows.
+
+    Each column ``c_0`` .. ``c_19`` is independently chosen to be a
+    Categorical, pdarray, Strings or SegArray and is filled with random
+    data drawn from ``random`` and ``numpy.random``.
+
+    Parameters
+    ----------
+    N : int
+        Number of rows in every column.
+
+    Returns
+    -------
+    ak.DataFrame
+        The generated frame.
+    """
+    types = [ak.Categorical, ak.pdarray, ak.Strings, ak.SegArray]
+    alphabet = string.ascii_letters + string.digits
+
+    def random_strings():
+        # N random 5-character alphanumeric strings as an arkouda array
+        return ak.array(["".join(random.choices(alphabet, k=5)) for _ in range(N)])
+
+    # generate random columns to build dataframe
+    df_dict = {}
+    for x in range(20):  # loop to create 20 random columns
+        key = f"c_{x}"
+        d = random.choice(types)
+        if d is ak.Categorical:
+            df_dict[key] = ak.Categorical(random_strings())
+        elif d is ak.pdarray:
+            # dtype is pinned because the upper bound 2 ** 32 does not fit
+            # a 32-bit platform default integer.
+            df_dict[key] = ak.array(np.random.randint(0, 2 ** 32, N, dtype=np.int64))
+        elif d is ak.Strings:
+            df_dict[key] = random_strings()
+        elif d is ak.SegArray:
+            # segments start every 5 values, so each row holds 5 values
+            segments = ak.arange(0, N*5, 5)
+            values = ak.array(np.random.randint(0, 2 ** 32, N*5, dtype=np.int64))
+            df_dict[key] = ak.SegArray(segments, values)
+
+    return ak.DataFrame(df_dict)
+
+def time_ak_df_display(N_per_locale, trials):
+    """
+    Time every DataFrame method listed in OPS and print the averages.
+
+    Parameters
+    ----------
+    N_per_locale : int
+        Rows per locale; the frame has ``N_per_locale * numLocales`` rows.
+    trials : int
+        Number of times each method is called and timed.
+    """
+    print(">>> arkouda dataframe display")
+    cfg = ak.get_config()
+    N = N_per_locale * cfg["numLocales"]
+    print("numLocales = {}, N = {:,}".format(cfg["numLocales"], N))
+
+    pd.set_option("display.max_rows", 100)
+    pd.set_option("display.min_rows", 10)
+    pd.set_option("display.max_columns", 20)
+
+    df = generate_dataframe(N)
+
+    # Created once, outside the trial loop, so that the samples of every
+    # trial accumulate and the average below covers all of them.
+    timings = {op: [] for op in OPS}
+    for _ in range(trials):
+        for op in OPS:
+            fxn = getattr(df, op)
+            start = time.time()
+            fxn()
+            end = time.time()
+            timings[op].append(end - start)
+
+    tavg = {op: sum(t) / trials for op, t in timings.items()}
+
+    for op, t in tavg.items():
+        print("  {} Average time = {:.4f} sec".format(op, t))
+        # NOTE(review): the 64 * 2 factor is not explained anywhere in
+        # this file - confirm what byte count it is meant to model.
+        bytes_per_sec = (df.size * 64 * 2) / t
+        print("  {} Average rate = {:.2f} GiB/sec".format(op, bytes_per_sec / 2 ** 30))
+
+def check_correctness(N_per_locale):
+    """
+    Verify the shape of the display frame built by _get_head_tail_server.
+
+    Parameters
+    ----------
+    N_per_locale : int
+        Rows per locale; the frame has ``N_per_locale * numLocales`` rows.
+
+    Raises
+    ------
+    AssertionError
+        If the returned pandas frame does not have the expected shape.
+    """
+    cfg = ak.get_config()
+    N = N_per_locale * cfg["numLocales"]
+    df = generate_dataframe(N)
+
+    max_rows = 100
+    pd.set_option("display.max_rows", max_rows)
+    pd.set_option("display.min_rows", 10)
+    pd.set_option("display.max_columns", 20)
+
+    printdf = df._get_head_tail_server()  # measure the pandas df returned
+    # Mainly want to verify shape for the print.
+    # A frame with at most max_rows rows comes back whole; a longer one is
+    # cut to (max_rows // 2 + 1) leading rows plus max_rows // 2 trailing rows.
+    if N <= max_rows:
+        expected_rows = N
+    else:
+        expected_rows = (max_rows // 2 + 1) + max_rows // 2
+    assert printdf.shape[0] == expected_rows
+    assert printdf.shape[1] == 20
+
+
+def create_parser():
+    """
+    Create the command line parser for the dataframe display benchmark.
+
+    Returns
+    -------
+    argparse.ArgumentParser
+        Parser with the positional ``hostname`` and ``port`` arguments and
+        the ``--size``, ``--trials``, ``--dtype``, ``--correctness-only``
+        and ``--seed`` options.
+    """
+    parser = argparse.ArgumentParser(description="Run the dataframe display benchmarks: " + ", ".join(OPS))
+    parser.add_argument('hostname', help='Hostname of arkouda server')
+    parser.add_argument('port', type=int, help='Port of arkouda server')
+    parser.add_argument('-n', '--size', type=int, default=10**4,
+                        help='Problem size: number of dataframe rows per locale')
+    parser.add_argument('-t', '--trials', type=int, default=1, help='Number of times to run the benchmark')
+    parser.add_argument('-d', '--dtype', default='int64', help='Dtype of array ({})'.format(', '.join(TYPES)))
+    parser.add_argument('--correctness-only', default=False, action='store_true',
+                        help='Only check correctness, not performance.')
+    parser.add_argument('-s', '--seed', default=None, type=int, help='Value to initialize random number generator')
+    return parser
+
+
+# Script entry point: parse arguments, connect to the arkouda server, then
+# run either the correctness check or the timing benchmark.
+if __name__ == "__main__":
+    import sys
+
+    parser = create_parser()
+    args = parser.parse_args()
+    if args.dtype not in TYPES:
+        raise ValueError("Dtype must be {}, not {}".format('/'.join(TYPES), args.dtype))
+
+    # Seed both generators that generate_dataframe draws from so that a run
+    # is reproducible when --seed is given.
+    if args.seed is not None:
+        random.seed(args.seed)
+        np.random.seed(args.seed)
+
+    ak.verbose = False
+    ak.connect(args.hostname, args.port)
+
+    if args.correctness_only:
+        # NOTE(review): check_correctness takes no dtype, so this loop only
+        # repeats the same check once per entry in TYPES - confirm intent.
+        for dtype in TYPES:
+            check_correctness(args.size)
+        sys.exit(0)
+
+    print("array size = {:,}".format(args.size))
+    print("number of trials = ", args.trials)
+    time_ak_df_display(args.size, args.trials)
+
+    sys.exit(0)