Closes #2744 - Implement skew and hist_all (#2756)

* init commit
* removed excess math imports
* fix flake8 errors
* remove extraneous print
* isnan issue
* adding ceil function to handle axes error
* add bias to skew method
* bleh flake8
* remove log arg

---------

Co-authored-by: Eddie <[email protected]>
Co-authored-by: pierce <[email protected]>
Bears-R-Us · Sep 12, 2023 · 5fa1e52 · 5fa1e52
1 parent cdc2457
commit 5fa1e52
Show file tree

Hide file tree

Showing 2 changed files with 145 additions and 0 deletions.
diff --git a/arkouda/pdarrayclass.py b/arkouda/pdarrayclass.py
@@ -3135,6 +3135,52 @@ def sqrt(pda: pdarray, where: Union[bool, pdarray] = True) -> pdarray:
     return power(pda, 0.5, where)
 
 
+@typechecked
+def skew(pda: pdarray, bias: bool = True) -> np.float64:
+
+    """
+    Computes the sample skewness of an array.
+    Skewness > 0 means there's greater weight in the right tail of the distribution.
+    Skewness < 0 means there's greater weight in the left tail of the distribution.
+    Skewness == 0 means the data is normally distributed.
+    Based on the `scipy.stats.skew` function.
+
+    Parameters
+    ----------
+    pda : pdarray
+        A pdarray of values that will be calculated to find the skew
+    bias : bool, optional
+        If False, then the calculations are corrected for statistical bias.
+
+    Returns
+    -------
+        np.float64
+            The skew of all elements in the array
+
+    Examples:
+    >>> a = ak.array([1, 1, 1, 5, 10])
+    >>> ak.skew(a)
+    0.9442193396379163
+    """
+
+    deviations = pda - pda.mean()
+    cubed_deviations = deviations ** 3
+
+    std_dev = pda.std()
+
+    if std_dev != 0:
+        skewness = cubed_deviations.mean() / (std_dev ** 3)
+        # Apply bias correction using the Fisher-Pearson method
+        if not bias:
+            n = len(pda)
+            correction = np.sqrt((n-1)*n)/(n-2)
+            skewness = correction * skewness
+    else:
+        skewness = 0
+
+    return skewness
+
+
 # there's no need for typechecking, % can handle that
 def mod(dividend, divisor) -> pdarray:
     """

diff --git a/arkouda/plotting.py b/arkouda/plotting.py
@@ -1,5 +1,12 @@
+import math
 import numpy as np  # type: ignore
 from matplotlib import pyplot as plt  # type: ignore
+from arkouda.dataframe import DataFrame
+from arkouda.timeclass import Datetime, Timedelta, date_range, timedelta_range
+from arkouda.pdarrayclass import skew
+from arkouda.pdarraycreation import arange
+from arkouda.numeric import histogram, isnan
+from arkouda.groupbyclass import GroupBy
 
 
 def plot_dist(b, h, log=True, xlabel=None, newfig=True):
@@ -50,3 +57,95 @@ def plot_dist(b, h, log=True, xlabel=None, newfig=True):
     plt.gca().set_title("cumulative distribution")
     if xlabel is not None:
         plt.gca().set_xlabel(xlabel, fontsize=14)
+
+
+def hist_all(ak_df: DataFrame, cols: list = []):
+    """
+    Create a grid plot histogramming all numeric columns in ak dataframe
+
+    Parameters
+    ----------
+    ak_df : ak.DataFrame
+        Full Arkouda DataFrame containing data to be visualized
+    cols : list
+        (Optional) A specified list of columns to be plotted
+
+    Notes
+    -----
+    This function displays the plot.
+
+    Examples
+    --------
+    >>> import arkouda as ak
+    >>> from arkouda.plotting import hist_all
+    >>> ak_df = ak.DataFrame({"a": ak.array(np.random.randn(100)),
+                              "b": ak.array(np.random.randn(100)),
+                              "c": ak.array(np.random.randn(100)),
+                              "d": ak.array(np.random.randn(100))
+                              })
+    >>> hist_all(ak_df)
+    """
+
+    if len(cols) == 0:
+        cols = ak_df.columns
+
+    num_rows = int(math.ceil(len(cols) ** 0.5))
+    num_cols = (len(cols) + num_rows - 1) // num_rows
+    fig, axes = plt.subplots(num_rows, num_cols, figsize=(10, 10))
+    fig.tight_layout(pad=2.0)
+
+    if num_rows > 1:
+        axes = axes.flatten()
+    else:
+        axes = [axes]
+
+    for col in cols:
+        try:
+            ax = axes[cols.index(col)]
+            x = ak_df[col]
+
+            if x.dtype == "float64":
+                x = x[~isnan(x)]
+
+            n = len(x)
+            g1 = skew(x)
+
+        except ValueError:
+            GB_df = GroupBy(ak_df[col])
+            new_labels = arange(GB_df.unique_keys.size)
+            newcol = GB_df.broadcast(new_labels)
+            x = newcol[:ak_df.size]
+
+            if x.dtype == "float64":
+                x = x[~isnan(x)]
+
+            n = len(x)
+            g1 = skew(x)
+
+        sigma_g1 = math.sqrt(6 * (n - 2) / ((n + 1) * (n + 3)))
+        # Doane's Formula
+        num_bins = int(1 + math.log2(n) + math.log2(1 + abs(g1) / sigma_g1))
+
+        # Compute histogram counts in arkouda
+        h = histogram(x, num_bins)
+        # Compute bins in numpy
+        if isinstance(x, Datetime):
+            # Matplotlib has trouble plotting np.datetime64 and np.timedelta64
+            bins = (
+                date_range(x.min(), x.max(), periods=num_bins)
+                .to_ndarray()
+                .astype("int")
+            )
+        elif isinstance(x, Timedelta):
+            bins = (
+                timedelta_range(x.min(), x.max(), periods=num_bins)
+                .to_ndarray()
+                .astype("int")
+            )
+        else:
+            bins = np.linspace(x.min(), x.max(), num_bins + 1)[:-1]
+
+        ax.bar(bins, h[1].to_ndarray(), width=bins[1] - bins[0])
+        ax.set_title(col, size=8)
+        if x.max() > 100 * x.min():
+            ax.set_yscale("log")