Skip to content

Commit

Permalink
Closes #2744 - Implement skew and hist_all (#2756)
Browse files Browse the repository at this point in the history
* init commit

* removed excess math imports

* fix flake8 errors

* remove extraneous print

* isnan issue

* adding ceil function to handle axes error

* add bias to skew method

* bleh flake8

* remove log arg

---------

Co-authored-by: Eddie <[email protected]>
Co-authored-by: pierce <[email protected]>
  • Loading branch information
3 people authored Sep 12, 2023
1 parent cdc2457 commit 5fa1e52
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 0 deletions.
46 changes: 46 additions & 0 deletions arkouda/pdarrayclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -3135,6 +3135,52 @@ def sqrt(pda: pdarray, where: Union[bool, pdarray] = True) -> pdarray:
return power(pda, 0.5, where)


@typechecked
def skew(pda: pdarray, bias: bool = True) -> np.float64:
    """
    Compute the sample skewness of an array.

    Skewness > 0 means there is greater weight in the right tail of the
    distribution.
    Skewness < 0 means there is greater weight in the left tail of the
    distribution.
    Skewness == 0 means the two tails balance each other (as they do for
    symmetric data); it does not by itself imply a normal distribution.

    Based on the `scipy.stats.skew` function.

    Parameters
    ----------
    pda : pdarray
        A pdarray of values that will be calculated to find the skew
    bias : bool, optional
        If False, then the calculations are corrected for statistical bias.
        The correction is only defined for more than two elements; for
        shorter arrays the uncorrected value is returned.

    Returns
    -------
    np.float64
        The skew of all elements in the array. Zero is returned when the
        standard deviation of the array is zero.

    Examples
    --------
    >>> a = ak.array([1, 1, 1, 5, 10])
    >>> ak.skew(a)
    0.9442193396379163
    """
    deviations = pda - pda.mean()
    cubed_deviations = deviations ** 3

    std_dev = pda.std()

    # Zero spread: the ratio below would be 0/0, so define the skew as 0.
    # Returned as np.float64 so the declared (and type-checked) return type
    # holds on this path too.
    if std_dev == 0:
        return np.float64(0)

    # Third central moment normalised by the cube of the standard deviation.
    skewness = cubed_deviations.mean() / (std_dev ** 3)

    if not bias:
        n = len(pda)
        # Adjusted Fisher-Pearson correction; its denominator (n - 2) is zero
        # or negative for n <= 2, so the correction is skipped there.
        if n > 2:
            correction = np.sqrt((n - 1) * n) / (n - 2)
            skewness = correction * skewness

    return np.float64(skewness)


# there's no need for typechecking, % can handle that
def mod(dividend, divisor) -> pdarray:
"""
Expand Down
99 changes: 99 additions & 0 deletions arkouda/plotting.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
import math
import numpy as np # type: ignore
from matplotlib import pyplot as plt # type: ignore
from arkouda.dataframe import DataFrame
from arkouda.timeclass import Datetime, Timedelta, date_range, timedelta_range
from arkouda.pdarrayclass import skew
from arkouda.pdarraycreation import arange
from arkouda.numeric import histogram, isnan
from arkouda.groupbyclass import GroupBy


def plot_dist(b, h, log=True, xlabel=None, newfig=True):
Expand Down Expand Up @@ -50,3 +57,95 @@ def plot_dist(b, h, log=True, xlabel=None, newfig=True):
plt.gca().set_title("cumulative distribution")
if xlabel is not None:
plt.gca().set_xlabel(xlabel, fontsize=14)


def _hist_clean_and_measure(values):
    """Drop NaNs from float64 data and return ``(values, count, skewness)``."""
    if values.dtype == "float64":
        values = values[~isnan(values)]
    n = len(values)
    # Nothing to measure in an empty column; the caller skips plotting it.
    g1 = skew(values) if n > 0 else 0.0
    return values, n, g1


def _hist_column_values(ak_df, col):
    """Return ``(values, count, skewness)`` for one column of ``ak_df``.

    If processing the raw column raises ``ValueError``, the column is
    re-encoded as integer labels (one label per unique value, via GroupBy)
    and those labels are measured instead.
    """
    try:
        return _hist_clean_and_measure(ak_df[col])
    except ValueError:
        grouped = GroupBy(ak_df[col])
        new_labels = arange(grouped.unique_keys.size)
        newcol = grouped.broadcast(new_labels)
        return _hist_clean_and_measure(newcol[: ak_df.size])


def _hist_doane_bins(n, g1):
    """Return the bin count for ``n`` values with skewness ``g1`` (Doane's formula)."""
    if n <= 2:
        # sigma_g1 is zero for n == 2 and undefined for n < 2, so the skew
        # term cannot be evaluated; fall back to the 1 + log2(n) part.
        return int(1 + math.log2(n))
    sigma_g1 = math.sqrt(6 * (n - 2) / ((n + 1) * (n + 3)))
    return int(1 + math.log2(n) + math.log2(1 + abs(g1) / sigma_g1))


def hist_all(ak_df: DataFrame, cols: list = []):
    """
    Create a grid plot histogramming all numeric columns in ak dataframe

    Parameters
    ----------
    ak_df : ak.DataFrame
        Full Arkouda DataFrame containing data to be visualized
    cols : list
        (Optional) A specified list of columns to be plotted. When empty,
        every column of ``ak_df`` is plotted. The default list is never
        mutated.

    Notes
    -----
    This function displays the plot.
    The number of bins per column is chosen with Doane's formula.
    Columns with no values left after NaN removal get an empty, titled axis.
    Nothing is drawn when there are no columns to plot.

    Examples
    --------
    >>> import arkouda as ak
    >>> from arkouda.plotting import hist_all
    >>> ak_df = ak.DataFrame({"a": ak.array(np.random.randn(100)),
    ...                       "b": ak.array(np.random.randn(100)),
    ...                       "c": ak.array(np.random.randn(100)),
    ...                       "d": ak.array(np.random.randn(100))
    ...                       })
    >>> hist_all(ak_df)
    """
    if len(cols) == 0:
        cols = ak_df.columns

    num_plots = len(cols)
    if num_plots == 0:
        # No columns at all: the grid computation below would divide by zero.
        return

    # Near-square grid with at least num_plots cells.
    num_rows = int(math.ceil(num_plots ** 0.5))
    num_cols = (num_plots + num_rows - 1) // num_rows
    # squeeze=False always yields a 2-D array of axes, so a single flatten()
    # covers every grid shape, including 1x1.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(10, 10), squeeze=False)
    fig.tight_layout(pad=2.0)
    axes = axes.flatten()

    # Pair each column with its own axis by position.
    for ax, col in zip(axes, cols):
        x, n, g1 = _hist_column_values(ak_df, col)

        if n == 0:
            ax.set_title(col, size=8)
            continue

        num_bins = _hist_doane_bins(n, g1)

        # Compute histogram counts in arkouda
        h = histogram(x, num_bins)

        lo = x.min()
        hi = x.max()

        # Compute bins in numpy
        if isinstance(x, Datetime):
            # Matplotlib has trouble plotting np.datetime64 and np.timedelta64
            bins = date_range(lo, hi, periods=num_bins).to_ndarray().astype("int")
        elif isinstance(x, Timedelta):
            bins = timedelta_range(lo, hi, periods=num_bins).to_ndarray().astype("int")
        else:
            # Left edges of num_bins equal-width bins spanning [lo, hi].
            bins = np.linspace(lo, hi, num_bins + 1)[:-1]

        # A single bin has no neighbour to measure the width against.
        width = bins[1] - bins[0] if len(bins) > 1 else 1

        ax.bar(bins, h[1].to_ndarray(), width=width)
        ax.set_title(col, size=8)
        if hi > 100 * lo:
            ax.set_yscale("log")

0 comments on commit 5fa1e52

Please sign in to comment.