From 2d14d7fa2223c7104fc72ed554412091f251155a Mon Sep 17 00:00:00 2001 From: Jim Eichert Date: Mon, 23 May 2022 07:33:45 -0400 Subject: [PATCH] Closes #1363: ak.Series with only values and more robust typechecking This PR (closes #1363): - Deprecates the `ar_tuple` parameter inside of the `__init__` method. - Adjusts the tests to use the other 2 parameters `(index, data)` when instantiating. --- arkouda/series.py | 54 ++++++++++++++++++--------- tests/series_test.py | 87 ++++++++++++++++++++++++++++++-------------- 2 files changed, 95 insertions(+), 46 deletions(-) diff --git a/arkouda/series.py b/arkouda/series.py index 7dfda1bbba..d9be2b1805 100644 --- a/arkouda/series.py +++ b/arkouda/series.py @@ -1,8 +1,10 @@ +from typeguard import typechecked +from typing import Union from arkouda.pdarrayclass import pdarray, argmaxk, attach_pdarray from arkouda.pdarraycreation import arange, array, zeros from arkouda.pdarraysetops import argsort, concatenate, in1d from arkouda.index import Index -from arkouda.groupbyclass import GroupBy +from arkouda.groupbyclass import GroupBy, groupable_element_type from arkouda.dtypes import int64, float64 from arkouda.numeric import value_counts, cast as akcast from arkouda.util import get_callback @@ -17,6 +19,8 @@ import numpy as np # type: ignore from warnings import warn +import warnings + __all__ = [ "Series", ] @@ -75,15 +79,25 @@ class Series: Parameters ---------- - ar_tuple : 2-tuple of arkouda arrays with the first being the grouping key(s) and the - second being the value. The grouping key(s) will be the index of the series. + ar_tuple : tuple, list. + 2-tuple of arkouda arrays with the first being the grouping key(s) and the + second being the value. The grouping key(s) will be the index of the series. + This will be deprecated in future releases. + index : pdarray + an array of indexes associated with the data array. + If empty, it will default to a range of ints whose size match the size of the data. 
+ data : pdarray, Strings, Categorical + a 1D array. Must not be empty if ar_tuple is not provided. """ - def __init__(self, ar_tuple=None, data=None, index=None): + @typechecked + def __init__(self, ar_tuple: Union[tuple, list] = None, index: pdarray = None, + data: groupable_element_type = None): if ar_tuple is not None: self.index = Index.factory(ar_tuple[0]) self.values = ar_tuple[1] + warnings.warn('ar_tuple will be deprecated in future releases', DeprecationWarning) elif data is None: raise TypeError("ar_tuple and data cannot both be null") @@ -91,12 +105,10 @@ def __init__(self, ar_tuple=None, data=None, index=None): if not isinstance(data, (pdarray, Strings, Categorical)): data = array(data) self.values = data + self.index = Index.factory(index) if index is not None else Index(arange(self.values.size)) - if index is None: - index = arange(data.size) - self.index = Index.factory(index) if self.index.size != self.values.size: - raise ValueError("Index and data must have same length") + raise ValueError("Index size does not match data size") self.size = self.index.size def __len__(self): @@ -196,7 +208,7 @@ def locate(self, key): else: # scalar value idx = self.index == key - return Series((self.index[idx], self.values[idx])) + return Series(index=self.index.index[idx], data=self.values[idx]) @classmethod def _make_binop(cls, operator): @@ -233,7 +245,12 @@ def add(self, b): index = self.index.concat(b.index).index values = concatenate([self.values, b.values], ordered=False) - return Series(GroupBy(index).sum(values)) + + vals = GroupBy(index).sum(values) + idx = vals[0] + val = vals[1] + return Series(data=val, index=idx) + # return Series(GroupBy(index).sum(values)) def topn(self, n=10): """ Return the top values of the series @@ -252,7 +269,7 @@ def topn(self, n=10): idx = argmaxk(v, n) idx = idx[-1:-n - 1:-1] - return Series((k[idx], v[idx])) + return Series(index=k.index[idx], data=v[idx]) def sort_index(self, ascending=True): """ Sort the series by its 
index @@ -263,7 +280,7 @@ def sort_index(self, ascending=True): """ idx = self.index.argsort(ascending=ascending) - return Series((self.index[idx], self.values[idx])) + return Series(index=self.index.index[idx], data=self.values[idx]) def sort_values(self, ascending=True): """ Sort the series numerically @@ -282,19 +299,19 @@ def sort_values(self, ascending=True): idx = argsort(self.values)[arange(self.values.size - 1, -1, -1)] else: idx = argsort(self.values) - return Series((self.index[idx], self.values[idx])) + return Series(index=self.index.index[idx], data=self.values[idx]) def tail(self, n=10): """Return the last n values of the series""" idx_series = (self.index[-n:]) - return Series((idx_series, self.values[-n:])) + return Series(index=idx_series.index, data=self.values[-n:]) def head(self, n=10): """Return the first n values of the series""" idx_series = (self.index[0:n]) - return Series((idx_series, self.values[0:n])) + return Series(index=idx_series.index, data=self.values[0:n]) def to_pandas(self): """Convert the series to a local PANDAS series""" @@ -316,7 +333,8 @@ def value_counts(self, sort=True): """ dtype = get_callback(self.values) - s = Series(value_counts(self.values)) + vc = value_counts(self.values) + s = Series(index=vc[0], data=vc[1]) if sort: s = s.sort_values(ascending=False) s.index.set_dtype(dtype) @@ -477,7 +495,7 @@ def concat(arrays, axis=0, index_labels=None, value_labels=None): for other in arrays[1:]: idx = idx.concat(other.index) v = concatenate([v, other.values], ordered=True) - retval = Series((idx, v)) + retval = Series(index=idx.index, data=v) return retval @@ -513,7 +531,7 @@ def pdconcat(arrays, axis=0, labels=None): cols = [] for col in arrays: - cols.append(pd.Series(col.values.to_ndarray(), index=idx)) + cols.append(pd.Series(data=col.values.to_ndarray(), index=idx)) retval = pd.concat(cols, axis=1) if labels is not None: retval.columns = labels diff --git a/tests/series_test.py b/tests/series_test.py index
f3b468869f..e51699be6a 100644 --- a/tests/series_test.py +++ b/tests/series_test.py @@ -7,21 +7,39 @@ class SeriesTest(ArkoudaTest): def test_series_creation(self): - ar_tuple = (ak.arange(3), ak.array(['A', 'B', 'C'])) - s = ak.Series(ar_tuple=ar_tuple) + # Both data and index are supplied + v = ak.array(['A', 'B', 'C']) + i = ak.arange(3) + s = ak.Series(data=v, index=i) + + self.assertIsInstance(s, ak.Series) + self.assertIsInstance(s.index, ak.Index) + + # Just data is supplied + s = ak.Series(data=v) self.assertIsInstance(s, ak.Series) self.assertIsInstance(s.index, ak.Index) + # Just index is supplied + with self.assertRaises(TypeError): + s = ak.Series(index=i) + + # No arguments are supplied with self.assertRaises(TypeError): s = ak.Series() with self.assertRaises(ValueError): s = ak.Series(data=ak.arange(3), index=ak.arange(6)) - def test_lookup(self): ar_tuple = (ak.arange(3), ak.array(['A', 'B', 'C'])) - s = ak.Series(ar_tuple=ar_tuple) + with self.assertWarns(DeprecationWarning): + s = ak.Series(ar_tuple=ar_tuple) + + def test_lookup(self): + v = ak.array(['A', 'B', 'C']) + i = ak.arange(3) + s = ak.Series(data=v, index=i) l = s.locate(1) self.assertIsInstance(l, ak.Series) @@ -36,8 +54,9 @@ def test_lookup(self): self.assertEqual(l.values[1], 'C') def test_shape(self): - ar_tuple = (ak.arange(3), ak.array(['A', 'B', 'C'])) - s = ak.Series(ar_tuple=ar_tuple) + v = ak.array(['A', 'B', 'C']) + i = ak.arange(3) + s = ak.Series(data=v, index=i) l, = s.shape self.assertEqual(l, 3) @@ -46,9 +65,11 @@ def test_add(self): ar_tuple = (ak.arange(3), ak.arange(3)) ar_tuple_add = (ak.arange(3, 6, 1), ak.arange(3, 6, 1)) - s = ak.Series(ar_tuple=ar_tuple) + i = ak.arange(3) + v = ak.arange(3, 6, 1) + s = ak.Series(data=i, index=i) - s_add = ak.Series(ar_tuple=ar_tuple_add) + s_add = ak.Series(data=v, index=v) added = s.add(s_add) @@ -59,32 +80,36 @@ def test_add(self): self.assertIn(i, val_list) def test_topn(self): - ar_tuple = (ak.arange(3), ak.arange(3)) - s = 
ak.Series(ar_tuple=ar_tuple) + v = ak.arange(3) + i = ak.arange(3) + s = ak.Series(data=v, index=i) top = s.topn(2) self.assertListEqual(top.index.to_pandas().tolist(), [2, 1]) self.assertListEqual(top.values.to_ndarray().tolist(), [2, 1]) def test_sort_idx(self): - ar_tuple = (ak.array([3, 1, 4, 0, 2]), ak.arange(5)) - s = ak.Series(ar_tuple=ar_tuple) + v = ak.arange(5) + i = ak.array([3, 1, 4, 0, 2]) + s = ak.Series(data=v, index=i) sorted = s.sort_index() self.assertListEqual(sorted.index.to_pandas().tolist(), [i for i in range(5)]) self.assertListEqual(sorted.values.to_ndarray().tolist(), [3, 1, 4, 0, 2]) def test_sort_value(self): - ar_tuple = (ak.arange(5), ak.array([3, 1, 4, 0, 2])) - s = ak.Series(ar_tuple=ar_tuple) + v = ak.array([3, 1, 4, 0, 2]) + i = ak.arange(5) + s = ak.Series(data=v, index=i) sorted = s.sort_values() self.assertListEqual(sorted.index.to_pandas().tolist(), [3, 1, 4, 0, 2]) self.assertListEqual(sorted.values.to_ndarray().tolist(), [i for i in range(5)]) def test_head_tail(self): - ar_tuple = (ak.arange(5), ak.arange(5)) - s = ak.Series(ar_tuple=ar_tuple) + v = ak.arange(5) + i = ak.arange(5) + s = ak.Series(data=v, index=i) head = s.head(2) self.assertListEqual(head.index.to_pandas().tolist(), [0, 1]) @@ -95,8 +120,9 @@ def test_head_tail(self): self.assertListEqual(tail.values.to_ndarray().tolist(), [2, 3, 4]) def test_value_counts(self): - ar_tuple = (ak.arange(5), ak.array([0, 0, 1, 2, 2])) - s = ak.Series(ar_tuple=ar_tuple) + v = ak.array([0, 0, 1, 2, 2]) + i = ak.arange(5) + s = ak.Series(data=v, index=i) c = s.value_counts() self.assertListEqual(c.index.to_pandas().tolist(), [0, 2, 1]) @@ -107,11 +133,13 @@ def test_value_counts(self): self.assertListEqual(c.values.to_ndarray().tolist(), [2, 2, 1]) def test_concat(self): - ar_tuple = (ak.arange(5), ak.arange(5)) - s = ak.Series(ar_tuple=ar_tuple) + v = ak.arange(5) + i = ak.arange(5) + s = ak.Series(data=v, index=i) - ar_tuple_2 = (ak.arange(5, 11, 1), ak.arange(5, 11, 1)) - s2 = 
ak.Series(ar_tuple_2) + v = ak.arange(5, 11, 1) + i = ak.arange(5, 11, 1) + s2 = ak.Series(data=v, index=i) c = ak.Series.concat([s, s2]) self.assertListEqual(c.index.to_pandas().tolist(), [i for i in range(11)]) @@ -125,19 +153,22 @@ def test_concat(self): self.assertTrue(((ref_df == df.to_pandas()).all()).all()) def test_pdconcat(self): - ar_tuple = (ak.arange(5), ak.arange(5)) - s = ak.Series(ar_tuple=ar_tuple) + v = ak.arange(5) + i = ak.arange(5) + s = ak.Series(data=v, index=i) - ar_tuple_2 = (ak.arange(5, 11, 1), ak.arange(5, 11, 1)) - s2 = ak.Series(ar_tuple_2) + v = ak.arange(5, 11, 1) + i = ak.arange(5, 11, 1) + s2 = ak.Series(data=v, index=i) c = ak.Series.pdconcat([s, s2]) self.assertIsInstance(c, pd.Series) self.assertListEqual(c.index.tolist(), [i for i in range(11)]) self.assertListEqual(c.values.tolist(), [i for i in range(11)]) - ar_tuple_2 = (ak.arange(5, 10, 1), ak.arange(5, 10, 1)) - s2 = ak.Series(ar_tuple_2) + v = ak.arange(5, 10, 1) + i = ak.arange(5, 10, 1) + s2 = ak.Series(data=v, index=i) df = ak.Series.pdconcat([s, s2], axis=1) self.assertIsInstance(df, pd.DataFrame)