Skip to content

Commit

Permalink
Closes #1363: ak.Series with only values and more robust typechecking (
Browse files Browse the repository at this point in the history
…#1441)

This PR (closes #1363):

- Removes the `ar_tuple` keyword parameter from the `__init__` method; a 2-tuple or list of `(index, values)` passed as the `data` argument is still accepted.
- Adjusts the tests to use the two remaining parameters, `data` and `index`, when instantiating.

Co-authored-by: Jim Eichert <[email protected]>
  • Loading branch information
jeichert60 and jeichert60 authored May 31, 2022
1 parent 6bf6b84 commit c0786e6
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 58 deletions.
2 changes: 1 addition & 1 deletion arkouda/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def __getattr__(self, key):
if key not in self.columns:
raise AttributeError(f'Attribute {key} not found')
# Should this be cached?
return Series(data=self[key], index=self.index)
return Series(data=self[key], index=self.index.index)

def __dir__(self):
return dir(DataFrame) + self.columns
Expand Down
88 changes: 60 additions & 28 deletions arkouda/series.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from typeguard import typechecked
from typing import List, Optional, Tuple, Union, Iterable
from arkouda.pdarrayclass import pdarray, argmaxk, attach_pdarray
from arkouda.pdarraycreation import arange, array, zeros
from arkouda.pdarraysetops import argsort, concatenate, in1d
from arkouda.index import Index
from arkouda.groupbyclass import GroupBy
from arkouda.groupbyclass import GroupBy, groupable_element_type
from arkouda.dtypes import int64, float64
from arkouda.numeric import value_counts, cast as akcast
from arkouda.util import get_callback
Expand Down Expand Up @@ -75,28 +77,55 @@ class Series:
Parameters
----------
ar_tuple : 2-tuple of arkouda arrays with the first being the grouping key(s) and the
second being the value. The grouping key(s) will be the index of the series.
index : pdarray, Strings
an array of indices associated with the data array.
If not provided, it will default to a range of ints whose size matches the size of the data.
optional
data : Tuple, List, groupable_element_type
a 1D array, or a 2-element tuple/list of (index, values). Must not be None.
Raises
------
TypeError
Raised if index is not a pdarray or Strings object
Raised if data is not a pdarray, Strings, or Categorical object
ValueError
Raised if the index size does not match data size
Notes
-----
The Series class accepts either positional arguments or keyword arguments.
If entering positional arguments,
2 arguments entered:
argument 1 - data
argument 2 - index
1 argument entered:
argument 1 - data
If entering 1 positional argument, it is assumed that this is the data argument.
If entering keywords,
'data' (see Parameters)
'index' (optional) must match size of 'data'
"""

def __init__(self, ar_tuple=None, data=None, index=None):
if ar_tuple is not None:
self.index = Index.factory(ar_tuple[0])
self.values = ar_tuple[1]
elif data is None:
raise TypeError("ar_tuple and data cannot both be null")

@typechecked
def __init__(self, data: Union[Tuple, List, groupable_element_type],
index: Optional[Union[pdarray, Strings]] = None):
# TODO: Allow index to be an Index when index.py is updated
if isinstance(data, (tuple, list)) and len(data) == 2:
# handles the previous `ar_tuple` case
if not isinstance(data[0], (pdarray, Strings)):
raise TypeError("indices must be a pdarray or Strings")
if not isinstance(data[1], (pdarray, Strings, Categorical)):
raise TypeError("values must be a pdarray, Strings, or Categorical")
self.values = data[1]
self.index = Index.factory(data[0])
else:
if not isinstance(data, (pdarray, Strings, Categorical)):
data = array(data)
self.values = data
# When only 1 positional argument it will be treated as data and not index
self.values = array(data) if not isinstance(data, (Strings, Categorical)) else data
self.index = Index.factory(index) if index is not None else Index(arange(self.values.size))

if index is None:
index = arange(data.size)
self.index = Index.factory(index)
if self.index.size != self.values.size:
raise ValueError("Index and data must have same length")
raise ValueError("Index size does not match data size")
self.size = self.index.size

def __len__(self):
Expand Down Expand Up @@ -196,7 +225,7 @@ def locate(self, key):
else:
# scalar value
idx = self.index == key
return Series((self.index[idx], self.values[idx]))
return Series(index=self.index.index[idx], data=self.values[idx])

@classmethod
def _make_binop(cls, operator):
Expand Down Expand Up @@ -233,7 +262,9 @@ def add(self, b):
index = self.index.concat(b.index).index

values = concatenate([self.values, b.values], ordered=False)
return Series(GroupBy(index).sum(values))

idx, vals = GroupBy(index).sum(values)
return Series(data=vals, index=idx)

def topn(self, n=10):
""" Return the top values of the series
Expand All @@ -252,7 +283,7 @@ def topn(self, n=10):
idx = argmaxk(v, n)
idx = idx[-1:-n - 1:-1]

return Series((k[idx], v[idx]))
return Series(index=k.index[idx], data=v[idx])

def sort_index(self, ascending=True):
""" Sort the series by its index
Expand All @@ -263,7 +294,7 @@ def sort_index(self, ascending=True):
"""

idx = self.index.argsort(ascending=ascending)
return Series((self.index[idx], self.values[idx]))
return Series(index=self.index.index[idx], data=self.values[idx])

def sort_values(self, ascending=True):
""" Sort the series numerically
Expand All @@ -282,19 +313,19 @@ def sort_values(self, ascending=True):
idx = argsort(self.values)[arange(self.values.size - 1, -1, -1)]
else:
idx = argsort(self.values)
return Series((self.index[idx], self.values[idx]))
return Series(index=self.index.index[idx], data=self.values[idx])

def tail(self, n=10):
"""Return the last n values of the series"""

idx_series = (self.index[-n:])
return Series((idx_series, self.values[-n:]))
return Series(index=idx_series.index, data=self.values[-n:])

def head(self, n=10):
"""Return the first n values of the series"""

idx_series = (self.index[0:n])
return Series((idx_series, self.values[0:n]))
return Series(index=idx_series.index, data=self.values[0:n])

def to_pandas(self):
"""Convert the series to a local PANDAS series"""
Expand All @@ -316,7 +347,8 @@ def value_counts(self, sort=True):
"""

dtype = get_callback(self.values)
s = Series(value_counts(self.values))
idx, vals = value_counts(self.values)
s = Series(index=idx, data=vals)
if sort:
s = s.sort_values(ascending=False)
s.index.set_dtype(dtype)
Expand Down Expand Up @@ -477,7 +509,7 @@ def concat(arrays, axis=0, index_labels=None, value_labels=None):
for other in arrays[1:]:
idx = idx.concat(other.index)
v = concatenate([v, other.values], ordered=True)
retval = Series((idx, v))
retval = Series(index=idx.index, data=v)

return retval

Expand Down Expand Up @@ -513,7 +545,7 @@ def pdconcat(arrays, axis=0, labels=None):

cols = []
for col in arrays:
cols.append(pd.Series(col.values.to_ndarray(), index=idx))
cols.append(pd.Series(data=col.values.to_ndarray(), index=idx))
retval = pd.concat(cols, axis=1)
if labels is not None:
retval.columns = labels
Expand Down
2 changes: 1 addition & 1 deletion tests/registration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ def test_attach_weak_binding(self):

def test_series_register_attach(self):
ar_tuple = (ak.arange(5), ak.arange(5))
s = ak.Series(ar_tuple=ar_tuple)
s = ak.Series(ar_tuple)

# At this time, there is no unregister() in Series. Register one piece to check partial registration
s.values.register("seriesTest_values")
Expand Down
100 changes: 72 additions & 28 deletions tests/series_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,52 @@
class SeriesTest(ArkoudaTest):

def test_series_creation(self):
ar_tuple = (ak.arange(3), ak.array(['A', 'B', 'C']))
s = ak.Series(ar_tuple=ar_tuple)
# Use positional arguments
ar_tuple = ak.arange(3), ak.arange(3)
s = ak.Series(ar_tuple)
self.assertIsInstance(s, ak.Series)

ar_tuple = ak.array(['A', 'B', 'C']), ak.arange(3)
s = ak.Series(ar_tuple)
self.assertIsInstance(s, ak.Series)

# Both data and index are supplied
v = ak.array(['A', 'B', 'C'])
i = ak.arange(3)
s = ak.Series(data=v, index=i)

self.assertIsInstance(s, ak.Series)
self.assertIsInstance(s.index, ak.Index)

# Just data is supplied
s = ak.Series(data=v)
self.assertIsInstance(s, ak.Series)
self.assertIsInstance(s.index, ak.Index)

# Just index is supplied (keyword argument)
with self.assertRaises(TypeError):
s = ak.Series(index=i)

# Just data is supplied (positional argument)
s = ak.Series(ak.array(['A', 'B', 'C']))
self.assertIsInstance(s, ak.Series)

# Just index is supplied (ar_tuple argument)
ar_tuple = (ak.arange(3),)
with self.assertRaises(TypeError):
s = ak.Series(ar_tuple)

# No arguments are supplied
with self.assertRaises(TypeError):
s = ak.Series()

with self.assertRaises(ValueError):
s = ak.Series(data=ak.arange(3), index=ak.arange(6))

def test_lookup(self):
ar_tuple = (ak.arange(3), ak.array(['A', 'B', 'C']))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.array(['A', 'B', 'C'])
i = ak.arange(3)
s = ak.Series(data=v, index=i)

l = s.locate(1)
self.assertIsInstance(l, ak.Series)
Expand All @@ -36,8 +67,9 @@ def test_lookup(self):
self.assertEqual(l.values[1], 'C')

def test_shape(self):
ar_tuple = (ak.arange(3), ak.array(['A', 'B', 'C']))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.array(['A', 'B', 'C'])
i = ak.arange(3)
s = ak.Series(data=v, index=i)

l, = s.shape
self.assertEqual(l, 3)
Expand All @@ -46,9 +78,11 @@ def test_add(self):
ar_tuple = (ak.arange(3), ak.arange(3))
ar_tuple_add = (ak.arange(3, 6, 1), ak.arange(3, 6, 1))

s = ak.Series(ar_tuple=ar_tuple)
i = ak.arange(3)
v = ak.arange(3, 6, 1)
s = ak.Series(data=i, index=i)

s_add = ak.Series(ar_tuple=ar_tuple_add)
s_add = ak.Series(data=v, index=v)

added = s.add(s_add)

Expand All @@ -59,32 +93,36 @@ def test_add(self):
self.assertIn(i, val_list)

def test_topn(self):
ar_tuple = (ak.arange(3), ak.arange(3))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.arange(3)
i = ak.arange(3)
s = ak.Series(data=v, index=i)

top = s.topn(2)
self.assertListEqual(top.index.to_pandas().tolist(), [2, 1])
self.assertListEqual(top.values.to_ndarray().tolist(), [2, 1])

def test_sort_idx(self):
ar_tuple = (ak.array([3, 1, 4, 0, 2]), ak.arange(5))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.arange(5)
i = ak.array([3, 1, 4, 0, 2])
s = ak.Series(data=v, index=i)

sorted = s.sort_index()
self.assertListEqual(sorted.index.to_pandas().tolist(), [i for i in range(5)])
self.assertListEqual(sorted.values.to_ndarray().tolist(), [3, 1, 4, 0, 2])

def test_sort_value(self):
ar_tuple = (ak.arange(5), ak.array([3, 1, 4, 0, 2]))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.array([3, 1, 4, 0, 2])
i = ak.arange(5)
s = ak.Series(data=v, index=i)

sorted = s.sort_values()
self.assertListEqual(sorted.index.to_pandas().tolist(), [3, 1, 4, 0, 2])
self.assertListEqual(sorted.values.to_ndarray().tolist(), [i for i in range(5)])

def test_head_tail(self):
ar_tuple = (ak.arange(5), ak.arange(5))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.arange(5)
i = ak.arange(5)
s = ak.Series(data=v, index=i)

head = s.head(2)
self.assertListEqual(head.index.to_pandas().tolist(), [0, 1])
Expand All @@ -95,8 +133,9 @@ def test_head_tail(self):
self.assertListEqual(tail.values.to_ndarray().tolist(), [2, 3, 4])

def test_value_counts(self):
ar_tuple = (ak.arange(5), ak.array([0, 0, 1, 2, 2]))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.array([0, 0, 1, 2, 2])
i = ak.arange(5)
s = ak.Series(data=v, index=i)

c = s.value_counts()
self.assertListEqual(c.index.to_pandas().tolist(), [0, 2, 1])
Expand All @@ -107,11 +146,13 @@ def test_value_counts(self):
self.assertListEqual(c.values.to_ndarray().tolist(), [2, 2, 1])

def test_concat(self):
ar_tuple = (ak.arange(5), ak.arange(5))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.arange(5)
i = ak.arange(5)
s = ak.Series(data=v, index=i)

ar_tuple_2 = (ak.arange(5, 11, 1), ak.arange(5, 11, 1))
s2 = ak.Series(ar_tuple_2)
v = ak.arange(5, 11, 1)
i = ak.arange(5, 11, 1)
s2 = ak.Series(data=v, index=i)

c = ak.Series.concat([s, s2])
self.assertListEqual(c.index.to_pandas().tolist(), [i for i in range(11)])
Expand All @@ -125,19 +166,22 @@ def test_concat(self):
self.assertTrue(((ref_df == df.to_pandas()).all()).all())

def test_pdconcat(self):
ar_tuple = (ak.arange(5), ak.arange(5))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.arange(5)
i = ak.arange(5)
s = ak.Series(data=v, index=i)

ar_tuple_2 = (ak.arange(5, 11, 1), ak.arange(5, 11, 1))
s2 = ak.Series(ar_tuple_2)
v = ak.arange(5, 11, 1)
i = ak.arange(5, 11, 1)
s2 = ak.Series(data=v, index=i)

c = ak.Series.pdconcat([s, s2])
self.assertIsInstance(c, pd.Series)
self.assertListEqual(c.index.tolist(), [i for i in range(11)])
self.assertListEqual(c.values.tolist(), [i for i in range(11)])

ar_tuple_2 = (ak.arange(5, 10, 1), ak.arange(5, 10, 1))
s2 = ak.Series(ar_tuple_2)
v = ak.arange(5, 10, 1)
i = ak.arange(5, 10, 1)
s2 = ak.Series(data=v, index=i)

df = ak.Series.pdconcat([s, s2], axis=1)
self.assertIsInstance(df, pd.DataFrame)
Expand Down

0 comments on commit c0786e6

Please sign in to comment.