Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closes #1363: ak.Series with only values and more robust typechecking #1441

Merged
merged 1 commit into from
May 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion arkouda/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def __getattr__(self, key):
if key not in self.columns:
raise AttributeError(f'Attribute {key} not found')
# Should this be cached?
return Series(data=self[key], index=self.index)
return Series(data=self[key], index=self.index.index)

def __dir__(self):
return dir(DataFrame) + self.columns
Expand Down
88 changes: 60 additions & 28 deletions arkouda/series.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from typeguard import typechecked
from typing import List, Optional, Tuple, Union, Iterable
from arkouda.pdarrayclass import pdarray, argmaxk, attach_pdarray
from arkouda.pdarraycreation import arange, array, zeros
from arkouda.pdarraysetops import argsort, concatenate, in1d
from arkouda.index import Index
from arkouda.groupbyclass import GroupBy
from arkouda.groupbyclass import GroupBy, groupable_element_type
from arkouda.dtypes import int64, float64
from arkouda.numeric import value_counts, cast as akcast
from arkouda.util import get_callback
Expand Down Expand Up @@ -75,28 +77,55 @@ class Series:

Parameters
----------
ar_tuple : 2-tuple of arkouda arrays with the first being the grouping key(s) and the
second being the value. The grouping key(s) will be the index of the series.

index : pdarray, Strings
an array of indices associated with the data array.
If empty, it will default to a range of ints whose size matches the size of the data.
optional
data : Tuple, List, groupable_element_type
a 1D array. Must not be None. A 2-element tuple or list is interpreted as (index, values).
stress-tess marked this conversation as resolved.
Show resolved Hide resolved

Raises
------
TypeError
Raised if index is not a pdarray or Strings object
Raised if data is not a pdarray, Strings, or Categorical object
ValueError
Raised if the index size does not match data size

Notes
-----
The Series class accepts either positional arguments or keyword arguments.
If entering positional arguments,
2 arguments entered:
argument 1 - data
argument 2 - index
stress-tess marked this conversation as resolved.
Show resolved Hide resolved
1 argument entered:
argument 1 - data
If entering 1 positional argument, it is assumed that this is the data argument.
If entering keywords,
'data' (see Parameters)
'index' (optional) must match size of 'data'
"""

def __init__(self, ar_tuple=None, data=None, index=None):
if ar_tuple is not None:
self.index = Index.factory(ar_tuple[0])
self.values = ar_tuple[1]
elif data is None:
raise TypeError("ar_tuple and data cannot both be null")

@typechecked
def __init__(self, data: Union[Tuple, List, groupable_element_type],
index: Optional[Union[pdarray, Strings]] = None):
# TODO: Allow index to be an Index when index.py is updated
if isinstance(data, (tuple, list)) and len(data) == 2:
# handles the previous `ar_tuple` case
if not isinstance(data[0], (pdarray, Strings)):
raise TypeError("indices must be a pdarray or Strings")
if not isinstance(data[1], (pdarray, Strings, Categorical)):
raise TypeError("values must be a pdarray, Strings, or Categorical")
self.values = data[1]
self.index = Index.factory(data[0])
else:
if not isinstance(data, (pdarray, Strings, Categorical)):
data = array(data)
self.values = data
# When only 1 positional argument it will be treated as data and not index
self.values = array(data) if not isinstance(data, (Strings, Categorical)) else data
self.index = Index.factory(index) if index is not None else Index(arange(self.values.size))

if index is None:
index = arange(data.size)
self.index = Index.factory(index)
if self.index.size != self.values.size:
raise ValueError("Index and data must have same length")
raise ValueError("Index size does not match data size")
self.size = self.index.size

def __len__(self):
Expand Down Expand Up @@ -196,7 +225,7 @@ def locate(self, key):
else:
# scalar value
idx = self.index == key
return Series((self.index[idx], self.values[idx]))
return Series(index=self.index.index[idx], data=self.values[idx])

@classmethod
def _make_binop(cls, operator):
Expand Down Expand Up @@ -233,7 +262,9 @@ def add(self, b):
index = self.index.concat(b.index).index

values = concatenate([self.values, b.values], ordered=False)
return Series(GroupBy(index).sum(values))

idx, vals = GroupBy(index).sum(values)
return Series(data=vals, index=idx)

def topn(self, n=10):
""" Return the top values of the series
Expand All @@ -252,7 +283,7 @@ def topn(self, n=10):
idx = argmaxk(v, n)
idx = idx[-1:-n - 1:-1]

return Series((k[idx], v[idx]))
return Series(index=k.index[idx], data=v[idx])

def sort_index(self, ascending=True):
""" Sort the series by its index
Expand All @@ -263,7 +294,7 @@ def sort_index(self, ascending=True):
"""

idx = self.index.argsort(ascending=ascending)
return Series((self.index[idx], self.values[idx]))
return Series(index=self.index.index[idx], data=self.values[idx])

def sort_values(self, ascending=True):
""" Sort the series numerically
Expand All @@ -282,19 +313,19 @@ def sort_values(self, ascending=True):
idx = argsort(self.values)[arange(self.values.size - 1, -1, -1)]
else:
idx = argsort(self.values)
return Series((self.index[idx], self.values[idx]))
return Series(index=self.index.index[idx], data=self.values[idx])

def tail(self, n=10):
"""Return the last n values of the series"""

idx_series = (self.index[-n:])
return Series((idx_series, self.values[-n:]))
return Series(index=idx_series.index, data=self.values[-n:])

def head(self, n=10):
"""Return the first n values of the series"""

idx_series = (self.index[0:n])
return Series((idx_series, self.values[0:n]))
return Series(index=idx_series.index, data=self.values[0:n])

def to_pandas(self):
"""Convert the series to a local PANDAS series"""
Expand All @@ -316,7 +347,8 @@ def value_counts(self, sort=True):
"""

dtype = get_callback(self.values)
s = Series(value_counts(self.values))
idx, vals = value_counts(self.values)
s = Series(index=idx, data=vals)
if sort:
s = s.sort_values(ascending=False)
s.index.set_dtype(dtype)
Expand Down Expand Up @@ -477,7 +509,7 @@ def concat(arrays, axis=0, index_labels=None, value_labels=None):
for other in arrays[1:]:
idx = idx.concat(other.index)
v = concatenate([v, other.values], ordered=True)
retval = Series((idx, v))
retval = Series(index=idx.index, data=v)

return retval

Expand Down Expand Up @@ -513,7 +545,7 @@ def pdconcat(arrays, axis=0, labels=None):

cols = []
for col in arrays:
cols.append(pd.Series(col.values.to_ndarray(), index=idx))
cols.append(pd.Series(data=col.values.to_ndarray(), index=idx))
retval = pd.concat(cols, axis=1)
if labels is not None:
retval.columns = labels
Expand Down
2 changes: 1 addition & 1 deletion tests/registration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ def test_attach_weak_binding(self):

def test_series_register_attach(self):
ar_tuple = (ak.arange(5), ak.arange(5))
s = ak.Series(ar_tuple=ar_tuple)
s = ak.Series(ar_tuple)

# At this time, there is no unregister() in Series. Register one piece to check partial registration
s.values.register("seriesTest_values")
Expand Down
100 changes: 72 additions & 28 deletions tests/series_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,52 @@
class SeriesTest(ArkoudaTest):

def test_series_creation(self):
ar_tuple = (ak.arange(3), ak.array(['A', 'B', 'C']))
s = ak.Series(ar_tuple=ar_tuple)
# Use positional arguments
ar_tuple = ak.arange(3), ak.arange(3)
s = ak.Series(ar_tuple)
self.assertIsInstance(s, ak.Series)

ar_tuple = ak.array(['A', 'B', 'C']), ak.arange(3)
s = ak.Series(ar_tuple)
self.assertIsInstance(s, ak.Series)

# Both data and index are supplied
v = ak.array(['A', 'B', 'C'])
i = ak.arange(3)
s = ak.Series(data=v, index=i)

self.assertIsInstance(s, ak.Series)
self.assertIsInstance(s.index, ak.Index)

# Just data is supplied
s = ak.Series(data=v)
self.assertIsInstance(s, ak.Series)
self.assertIsInstance(s.index, ak.Index)

# Just index is supplied (keyword argument)
with self.assertRaises(TypeError):
s = ak.Series(index=i)

# Just data is supplied (positional argument)
s = ak.Series(ak.array(['A', 'B', 'C']))
self.assertIsInstance(s, ak.Series)

# Just index is supplied (ar_tuple argument)
ar_tuple = (ak.arange(3),)
with self.assertRaises(TypeError):
s = ak.Series(ar_tuple)

# No arguments are supplied
with self.assertRaises(TypeError):
s = ak.Series()

with self.assertRaises(ValueError):
s = ak.Series(data=ak.arange(3), index=ak.arange(6))

def test_lookup(self):
ar_tuple = (ak.arange(3), ak.array(['A', 'B', 'C']))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.array(['A', 'B', 'C'])
i = ak.arange(3)
s = ak.Series(data=v, index=i)

l = s.locate(1)
self.assertIsInstance(l, ak.Series)
Expand All @@ -36,8 +67,9 @@ def test_lookup(self):
self.assertEqual(l.values[1], 'C')

def test_shape(self):
ar_tuple = (ak.arange(3), ak.array(['A', 'B', 'C']))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.array(['A', 'B', 'C'])
i = ak.arange(3)
s = ak.Series(data=v, index=i)

l, = s.shape
self.assertEqual(l, 3)
Expand All @@ -46,9 +78,11 @@ def test_add(self):
ar_tuple = (ak.arange(3), ak.arange(3))
ar_tuple_add = (ak.arange(3, 6, 1), ak.arange(3, 6, 1))
stress-tess marked this conversation as resolved.
Show resolved Hide resolved

s = ak.Series(ar_tuple=ar_tuple)
i = ak.arange(3)
v = ak.arange(3, 6, 1)
s = ak.Series(data=i, index=i)

s_add = ak.Series(ar_tuple=ar_tuple_add)
s_add = ak.Series(data=v, index=v)

added = s.add(s_add)

Expand All @@ -59,32 +93,36 @@ def test_add(self):
self.assertIn(i, val_list)

def test_topn(self):
ar_tuple = (ak.arange(3), ak.arange(3))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.arange(3)
i = ak.arange(3)
s = ak.Series(data=v, index=i)

top = s.topn(2)
self.assertListEqual(top.index.to_pandas().tolist(), [2, 1])
self.assertListEqual(top.values.to_ndarray().tolist(), [2, 1])

def test_sort_idx(self):
ar_tuple = (ak.array([3, 1, 4, 0, 2]), ak.arange(5))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.arange(5)
i = ak.array([3, 1, 4, 0, 2])
s = ak.Series(data=v, index=i)

sorted = s.sort_index()
self.assertListEqual(sorted.index.to_pandas().tolist(), [i for i in range(5)])
self.assertListEqual(sorted.values.to_ndarray().tolist(), [3, 1, 4, 0, 2])

def test_sort_value(self):
ar_tuple = (ak.arange(5), ak.array([3, 1, 4, 0, 2]))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.array([3, 1, 4, 0, 2])
i = ak.arange(5)
s = ak.Series(data=v, index=i)

sorted = s.sort_values()
self.assertListEqual(sorted.index.to_pandas().tolist(), [3, 1, 4, 0, 2])
self.assertListEqual(sorted.values.to_ndarray().tolist(), [i for i in range(5)])

def test_head_tail(self):
ar_tuple = (ak.arange(5), ak.arange(5))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.arange(5)
i = ak.arange(5)
s = ak.Series(data=v, index=i)

head = s.head(2)
self.assertListEqual(head.index.to_pandas().tolist(), [0, 1])
Expand All @@ -95,8 +133,9 @@ def test_head_tail(self):
self.assertListEqual(tail.values.to_ndarray().tolist(), [2, 3, 4])

def test_value_counts(self):
ar_tuple = (ak.arange(5), ak.array([0, 0, 1, 2, 2]))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.array([0, 0, 1, 2, 2])
i = ak.arange(5)
s = ak.Series(data=v, index=i)

c = s.value_counts()
self.assertListEqual(c.index.to_pandas().tolist(), [0, 2, 1])
Expand All @@ -107,11 +146,13 @@ def test_value_counts(self):
self.assertListEqual(c.values.to_ndarray().tolist(), [2, 2, 1])

def test_concat(self):
ar_tuple = (ak.arange(5), ak.arange(5))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.arange(5)
i = ak.arange(5)
s = ak.Series(data=v, index=i)

ar_tuple_2 = (ak.arange(5, 11, 1), ak.arange(5, 11, 1))
s2 = ak.Series(ar_tuple_2)
v = ak.arange(5, 11, 1)
i = ak.arange(5, 11, 1)
s2 = ak.Series(data=v, index=i)

c = ak.Series.concat([s, s2])
self.assertListEqual(c.index.to_pandas().tolist(), [i for i in range(11)])
Expand All @@ -125,19 +166,22 @@ def test_concat(self):
self.assertTrue(((ref_df == df.to_pandas()).all()).all())

def test_pdconcat(self):
ar_tuple = (ak.arange(5), ak.arange(5))
s = ak.Series(ar_tuple=ar_tuple)
v = ak.arange(5)
i = ak.arange(5)
s = ak.Series(data=v, index=i)

ar_tuple_2 = (ak.arange(5, 11, 1), ak.arange(5, 11, 1))
s2 = ak.Series(ar_tuple_2)
v = ak.arange(5, 11, 1)
i = ak.arange(5, 11, 1)
s2 = ak.Series(data=v, index=i)

c = ak.Series.pdconcat([s, s2])
self.assertIsInstance(c, pd.Series)
self.assertListEqual(c.index.tolist(), [i for i in range(11)])
self.assertListEqual(c.values.tolist(), [i for i in range(11)])

ar_tuple_2 = (ak.arange(5, 10, 1), ak.arange(5, 10, 1))
s2 = ak.Series(ar_tuple_2)
v = ak.arange(5, 10, 1)
i = ak.arange(5, 10, 1)
s2 = ak.Series(data=v, index=i)

df = ak.Series.pdconcat([s, s2], axis=1)
self.assertIsInstance(df, pd.DataFrame)
Expand Down