Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH/API: Add count parameter to limit generator in Series, DataFrame, and DataFrame.from_records() #5898

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 18 additions & 21 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import collections
import warnings
import types
from itertools import islice, chain

from numpy import nan as NA
import numpy as np
Expand Down Expand Up @@ -159,6 +160,8 @@ class DataFrame(NDFrame):
Data type to force, otherwise infer
copy : boolean, default False
Copy data from inputs. Only affects DataFrame / 2d ndarray input
count : int or None, when data is a generator, number of values to
read. If None, reads the whole generator

Examples
--------
Expand All @@ -185,7 +188,7 @@ def _constructor(self):
_constructor_sliced = Series

def __init__(self, data=None, index=None, columns=None, dtype=None,
copy=False):
copy=False, count=None):
if data is None:
data = {}
if dtype is not None:
Expand Down Expand Up @@ -232,7 +235,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None,
copy=copy)
elif isinstance(data, (list, types.GeneratorType)):
if isinstance(data, types.GeneratorType):
data = list(data)
data = list(islice(data, count))
if len(data) > 0:
if index is None and isinstance(data[0], Series):
index = _get_names_from_index(data)
Expand Down Expand Up @@ -705,7 +708,7 @@ def to_gbq(self, destination_table, schema=None, col_order=None,

@classmethod
def from_records(cls, data, index=None, exclude=None, columns=None,
coerce_float=False, nrows=None):
coerce_float=False, count=None, nrows=None):
"""
Convert structured or record ndarray to DataFrame

Expand All @@ -726,44 +729,38 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
coerce_float : boolean, default False
Attempt to convert values to non-string, non-numeric objects (like
decimal.Decimal) to floating point, useful for SQL result sets
count : int or None, number of records to read from a generator.
If None, reads the whole generator

Returns
-------
df : DataFrame
"""
#Deprecate undocumented nrows
if nrows is not None:
warnings.warn("nrows is deprecated, use count",
FutureWarning)
count = nrows

# Make a copy of the input columns so we can modify it
if columns is not None:
columns = _ensure_index(columns)

if com.is_iterator(data):
if nrows == 0:
if count == 0:
return cls()

try:
if compat.PY3:
first_row = next(data)
else:
first_row = next(data)
first_row = next(data)
except StopIteration:
return cls(index=index, columns=columns)

dtype = None
if hasattr(first_row, 'dtype') and first_row.dtype.names:
dtype = first_row.dtype

values = [first_row]

# if unknown length iterable (generator)
if nrows is None:
# consume whole generator
values += list(data)
else:
i = 1
for row in data:
values.append(row)
i += 1
if i >= nrows:
break
# put the generator in a list
values = list(islice(chain([first_row], data), count))

if dtype is not None:
data = np.array(values, dtype=dtype)
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Data structure for 1-dimensional cross-sectional and time series data
"""
from __future__ import division
from itertools import islice

# pylint: disable=E1101,E1103
# pylint: disable=W0703,W0622,W0613,W0201
Expand Down Expand Up @@ -118,11 +119,13 @@ class Series(generic.NDFrame):
dtype : numpy.dtype or None
If None, dtype will be inferred
copy : boolean, default False, copy input data
count : int or None, number of values to read from a generator.
If None, reads the whole generator
"""
_metadata = ['name']

def __init__(self, data=None, index=None, dtype=None, name=None,
copy=False, fastpath=False):
copy=False, count=None, fastpath=False):

# we are called internally, so short-circuit
if fastpath:
Expand Down Expand Up @@ -192,7 +195,7 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
name = data.name
data = np.asarray(data)
elif isinstance(data, types.GeneratorType):
data = list(data)
data = list(islice(data, count))
elif isinstance(data, (set, frozenset)):
raise TypeError("{0!r} type is unordered"
"".format(data.__class__.__name__))
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2791,6 +2791,15 @@ def test_constructor_generator(self):
expected = DataFrame({ 0 : range(10), 1 : 'a' })
assert_frame_equal(result, expected, check_dtype=False)

def test_constructor_generator_count_limit(self):
generator_length = 10
expected_length = 5

#only works when data is a generator, not a collection
gen = ([ i, 'a'] for i in range(generator_length))
result = DataFrame(gen, count=expected_length)
self.assertEqual(len(result), expected_length)

def test_constructor_list_of_dicts(self):
data = [OrderedDict([['a', 1.5], ['b', 3], ['c', 4], ['d', 6]]),
OrderedDict([['a', 1.5], ['b', 3], ['d', 6]]),
Expand Down Expand Up @@ -3820,6 +3829,14 @@ def list_generator(length):
result = DataFrame.from_records(generator, columns=columns_names)
assert_frame_equal(result, expected)

def test_from_records_generator_count_limit(self):
def generator(length):
for i in range(length):
yield (i, i/2)
expected_length = 5
df = DataFrame.from_records(generator(10), count=expected_length)
self.assertEqual(len(df), expected_length)

def test_from_records_columns_not_modified(self):
tuples = [(1, 2, 3),
(1, 2, 3),
Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,14 @@ def test_constructor_generator(self):
exp.index = lrange(10, 20)
assert_series_equal(result, exp)

def test_constructor_generator_count_limit(self):
generator_length = 10
expected_length = 5
gen = (i for i in range(generator_length))

result = Series(gen, count=expected_length)
self.assertEqual(len(result), expected_length)

def test_constructor_categorical(self):
cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'])
res = Series(cat)
Expand Down