Skip to content

Commit

Permalink
ENH: Add support for compact_ints and use_unsigned in Python engine
Browse files Browse the repository at this point in the history
Title is self-explanatory.    xref #12686 - I don't quite understand
why these are marked (if at all) as internal to the C engine only, as
the benefits of having these options accepted for the Python engine
are quite clear based on the documentation I added as well.
The implementation simply calls the already-written function in
`pandas/parser.pyx` - as it isn't specific to the `TextReader` class,
crossing over to grab this function from Cython (instead of
duplicating it in pure Python) seems reasonable while maintaining that
separation between the C and Python engines.

Author: gfyoung <[email protected]>

Closes #13323 from gfyoung/python-engine-compact-ints and squashes the following commits:

95f7ba8 [gfyoung] ENH: Add support for compact_ints and use_unsigned in Python engine
  • Loading branch information
gfyoung authored and jreback committed Jun 2, 2016
1 parent ce56542 commit 0c6226c
Show file tree
Hide file tree
Showing 9 changed files with 246 additions and 104 deletions.
11 changes: 11 additions & 0 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,17 @@ low_memory : boolean, default ``True``
Note that the entire file is read into a single DataFrame regardless,
use the ``chunksize`` or ``iterator`` parameter to return the data in chunks.
(Only valid with C parser)
compact_ints : boolean, default False
DEPRECATED: this argument will be removed in a future version

If ``compact_ints`` is ``True``, then for any column that is of integer dtype, the
parser will attempt to cast it as the smallest integer ``dtype`` possible, either
signed or unsigned depending on the specification from the ``use_unsigned`` parameter.
use_unsigned : boolean, default False
DEPRECATED: this argument will be removed in a future version

If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether
the column should be compacted to the smallest signed or unsigned integer dtype.

NA and Missing Data Handling
++++++++++++++++++++++++++++
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,7 @@ Other API changes
Deprecations
^^^^^^^^^^^^

- ``compact_ints`` and ``use_unsigned`` have been deprecated in ``pd.read_csv`` and will be removed in a future version (:issue:`13320`)

.. _whatsnew_0182.performance:

Expand Down
35 changes: 33 additions & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,20 @@
Note that the entire file is read into a single DataFrame regardless,
use the `chunksize` or `iterator` parameter to return the data in chunks.
(Only valid with C parser)
compact_ints : boolean, default False
DEPRECATED: this argument will be removed in a future version
If compact_ints is True, then for any column that is of integer dtype,
the parser will attempt to cast it as the smallest integer dtype possible,
either signed or unsigned depending on the specification from the
`use_unsigned` parameter.
use_unsigned : boolean, default False
DEPRECATED: this argument will be removed in a future version
If integer columns are being compacted (i.e. `compact_ints=True`), specify
whether the column should be compacted to the smallest signed or unsigned
integer dtype.
Returns
-------
Expand Down Expand Up @@ -425,8 +439,6 @@ def _read(filepath_or_buffer, kwds):
_c_unsupported = set(['skip_footer'])
_python_unsupported = set([
'as_recarray',
'compact_ints',
'use_unsigned',
'low_memory',
'memory_map',
'buffer_lines',
Expand All @@ -435,6 +447,10 @@ def _read(filepath_or_buffer, kwds):
'dtype',
'float_precision',
])
_deprecated_args = set([
'compact_ints',
'use_unsigned',
])


def _make_parser_function(name, sep=','):
Expand Down Expand Up @@ -789,6 +805,12 @@ def _clean_options(self, options, engine):

_validate_header_arg(options['header'])

for arg in _deprecated_args:
if result[arg] != _c_parser_defaults[arg]:
warnings.warn("The '{arg}' argument has been deprecated "
"and will be removed in a future version"
.format(arg=arg), FutureWarning, stacklevel=2)

if index_col is True:
raise ValueError("The value of index_col couldn't be 'True'")
if _is_index_col(index_col):
Expand Down Expand Up @@ -1206,6 +1228,12 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False,

cvals, na_count = self._convert_types(
values, set(col_na_values) | col_na_fvalues, coerce_type)

if issubclass(cvals.dtype.type, np.integer) and self.compact_ints:
cvals = lib.downcast_int64(
cvals, _parser.na_values,
self.use_unsigned)

result[c] = cvals
if verbose and na_count:
print('Filled %d NA values in column %s' % (na_count, str(c)))
Expand Down Expand Up @@ -1648,8 +1676,11 @@ def __init__(self, f, **kwds):
self.verbose = kwds['verbose']
self.converters = kwds['converters']

self.compact_ints = kwds['compact_ints']
self.use_unsigned = kwds['use_unsigned']
self.thousands = kwds['thousands']
self.decimal = kwds['decimal']

self.comment = kwds['comment']
self._comment_lines = []

Expand Down
46 changes: 15 additions & 31 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,45 +172,29 @@ def error(val):
self.assertTrue(sum(precise_errors) <= sum(normal_errors))
self.assertTrue(max(precise_errors) <= max(normal_errors))

def test_compact_ints(self):
if compat.is_platform_windows() and not self.low_memory:
raise nose.SkipTest(
"segfaults on win-64, only when all tests are run")

data = ('0,1,0,0\n'
'1,1,0,0\n'
'0,1,0,1')

result = self.read_csv(StringIO(data), delimiter=',', header=None,
compact_ints=True, as_recarray=True)
ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
self.assertEqual(result.dtype, ex_dtype)

result = self.read_csv(StringIO(data), delimiter=',', header=None,
as_recarray=True, compact_ints=True,
use_unsigned=True)
ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
self.assertEqual(result.dtype, ex_dtype)

def test_compact_ints_as_recarray(self):
if compat.is_platform_windows() and self.low_memory:
if compat.is_platform_windows():
raise nose.SkipTest(
"segfaults on win-64, only when all tests are run")

data = ('0,1,0,0\n'
'1,1,0,0\n'
'0,1,0,1')

result = self.read_csv(StringIO(data), delimiter=',', header=None,
compact_ints=True, as_recarray=True)
ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
self.assertEqual(result.dtype, ex_dtype)

result = self.read_csv(StringIO(data), delimiter=',', header=None,
as_recarray=True, compact_ints=True,
use_unsigned=True)
ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
self.assertEqual(result.dtype, ex_dtype)
with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
result = self.read_csv(StringIO(data), delimiter=',', header=None,
compact_ints=True, as_recarray=True)
ex_dtype = np.dtype([(str(i), 'i1') for i in range(4)])
self.assertEqual(result.dtype, ex_dtype)

with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False):
result = self.read_csv(StringIO(data), delimiter=',', header=None,
as_recarray=True, compact_ints=True,
use_unsigned=True)
ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)])
self.assertEqual(result.dtype, ex_dtype)

def test_pass_dtype(self):
data = """\
Expand Down
43 changes: 43 additions & 0 deletions pandas/io/tests/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1330,3 +1330,46 @@ def test_raise_on_no_columns(self):
# test with more than a single newline
data = "\n\n\n"
self.assertRaises(EmptyDataError, self.read_csv, StringIO(data))

def test_compact_ints_use_unsigned(self):
    """Check the dtypes produced by ``compact_ints`` and ``use_unsigned``.

    Without ``compact_ints`` every column is read as int64.  With
    ``compact_ints=True`` each column is expected to shrink to the
    smallest signed dtype (the default, or ``use_unsigned=False``) or
    the smallest unsigned dtype (``use_unsigned=True``).  Every call
    that passes ``compact_ints`` must also raise a FutureWarning.
    """
    # see gh-13323
    data = 'a,b,c\n1,9,258'

    def build_expected(small, large):
        # 'a' and 'b' hold 1 and 9, 'c' holds 258, so 'c' is expected
        # to need the wider of the two dtypes.
        return DataFrame({
            'a': np.array([1], dtype=small),
            'b': np.array([9], dtype=small),
            'c': np.array([258], dtype=large),
        })

    # sanity check
    out = self.read_csv(StringIO(data))
    tm.assert_frame_equal(out, build_expected(np.int64, np.int64))

    signed = build_expected(np.int8, np.int16)
    unsigned = build_expected(np.uint8, np.uint16)

    cases = [
        # default behaviour for 'use_unsigned'
        ({}, signed),
        ({'use_unsigned': False}, signed),
        ({'use_unsigned': True}, unsigned),
    ]

    for extra_kwargs, expected in cases:
        with tm.assert_produces_warning(
                FutureWarning, check_stacklevel=False):
            out = self.read_csv(StringIO(data), compact_ints=True,
                                **extra_kwargs)
        tm.assert_frame_equal(out, expected)
21 changes: 21 additions & 0 deletions pandas/io/tests/parser/test_unsupported.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,27 @@ def test_python_engine(self):
with tm.assertRaisesRegexp(ValueError, msg):
read_csv(StringIO(data), engine=engine, **kwargs)


class TestDeprecatedFeatures(tm.TestCase):
    """Checks that deprecated ``read_csv`` arguments raise a warning."""

    def test_deprecated_args(self):
        """Each deprecated argument must warn on both parser engines.

        Every argument is passed on its own, with a non-default value,
        and the call is expected to produce a FutureWarning.
        """
        data = '1,2,3'

        # deprecated arguments with non-default values
        deprecated = {
            'compact_ints': True,
            'use_unsigned': True,
        }

        for engine in ('c', 'python'):
            for arg in deprecated:
                kwargs = {arg: deprecated[arg]}
                with tm.assert_produces_warning(
                        FutureWarning, check_stacklevel=False):
                    read_csv(StringIO(data), engine=engine, **kwargs)

if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
exit=False)
72 changes: 1 addition & 71 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1018,7 +1018,7 @@ cdef class TextReader:
col_res = _maybe_upcast(col_res)

if issubclass(col_res.dtype.type, np.integer) and self.compact_ints:
col_res = downcast_int64(col_res, self.use_unsigned)
col_res = lib.downcast_int64(col_res, na_values, self.use_unsigned)

if col_res is None:
raise CParserError('Unable to parse column %d' % i)
Expand Down Expand Up @@ -1866,76 +1866,6 @@ cdef raise_parser_error(object base, parser_t *parser):
raise CParserError(message)


# Cast an int64 array to the narrowest integer dtype that can hold all of
# its non-NA values.
#
# arr          : int64 ndarray to compact.
# use_unsigned : when true and no non-NA value is negative, only the
#                unsigned dtypes (uint8/uint16/uint32) are tried;
#                otherwise only the signed ones (int8/int16/int32).
#
# Returns a new array of the narrower dtype, in which entries equal to the
# int64 NA marker are replaced by the NA marker of the target dtype, or
# `arr` itself when none of the candidate dtypes fits.
def downcast_int64(ndarray[int64_t] arr, bint use_unsigned=0):
cdef:
Py_ssize_t i, n = len(arr)
# Running max/min start at opposite extremes so that the first non-NA
# value replaces both of them.
int64_t mx = INT64_MIN + 1, mn = INT64_MAX
# Marker value that denotes a missing entry in an int64 column.
int64_t NA = na_values[np.int64]
int64_t val
ndarray[uint8_t] mask
int na_count = 0

# `mask` is a uint8 view of the boolean `_mask`: the loop writes through the
# typed view, while np.putmask below is handed the boolean array.
_mask = np.empty(n, dtype=bool)
mask = _mask.view(np.uint8)

# Single pass: flag NA positions and track min/max of everything else.
for i in range(n):
val = arr[i]

if val == NA:
mask[i] = 1
na_count += 1
continue

# not NA
mask[i] = 0

if val > mx:
mx = val

if val < mn:
mn = val

# Unsigned candidates, narrowest first. The upper bound is MAX - 1 rather
# than MAX -- presumably because MAX is that dtype's NA marker; confirm
# against the na_values table.
if mn >= 0 and use_unsigned:
if mx <= UINT8_MAX - 1:
result = arr.astype(np.uint8)
if na_count:
np.putmask(result, _mask, na_values[np.uint8])
return result

if mx <= UINT16_MAX - 1:
result = arr.astype(np.uint16)
if na_count:
np.putmask(result, _mask, na_values[np.uint16])
return result

if mx <= UINT32_MAX - 1:
result = arr.astype(np.uint32)
if na_count:
np.putmask(result, _mask, na_values[np.uint32])
return result

# Signed candidates, narrowest first. The lower bound is MIN + 1 rather
# than MIN -- presumably because MIN is that dtype's NA marker; confirm
# against the na_values table.
else:
if mn >= INT8_MIN + 1 and mx <= INT8_MAX:
result = arr.astype(np.int8)
if na_count:
np.putmask(result, _mask, na_values[np.int8])
return result

if mn >= INT16_MIN + 1 and mx <= INT16_MAX:
result = arr.astype(np.int16)
if na_count:
np.putmask(result, _mask, na_values[np.int16])
return result

if mn >= INT32_MIN + 1 and mx <= INT32_MAX:
result = arr.astype(np.int32)
if na_count:
np.putmask(result, _mask, na_values[np.int32])
return result

# No narrower dtype fits: hand back the input array unchanged.
return arr


def _concatenate_chunks(list chunks):
cdef:
list names = list(chunks[0].keys())
Expand Down
Loading

0 comments on commit 0c6226c

Please sign in to comment.