From 3b0b184f746350a106a72e66d664b0be17a7b694 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 12 Sep 2013 21:20:00 -0400 Subject: [PATCH] ENH: DataFrame constructor now accepts a numpy masked record array (GH3478) --- doc/source/release.rst | 1 + doc/source/v0.13.0.txt | 1 + pandas/core/frame.py | 76 +++++++++++++++++++++++++++++++------- pandas/tests/test_frame.py | 48 +++++++++++++++++++++++- 4 files changed, 111 insertions(+), 15 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 1d9fec688525a..ba7993bfed9bd 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -115,6 +115,7 @@ Improvements to existing features its ``DataFrame``'s ``to_excel()`` methods. (:issue:`4750`) - allow DataFrame constructor to accept more list-like objects, e.g. list of ``collections.Sequence`` and ``array.Array`` objects (:issue:`3783`,:issue:`42971`) + - DataFrame constructor now accepts a numpy masked record array (:issue:`3478`) API Changes ~~~~~~~~~~~ diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index f0a23b46373e9..d4c1eba1194ac 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -266,6 +266,7 @@ Enhancements ``ind``, passed to scipy.stats.gaussian_kde() (for scipy >= 0.11.0) to set the bandwidth, and to gkde.evaluate() to specify the indicies at which it is evaluated, respecttively. See scipy docs. + - DataFrame constructor now accepts a numpy masked record array (:issue:`3478`) .. _whatsnew_0130.refactoring: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fb08c5eaa4822..f56b6bc00cf15 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -394,14 +394,22 @@ def __init__(self, data=None, index=None, columns=None, dtype=None, elif isinstance(data, dict): mgr = self._init_dict(data, index, columns, dtype=dtype) elif isinstance(data, ma.MaskedArray): - mask = ma.getmaskarray(data) - if mask.any(): - data, fill_value = _maybe_upcast(data, copy=True) - data[mask] = fill_value + + # masked recarray + if isinstance(data, ma.mrecords.MaskedRecords): + mgr = _masked_rec_array_to_mgr(data, index, columns, dtype, copy) + + # a masked array else: - data = data.copy() - mgr = self._init_ndarray(data, index, columns, dtype=dtype, - copy=copy) + mask = ma.getmaskarray(data) + if mask.any(): + data, fill_value = _maybe_upcast(data, copy=True) + data[mask] = fill_value + else: + data = data.copy() + mgr = self._init_ndarray(data, index, columns, dtype=dtype, + copy=copy) + elif isinstance(data, (np.ndarray, Series)): if data.dtype.names: data_columns = list(data.dtype.names) @@ -1009,13 +1017,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, arr_columns.append(k) arrays.append(v) - # reorder according to the columns - if len(columns) and len(arr_columns): - indexer = _ensure_index( - arr_columns).get_indexer(columns) - arr_columns = _ensure_index( - [arr_columns[i] for i in indexer]) - arrays = [arrays[i] for i in indexer] + arrays, arr_columns = _reorder_arrays(arrays, arr_columns, columns) elif isinstance(data, (np.ndarray, DataFrame)): arrays, columns = _to_arrays(data, columns) @@ -4817,6 +4819,52 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None): dtype=dtype) +def _masked_rec_array_to_mgr(data, index, columns, dtype, copy): + """ extract from a masked rec array and create the manager """ + + # essentially process a record array then fill it + fill_value = data.fill_value + fdata = ma.getdata(data) + if index is None: + index = _get_names_from_index(fdata) + if index is None: + index = _default_index(len(data)) + index = _ensure_index(index) + + if columns is not None: + columns = _ensure_index(columns) + arrays, arr_columns = _to_arrays(fdata, columns) + + # fill if needed + new_arrays = [] + for fv, arr, col in zip(fill_value, arrays, arr_columns): + mask = ma.getmaskarray(data[col]) + if mask.any(): + arr, fv = _maybe_upcast(arr, fill_value=fv, copy=True) + arr[mask] = fv + new_arrays.append(arr) + + # create the manager + arrays, arr_columns = _reorder_arrays(new_arrays, arr_columns, columns) + if columns is None: + columns = arr_columns + + mgr = _arrays_to_mgr(arrays, arr_columns, index, columns) + + if copy: + mgr = mgr.copy() + return mgr + +def _reorder_arrays(arrays, arr_columns, columns): + # reorder according to the columns + if columns is not None and len(columns) and arr_columns is not None and len(arr_columns): + indexer = _ensure_index( + arr_columns).get_indexer(columns) + arr_columns = _ensure_index( + [arr_columns[i] for i in indexer]) + arrays = [arrays[i] for i in indexer] + return arrays, arr_columns + def _list_to_arrays(data, columns, coerce_float=False, dtype=None): if len(data) > 0 and isinstance(data[0], tuple): content = list(lib.to_object_array_tuples(data).T) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 507c2055e1b68..201212d27c4b0 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -9,7 +9,8 @@ import csv import unittest import nose - +import functools +import itertools from pandas.compat import( map, zip, range, long, lrange, lmap, lzip, OrderedDict, cPickle as pickle, u, StringIO @@ -21,6 +22,7 @@ import numpy as np import numpy.ma as ma from numpy.testing import assert_array_equal +import numpy.ma.mrecords as mrecords import pandas as pan import pandas.core.nanops as nanops @@ -2510,6 +2512,50 @@ def test_constructor_maskedarray_nonfloat(self): self.assertEqual(True, frame['A'][1]) self.assertEqual(False, frame['C'][2]) + def test_constructor_mrecarray(self): + """ + Ensure mrecarray produces frame identical to dict of masked arrays + from GH3479 + + """ + assert_fr_equal = functools.partial(assert_frame_equal, + check_index_type=True, + check_column_type=True, + check_frame_type=True) + arrays = [ + ('float', np.array([1.5, 2.0])), + ('int', np.array([1, 2])), + ('str', np.array(['abc', 'def'])), + ] + for name, arr in arrays[:]: + arrays.append(('masked1_' + name, + np.ma.masked_array(arr, mask=[False, True]))) + arrays.append(('masked_all', np.ma.masked_all((2,)))) + arrays.append(('masked_none', + np.ma.masked_array([1.0, 2.5], mask=False))) + + # call assert_frame_equal for all selections of 3 arrays + for comb in itertools.combinations(arrays, 3): + names, data = zip(*comb) + mrecs = mrecords.fromarrays(data, names=names) + + # fill the comb + comb = dict([ (k, v.filled()) if hasattr(v,'filled') else (k, v) for k, v in comb ]) + + expected = DataFrame(comb,columns=names) + result = DataFrame(mrecs) + assert_fr_equal(result,expected) + + # specify columns + expected = DataFrame(comb,columns=names[::-1]) + result = DataFrame(mrecs, columns=names[::-1]) + assert_fr_equal(result,expected) + + # specify index + expected = DataFrame(comb,columns=names,index=[1,2]) + result = DataFrame(mrecs, index=[1,2]) + assert_fr_equal(result,expected) + def test_constructor_corner(self): df = DataFrame(index=[]) self.assertEqual(df.values.shape, (0, 0))