diff --git a/.gitignore b/.gitignore
index 6fc0b4d6e9b5e..3782509c5c048 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ MANIFEST
 *.pyd
 pandas/src/tseries.c
 pandas/src/sparse.c
+pandas/version.py
 doc/source/generated
 *flymake*
 scikits
diff --git a/RELEASE.rst b/RELEASE.rst
index b293dda62e135..685e69f4094ec 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -1,10 +1,83 @@
-========================
-pandas 0.4 Release Notes
-========================
+=============
+Release Notes
+=============
 
-What is it
+This is the list of changes to pandas between each release. For full details,
+see the commit logs at http://github.com/wesm/pandas
+
+
+pandas 0.4.1
+============
+
+**Release date:** Not yet released
+
+This is primarily a bug fix release, but it also includes some new features
+and improvements.
+
+**New features / modules**
+
+  - Added new `DataFrame` methods `get_dtype_counts` and property `dtypes`
+  - Setting values via the ``.ix`` indexing attribute now works on mixed-type
+    DataFrame objects (fixes GH #135)
+  - `read_csv` can read multiple columns into a `MultiIndex`. DataFrame's
+    `to_csv` method will properly write out a `MultiIndex` which can be read
+    back (PR #151, thanks to Skipper Seabold)
+  - Wrote fast time series merging / joining methods in Cython. These will be
+    integrated later into DataFrame.join and related functions
+  - Added `ignore_index` option to `DataFrame.append` for combining unindexed
+    records stored in a DataFrame
+
+**Improvements to existing features**
+
+  - Some speed enhancements to the internal Index type-checking function
+  - `DataFrame.rename` has a new `copy` parameter; passing ``copy=False``
+    renames without copying the underlying data
+  - Enable unstacking by level name (PR #142)
+  - Enable sortlevel to work by level name (PR #141)
+  - `read_csv` can automatically "sniff" other kinds of delimiters using
+    `csv.Sniffer` (PR #146)
+  - Improved speed of unit test suite by about 40%
+  - An exception is no longer raised when calling `HDFStore.remove` on a
+    non-existent node with a where clause
+  - Optimized the `_ensure_index` function, resulting in performance savings
+    when type-checking Index objects
+
+**Bug fixes**
+
+  - Fixed DataFrame constructor bug causing downstream problems (e.g. .copy()
+    failing) when passing a Series as the values along with a column name and
+    index
+  - Fixed single-key groupby on DataFrame with as_index=False (GH #160)
+  - `Series.shift` was failing on integer Series (GH #154)
+  - `unstack` methods were producing incorrect output in the case of
+    duplicate hierarchical labels.
An exception will now be raised (GH #147) + - Calling `count` with level argument caused reduceat failure or segfault in + earlier NumPy (GH #169) + - Fixed `DataFrame.corrwith` to automatically exclude non-numeric data (GH + #144) + - Unicode handling bug fixes in `DataFrame.to_string` (GH #138) + - Excluding OLS degenerate unit test case that was causing platform specific + failure (GH #149) + - Skip blosc-dependent unit tests for PyTables < 2.2 (PR #137) + - Calling `copy` on `DateRange` did not copy over attributes to the new object + (GH #168) + - Fix bug in `HDFStore` in which Panel data could be appended to a Table with + different item order, thus resulting in an incorrect result read back + +Thanks +------ +- Yaroslav Halchenko +- Jeff Reback +- Skipper Seabold +- Dan Lovell +- Nick Pentreath + +pandas 0.4 ========== +What is it +---------- + **pandas** is a library of powerful labeled-axis data structures, statistical tools, and general code for working with relational data sets, including time series and cross-sectional data. It was designed with the practical needs of @@ -13,14 +86,14 @@ particularly well suited for, among other things, financial data analysis applications. Where to get it -=============== +--------------- Source code: http://github.com/wesm/pandas Binary installers on PyPI: http://pypi.python.org/pypi/pandas Documentation: http://pandas.sourceforge.net Release notes -============= +------------- **Release date:** 9/12/2011 @@ -279,12 +352,8 @@ Thanks - Skipper Seabold - Chris Jordan-Squire -======================== -pandas 0.3 Release Notes -======================== - -Release Notes -============= +pandas 0.3 +========== This major release of pandas represents approximately 1 year of continuous development work and brings with it many new features, bug fixes, speed @@ -293,7 +362,7 @@ change from the 0.2 release has been the completion of a rigorous unit test suite covering all of the core functionality. What is it -========== +---------- **pandas** is a library of labeled data structures, statistical models, and general code for working with time series and cross-sectional data. It was @@ -301,14 +370,14 @@ designed with the practical needs of statistical modeling and large, inhomogeneous data sets in mind. 
 Where to get it
-===============
+---------------
 
 Source code: http://github.com/wesm/pandas
 Binary installers on PyPI: http://pypi.python.org/pypi/pandas
 Documentation: http://pandas.sourceforge.net
 
 Release notes
-=============
+-------------
 
 **Release date:** February 20, 2011
diff --git a/bench/bench_join_panel.py b/bench/bench_join_panel.py
new file mode 100644
index 0000000000000..59a4711c4b6d2
--- /dev/null
+++ b/bench/bench_join_panel.py
@@ -0,0 +1,77 @@
+# reasonably efficient
+
+import numpy as np
+from pandas import Panel
+
+def create_panels_append(cls, panels):
+    """ return an appended list of panels """
+    panels = [a for a in panels if a is not None]
+    # corner cases
+    if len(panels) == 0:
+        return None
+    elif len(panels) == 1:
+        return panels[0]
+    elif len(panels) == 2 and panels[0] == panels[1]:
+        return panels[0]
+    # create a joint index for the axis
+    def joint_index_for_axis(panels, axis):
+        s = set()
+        for p in panels:
+            s.update(list(getattr(p, axis)))
+        return sorted(list(s))
+    def reindex_on_axis(panels, axis, axis_reindex):
+        new_axis = joint_index_for_axis(panels, axis)
+        new_panels = [p.reindex(**{axis_reindex: new_axis, 'copy': False})
+                      for p in panels]
+        return new_panels, new_axis
+    # create the joint major index; don't reindex the sub-panels - we are appending
+    major = joint_index_for_axis(panels, 'major_axis')
+    # reindex on minor axis
+    panels, minor = reindex_on_axis(panels, 'minor_axis', 'minor')
+    # reindex on items
+    panels, items = reindex_on_axis(panels, 'items', 'items')
+    # concatenate values
+    try:
+        values = np.concatenate([p.values for p in panels], axis=1)
+    except Exception, detail:
+        raise Exception("cannot append values that don't match dimensions! "
+                        "-> [%s] %s" % (','.join(["%s" % p for p in panels]),
                                         str(detail)))
+    return Panel(values, items=items, major_axis=major, minor_axis=minor)
+
+
+# does the job but is inefficient; it would be better to handle this the way
+# a table is read in PyTables, e.g. create a LongPanel and then convert to wide
+
+def create_panels_join(cls, panels):
+    """ given a list of panels, create a single panel """
+    panels = [a for a in panels if a is not None]
+    # corner cases
+    if len(panels) == 0:
+        return None
+    elif len(panels) == 1:
+        return panels[0]
+    elif len(panels) == 2 and panels[0] == panels[1]:
+        return panels[0]
+    d = dict()
+    minor, major, items = set(), set(), set()
+    for panel in panels:
+        items.update(panel.items)
+        major.update(panel.major_axis)
+        minor.update(panel.minor_axis)
+        values = panel.values
+        for item, item_index in panel.items.indexMap.items():
+            for minor_i, minor_index in panel.minor_axis.indexMap.items():
+                for major_i, major_index in panel.major_axis.indexMap.items():
+                    try:
+                        d[(minor_i, major_i, item)] = values[item_index, major_index, minor_index]
+                    except Exception:
+                        pass
+    # stack the values
+    minor = sorted(list(minor))
+    major = sorted(list(major))
+    items = sorted(list(items))
+    # create the 3d stack (items x columns x indices)
+    data = np.dstack([np.asarray([np.asarray([d.get((minor_i, major_i, item), np.nan)
+                                              for item in items])
+                                  for major_i in major]).transpose()
+                      for minor_i in minor])
+    # construct the panel
+    return Panel(data, items, major, minor)
+
+# `add_class_method` is assumed to be defined elsewhere; it attaches the
+# function to Panel as a classmethod
+add_class_method(Panel, create_panels_join, 'join_many')
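Editorial aside (not part of the patch): both helpers above reduce to the same pattern - form the sorted union of each axis, reindex every panel against that union, then combine the value arrays. A minimal self-contained sketch of the axis-union step, with hypothetical toy labels:

    def union_axis(*axes):
        # joint index for one axis: sorted union of all labels
        joint = set()
        for ax in axes:
            joint.update(ax)
        return sorted(joint)

    a_minor, b_minor = ['x', 'y'], ['y', 'z']
    print union_axis(a_minor, b_minor)  # ['x', 'y', 'z']; each panel is then
                                        # reindexed to this joint axis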
diff --git a/bench/bench_take_indexing.py b/bench/bench_take_indexing.py
new file mode 100644
index 0000000000000..fc8a3c6b743ea
--- /dev/null
+++ b/bench/bench_take_indexing.py
@@ -0,0 +1,52 @@
+import numpy as np
+
+from pandas import *
+import pandas._tseries as lib
+
+from pandas import DataFrame
+import timeit
+
+setup = """
+from pandas import Series
+import pandas._tseries as lib
+import random
+import numpy as np
+
+n = %d
+k = %d
+arr = np.random.randn(n, k)
+indexer = np.arange(n, dtype=np.int32)
+indexer = indexer[::-1]
+"""
+
+sizes = [100, 1000, 10000, 100000]
+iters = [1000, 1000, 100, 1]
+
+fancy_2d = []
+take_2d = []
+cython_2d = []
+
+def _timeit(stmt, size, k=5, iters=1000):
+    # time `stmt` against a (size x k) array, averaged over `iters` runs
+    timer = timeit.Timer(stmt=stmt, setup=setup % (size, k))
+    return timer.timeit(iters) / iters
+
+for sz, its in zip(sizes, iters):
+    print sz
+    fancy_2d.append(_timeit('arr[indexer]', sz, iters=its))
+    take_2d.append(_timeit('arr.take(indexer, axis=0)', sz, iters=its))
+    cython_2d.append(_timeit('lib.take_axis0(arr, indexer)', sz, iters=its))
+
+df = DataFrame({'fancy' : fancy_2d,
+                'take' : take_2d,
+                'cython' : cython_2d})
+
+print df
+
+from pandas.rpy.common import r
+r('mat <- matrix(rnorm(50000), nrow=10000, ncol=5)')
+r('set.seed(12345)')
+r('indexer <- sample(1:10000)')
+r('mat[indexer,]')
diff --git a/doc/data/mindex_ex.csv b/doc/data/mindex_ex.csv
new file mode 100644
index 0000000000000..935ff936cd842
--- /dev/null
+++ b/doc/data/mindex_ex.csv
@@ -0,0 +1,16 @@
+year,indiv,zit,xit
+1977,"A",1.2,.6
+1977,"B",1.5,.5
+1977,"C",1.7,.8
+1978,"A",.2,.06
+1978,"B",.7,.2
+1978,"C",.8,.3
+1978,"D",.9,.5
+1978,"E",1.4,.9
+1979,"C",.2,.15
+1979,"D",.14,.05
+1979,"E",.5,.15
+1979,"F",1.2,.5
+1979,"G",3.4,1.9
+1979,"H",5.4,2.7
+1979,"I",6.4,1.2
diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst
index eeff9f845baba..eca3e2ccde4c6 100644
--- a/doc/source/dsintro.rst
+++ b/doc/source/dsintro.rst
@@ -513,11 +513,6 @@ The API for insertion and deletion is the same as for DataFrame.
 Indexing / Selection
 ~~~~~~~~~~~~~~~~~~~~
 
-As of this writing, indexing with Panel is a bit more restrictive than in
-DataFrame. Notably, :ref:`advanced indexing <indexing.advanced>` via the **ix**
-property has not yet been integrated in Panel. This will be done, however, in a
-future release.
-
 .. csv-table::
    :header: "Operation", "Syntax", "Result"
    :widths: 30, 20, 10
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index c324f5bfcba69..905f004e955f1 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -291,7 +291,8 @@ than integer locations. Therefore, advanced indexing with ``.ix`` will always
 Setting values in mixed-type objects
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-Setting values on a mixed-type DataFrame or Panel is not yet supported:
+Setting values on a mixed-type DataFrame or Panel is supported when using scalar
+values, though setting arbitrary vectors is not yet supported:
 
 .. ipython:: python
 
@@ -299,11 +300,7 @@ Setting values on a mixed-type DataFrame or Panel is not yet supported:
    df2['foo'] = 'bar'
    df2.ix[3]
    df2.ix[3] = np.nan
-
-The reason it has not been implemented yet is simply due to difficulty of
-implementation relative to its utility. Handling the full spectrum of
-exceptional cases for setting values is trickier than getting values (which is
-relatively straightforward).
+   df2
 
.. _indexing.hierarchical:
 
@@ -523,6 +520,16 @@ However:
 
    >>> s.ix[('a', 'b'):('b', 'a')]
    Exception: MultiIndex lexsort depth 1, key was length 2
 
+Swapping levels with ``swaplevel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``swaplevel`` function can switch the order of two levels:
+
+.. 
ipython:: python + + df[:5] + df[:5].swaplevel(0, 1, axis=0) + The ``delevel`` DataFrame function ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/io.rst b/doc/source/io.rst index 2c630d1f43bc0..6e92a5ec166d9 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -96,6 +96,24 @@ fragile. Type inference is a pretty big deal. So if a column can be coerced to integer dtype without altering the contents, it will do so. Any non-numeric columns will come through as object dtype as with the rest of pandas objects. +Reading DataFrame objects with ``MultiIndex`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suppose you have data indexed by two columns: + +.. ipython:: python + + print open('data/mindex_ex.csv').read() + +The ``index_col`` argument to ``read_csv`` and ``read_table`` can take a list of +column numbers to turn multiple columns into a ``MultiIndex``: + +.. ipython:: python + + df = read_csv("data/mindex_ex.csv", index_col=[0,1]) + df + df.ix[1978] + Excel 2003 files ---------------- diff --git a/pandas/core/common.py b/pandas/core/common.py index 7206a25550c36..7928c3832dc2e 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -138,13 +138,6 @@ def _mut_exclusive(arg1, arg2): else: return arg2 -def _ensure_index(index_like): - from pandas.core.index import Index - if not isinstance(index_like, Index): - index_like = Index(index_like) - - return index_like - def _any_none(*args): for arg in args: if arg is None: @@ -203,7 +196,8 @@ def _pfixed(s, space, nanRep=None, float_format=None): return formatted.ljust(space) else: - return (' %s' % s)[:space].ljust(space) + stringified = _stringify(s) + return (' %s' % stringified)[:space].ljust(space) def _stringify(col): # unicode workaround @@ -217,7 +211,7 @@ def _format(s, nanRep=None, float_format=None): if nanRep is not None and isnull(s): if np.isnan(s): s = nanRep - return (' %s' % s) + return ' %s' % s if float_format: formatted = float_format(s) @@ -232,7 +226,7 @@ def _format(s, nanRep=None, float_format=None): return formatted else: - return ' %s' % s + return ' %s' % _stringify(s) #------------------------------------------------------------------------------- # miscellaneous python tools diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a8e3d1d9c4770..c6844b15ef459 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -19,15 +19,15 @@ from numpy import nan import numpy as np -from pandas.core.common import (isnull, notnull, PandasError, _ensure_index, +from pandas.core.common import (isnull, notnull, PandasError, _try_sort, _pfixed, _default_index, _infer_dtype, _stringify) from pandas.core.daterange import DateRange from pandas.core.generic import AxisProperty, NDFrame -from pandas.core.index import Index, MultiIndex, NULL_INDEX +from pandas.core.index import Index, MultiIndex, NULL_INDEX, _ensure_index from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels from pandas.core.internals import BlockManager, make_block, form_blocks -from pandas.core.series import Series, _is_bool_indexer +from pandas.core.series import Series, _is_bool_indexer, _maybe_upcast from pandas.util.decorators import deprecate import pandas.core.common as common import pandas.core.datetools as datetools @@ -64,13 +64,7 @@ def f(self, other, axis=default_axis, fill_value=None): if isinstance(other, DataFrame): # Another DataFrame return self._combine_frame(other, func, fill_value) elif isinstance(other, Series): - if axis is not None: - axis = self._get_axis_name(axis) - if axis == 'index': - 
return self._combine_match_index(other, func, fill_value) - else: - return self._combine_match_columns(other, func, fill_value) - return self._combine_series_infer(other, func, fill_value) + return self._combine_series(other, func, fill_value, axis) else: return self._combine_const(other, func) @@ -101,6 +95,7 @@ def f(self, other): class DataFrame(NDFrame): _auto_consolidate = True _verbose_info = True + _het_axis = 1 _AXIS_NUMBERS = { 'index' : 0, @@ -431,8 +426,9 @@ def from_csv(cls, path, header=0, delimiter=',', index_col=0): header : int, default 0 Row to use at header (skip prior rows) delimiter : string, default ',' - index_col : int, default 0 - Column to use for index + index_col : int or sequence, default 0 + Column to use for index. If a sequence is given, a MultiIndex + is used. Notes ----- @@ -482,8 +478,10 @@ def to_csv(self, path, nanRep='', cols=None, header=True, Write out column names index : boolean, default True Write row names (index) - index_label : string, default None - Column label for index column if desired + index_label : string or sequence, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. mode : Python write mode, default 'wb' """ f = open(path, mode) @@ -494,15 +492,25 @@ def to_csv(self, path, nanRep='', cols=None, header=True, series = self._series if header: joined_cols = ','.join([str(c) for c in cols]) - if index and index_label: - f.write('%s,%s' % (index_label, joined_cols)) + if index: + # should write something for index label + if index_label is None: + index_label = getattr(self.index, 'names', ['index']) + elif not isinstance(index_label, (list, tuple, np.ndarray)): + # given a string for a DF with Index + index_label = [index_label] + f.write('%s,%s' % (",".join(index_label), joined_cols)) else: f.write(joined_cols) f.write('\n') + nlevels = getattr(self.index, 'nlevels', 1) for idx in self.index: if index: - f.write(str(idx)) + if nlevels == 1: + f.write(str(idx)) + else: # handle MultiIndex + f.write(",".join([str(i) for i in idx])) for i, col in enumerate(cols): val = series[col].get(idx) if isnull(val): @@ -634,11 +642,15 @@ def info(self, verbose=True, buf=None): % (_stringify(cols[0]), _stringify(cols[-1]))) - counts = self._get_dtype_counts() + counts = self.get_dtype_counts() dtypes = ['%s(%d)' % k for k in sorted(counts.iteritems())] buf.write(u'dtypes: %s' % ', '.join(dtypes)) - def _get_dtype_counts(self): + @property + def dtypes(self): + return self.apply(lambda x: x.dtype) + + def get_dtype_counts(self): counts = {} for _, series in self.iteritems(): if series.dtype in counts: @@ -646,7 +658,7 @@ def _get_dtype_counts(self): else: counts[series.dtype] = 1 - return counts + return Series(counts) #---------------------------------------------------------------------- # properties for index and columns @@ -1345,7 +1357,7 @@ def fillna(self, value=None, method='pad'): #---------------------------------------------------------------------- # Rename - def rename(self, index=None, columns=None): + def rename(self, index=None, columns=None, copy=True): """ Alter index and / or columns using input function or functions. Function / dict values must be unique (1-to-1). 
Labels not @@ -1357,6 +1369,8 @@ def rename(self, index=None, columns=None): Transformation to apply to index values columns : dict-like or function, optional Transformation to apply to column values + copy : boolean, default True + Also copy underlying data See also -------- @@ -1389,7 +1403,7 @@ def columns_f(x): self._consolidate_inplace() - result = self.copy() + result = self.copy(deep=copy) if index is not None: result._rename_index_inplace(index_f) @@ -1404,7 +1418,7 @@ def _rename_index_inplace(self, mapper): self._series_cache.clear() def _rename_columns_inplace(self, mapper): - self._data = self._data.rename_items(mapper) + self._data = self._data.rename_items(mapper, copydata=False) self._series_cache.clear() #---------------------------------------------------------------------- @@ -1457,6 +1471,15 @@ def _indexed_same(self, other): same_columns = self.columns.equals(other.columns) return same_index and same_columns + def _combine_series(self, other, func, fill_value=None, axis=None): + if axis is not None: + axis = self._get_axis_name(axis) + if axis == 'index': + return self._combine_match_index(other, func, fill_value) + else: + return self._combine_match_columns(other, func, fill_value) + return self._combine_series_infer(other, func, fill_value) + def _combine_series_infer(self, other, func, fill_value=None): if len(other) == 0: return self * nan @@ -2007,11 +2030,18 @@ def f(x): #---------------------------------------------------------------------- # Merging / joining methods - def append(self, other): + def append(self, other, ignore_index=False): """ Append columns of other to end of this frame's columns and index. Columns not in this frame are added as new columns. + Parameters + ---------- + other : DataFrame + ignore_index : boolean, default False + If True do not use the index labels. 
Useful for gluing together + record arrays + Returns ------- appended : DataFrame @@ -2021,28 +2051,56 @@ def append(self, other): if not self: return other.copy() - new_index = np.concatenate((self.index, other.index)) - new_data = {} + if ignore_index: + new_index = None + else: + new_index = np.concatenate((self.index, other.index)) + + if self.columns.equals(other.columns): + return self._append_same_columns(other, new_index) + else: + return self._append_different_columns(other, new_index) - new_columns = self.columns + def _append_different_columns(self, other, new_index): + new_columns = self.columns + other.columns + new_data = self._append_column_by_column(other) + return self._constructor(data=new_data, index=new_index, + columns=new_columns) + + def _append_same_columns(self, other, new_index): + if self._is_mixed_type: + new_data = self._append_column_by_column(other) + else: + new_data= np.concatenate((self.values, other.values), axis=0) + return self._constructor(new_data, index=new_index, + columns=self.columns) - if not new_columns.equals(other.columns): - new_columns = self.columns + other.columns + def _append_column_by_column(self, other): + def _concat_missing(values, n): + values = _maybe_upcast(values) + missing_values = np.empty(n, dtype=values.dtype) + missing_values.fill(np.nan) + return values, missing_values - for column, series in self.iteritems(): - values = series.values - if column in other: - other_values = other[column].values - new_data[column] = np.concatenate((values, other_values)) + new_data = {} + for col in self: + values = self._get_raw_column(col) + if col in other: + other_values = other._get_raw_column(col) else: - new_data[column] = series + values, other_values = _concat_missing(values, len(other)) + new_data[col] = np.concatenate((values, other_values)) - for column, series in other.iteritems(): - if column not in self: - new_data[column] = series + for col in other: + values = other._get_raw_column(col) + if col not in self: + values, missing_values = _concat_missing(values, len(self)) + new_data[col] = np.concatenate((missing_values, values)) - return self._constructor(data=new_data, index=new_index, - columns=new_columns) + return new_data + + def _get_raw_column(self, col): + return self._data.get(col) def join(self, other, on=None, how=None, lsuffix='', rsuffix=''): """ @@ -2162,20 +2220,23 @@ def corrwith(self, other, axis=0, drop=False): ------- correls : Series """ - com_index = self._intersect_index(other) - com_cols = self._intersect_columns(other) + this = self._get_numeric_data() + other = other._get_numeric_data() + + com_index = this._intersect_index(other) + com_cols = this._intersect_columns(other) # feels hackish if axis == 0: result_index = com_index if not drop: - result_index = self.columns.union(other.columns) + result_index = this.columns.union(other.columns) else: result_index = com_cols if not drop: - result_index = self.index.union(other.index) + result_index = this.index.union(other.index) - left = self.reindex(index=com_index, columns=com_cols) + left = this.reindex(index=com_index, columns=com_cols) right = other.reindex(index=com_index, columns=com_cols) # mask missing values @@ -2262,6 +2323,11 @@ def _count_level(self, level, axis=0, numeric_only=False): level_index = axis_index.levels[level] + if len(self) == 0: + return DataFrame(np.zeros((len(level_index), + len(self.columns)), dtype=int), + index=level_index, columns=self.columns) + n = len(level_index) locs = 
axis_index.labels[level].searchsorted(np.arange(n)) @@ -2640,6 +2706,15 @@ def _get_numeric_columns(self): return cols + def _get_numeric_data(self): + if self._is_mixed_type: + return self.ix[:, self._get_numeric_columns()] + else: + if self.values.dtype != np.object_: + return self + else: + return self.ix[:, []] + def clip(self, upper=None, lower=None): """ Trim values at input threshold(s) @@ -3057,6 +3132,8 @@ def _prep_ndarray(values, copy=True): arr = np.array(values, dtype=object, copy=True) values = arr else: + # drop subclass info, do not copy data + values = np.asarray(values) if copy: values = values.copy() @@ -3136,7 +3213,6 @@ def _homogenize(data, index, columns, dtype=None): return homogenized - def _put_str(s, space): return ('%s' % s)[:space].ljust(space) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 74eb9449efa58..be61619b74424 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1,8 +1,7 @@ import numpy as np import cPickle -from pandas.core.common import _ensure_index -from pandas.core.index import Index, MultiIndex +from pandas.core.index import Index, MultiIndex, _ensure_index import pandas.core.datetools as datetools #------------------------------------------------------------------------------- @@ -398,9 +397,23 @@ def _values_aggregate(self, func, axis, fill_value, skipna=True): return result - def copy(self): - """Make a deep copy of this object""" - return self._constructor(self._data.copy()) + def copy(self, deep=True): + """ + Make a copy of this object + + Parameters + ---------- + deep : boolean, default True + Make a deep copy, i.e. also copy data + + Returns + ------- + copy : type of caller + """ + data = self._data + if deep: + data = data.copy() + return self._constructor(data) def swaplevel(self, i, j, axis=0): """ @@ -410,6 +423,7 @@ def swaplevel(self, i, j, axis=0): ------- swapped : type of caller (new object) """ + axis = self._get_axis_number(axis) result = self.copy() labels = result._data.axes[axis] result._data.set_axis(axis, labels.swaplevel(i, j)) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 898ca4694f850..77b8429422467 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -923,8 +923,12 @@ def _wrap_aggregated_output(self, output, mask): result = DataFrame(output, index=index, columns=output_keys) else: name_list = self._get_names() - result = DataFrame(output, index=name_list[0][1], - columns=output_keys) + name, labels = name_list[0] + if not self.as_index: + result = DataFrame(output, columns=output_keys) + result.insert(0, name, labels) + else: + result = DataFrame(output, index=labels, columns=output_keys) if self.axis == 1: result = result.T diff --git a/pandas/core/index.py b/pandas/core/index.py index 99b2ff809013d..9111528d97682 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -6,8 +6,7 @@ import numpy as np from pandas.core.common import (_format, adjoin as _adjoin, _stringify, - _ensure_index, _is_bool_indexer, - _asarray_tuplesafe) + _is_bool_indexer, _asarray_tuplesafe) from pandas.util.decorators import deprecate, cache_readonly import pandas.core.common as common import pandas._tseries as _tseries @@ -434,6 +433,14 @@ def drop(self, labels): raise ValueError('labels %s not contained in axis' % labels[-mask]) return self.delete(indexer) + def copy(self, order='C'): + """ + Overridden ndarray.copy to copy over attributes + """ + cp = self.view(np.ndarray).copy(order).view(type(self)) + cp.__dict__.update(self.__dict__) + return cp + 
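# [Editorial aside - not part of the patch.] The view/copy/view dance above is
# the standard way to copy an ndarray subclass without losing its instance
# attributes; a minimal standalone sketch (the class name is hypothetical):
#
#     import numpy as np
#
#     class Labeled(np.ndarray):
#         pass
#
#     def copy_with_attrs(arr, order='C'):
#         cp = arr.view(np.ndarray).copy(order).view(type(arr))
#         cp.__dict__.update(arr.__dict__)   # carry over e.g. `name`
#         return cp
#
#     lab = np.arange(3).view(Labeled)
#     lab.name = 'foo'
#     assert copy_with_attrs(lab).name == 'foo'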
#---------------------------------------------------------------------- # deprecated stuff @@ -532,6 +539,13 @@ def __iter__(self): for lev, lab in zip(self.levels, self.labels)] return izip(*values) + def _get_level_number(self, level): + if not isinstance(level, int): + level = self.names.index(level) + elif level < 0: + level += self.nlevels + return level + @property def values(self): result = np.empty(len(self), dtype=object) @@ -790,7 +804,8 @@ def sortlevel(self, level=0, ascending=True): Parameters ---------- - level : int, default 0 + level : int or str, default 0 + If a string is given, must be a name of the level ascending : boolean, default True False to sort in descending order @@ -799,6 +814,7 @@ def sortlevel(self, level=0, ascending=True): sorted_index : MultiIndex """ labels = list(self.labels) + level = self._get_level_number(level) primary = labels.pop(level) # Lexsort starts from END @@ -1218,3 +1234,7 @@ def _sparsify(label_list): return zip(*result) +def _ensure_index(index_like): + if isinstance(index_like, Index): + return index_like + return Index(index_like) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 48f8d90af73d3..93898e0cc7960 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -36,11 +36,6 @@ def _getitem_xs(self, idx, axis=0): return self.obj.xs(idx, axis=axis, copy=True) def __setitem__(self, key, value): - # also has the side effect of consolidating in-place - if self.obj._is_mixed_type: - raise IndexingError('setting on mixed-type frames not ' - 'yet supported') - if isinstance(key, tuple): if len(key) > self.ndim: raise IndexingError('only tuples of length <= %d supported', @@ -54,7 +49,31 @@ def __setitem__(self, key, value): else: indexer = self._convert_to_indexer(key) - self.obj.values[indexer] = value + self._setitem_with_indexer(indexer, value) + + def _setitem_with_indexer(self, indexer, value): + # also has the side effect of consolidating in-place + if self.obj._is_mixed_type: + if not isinstance(indexer, tuple): + indexer = self._tuplify(indexer) + + het_axis = self.obj._het_axis + het_idx = indexer[het_axis] + + if isinstance(het_idx, (int, long)): + het_idx = [het_idx] + + if not np.isscalar(value): + raise IndexingError('setting on mixed-type frames only ' + 'allowed with scalar values') + + plane_indexer = indexer[:het_axis] + indexer[het_axis+1:] + item_labels = self.obj._get_axis(het_axis) + for item in item_labels[het_idx]: + data = self.obj[item] + data.values[plane_indexer] = value + else: + self.obj.values[indexer] = value def _getitem_tuple(self, tup): # a bit kludgy @@ -205,6 +224,11 @@ def _convert_to_indexer(self, obj, axis=0): return obj return index.get_loc(obj) + def _tuplify(self, loc): + tup = [slice(None, None) for _ in range(self.ndim)] + tup[0] = loc + return tuple(tup) + def _get_slice_axis(self, slice_obj, axis=0): obj = self.obj diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 158409400f336..9fa4e6a842e1a 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3,8 +3,7 @@ from numpy import nan import numpy as np -from pandas.core.index import Index -from pandas.core.common import _ensure_index +from pandas.core.index import Index, _ensure_index import pandas.core.common as common import pandas._tseries as _tseries @@ -85,8 +84,11 @@ def shape(self): def dtype(self): return self.values.dtype - def copy(self): - return make_block(self.values.copy(), self.items, self.ref_items) + def copy(self, deep=True): + values = self.values + if deep: + 
values = values.copy() + return make_block(values, self.items, self.ref_items) def merge(self, other): assert(self.ref_items.equals(other.ref_items)) @@ -683,13 +685,13 @@ def rename_axis(self, mapper, axis=1): new_axes[axis] = new_axis return BlockManager(self.blocks, new_axes) - def rename_items(self, mapper): + def rename_items(self, mapper, copydata=True): new_items = Index([mapper(x) for x in self.items]) new_items._verify_integrity() new_blocks = [] for block in self.blocks: - newb = block.copy() + newb = block.copy(deep=copydata) newb.set_ref_items(new_items, maybe_rename=True) new_blocks.append(newb) new_axes = list(self.axes) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index c87dbf6f9422c..b4f2589270a4a 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -10,9 +10,9 @@ import numpy as np -from pandas.core.common import (PandasError, _mut_exclusive, _ensure_index, +from pandas.core.common import (PandasError, _mut_exclusive, _try_sort, _default_index, _infer_dtype) -from pandas.core.index import Factor, Index, MultiIndex +from pandas.core.index import Factor, Index, MultiIndex, _ensure_index from pandas.core.indexing import _NDFrameIndexer from pandas.core.internals import BlockManager, make_block, form_blocks from pandas.core.frame import DataFrame, _union_indexes @@ -22,6 +22,66 @@ import pandas.core.common as common import pandas._tseries as _tseries + +def _ensure_like_indices(time, panels): + """ + Makes sure that time and panels are conformable + """ + n_time = len(time) + n_panel = len(panels) + u_panels = np.unique(panels) # this sorts! + u_time = np.unique(time) + if len(u_time) == n_time: + time = np.tile(u_time, len(u_panels)) + if len(u_panels) == n_panel: + panels = np.repeat(u_panels, len(u_time)) + return time, panels + +def panel_index(time, panels, names=['time', 'panel']): + """ + Returns a multi-index suitable for a panel-like DataFrame + + Parameters + ---------- + time : array-like + Time index, does not have to repeat + panels : array-like + Panel index, does not have to repeat + names : list, optional + List containing the names of the indices + + Returns + ------- + multi_index : MultiIndex + Time index is the first level, the panels are the second level. 
+ + Examples + -------- + >>> years = range(1960,1963) + >>> panels = ['A', 'B', 'C'] + >>> panel_idx = panel_index(years, panels) + >>> panel_idx + MultiIndex([(1960, 'A'), (1961, 'A'), (1962, 'A'), (1960, 'B'), (1961, 'B'), + (1962, 'B'), (1960, 'C'), (1961, 'C'), (1962, 'C')], dtype=object) + + or + + >>> import numpy as np + >>> years = np.repeat(range(1960,1963), 3) + >>> panels = np.tile(['A', 'B', 'C'], 3) + >>> panel_idx = panel_index(years, panels) + >>> panel_idx + MultiIndex([(1960, 'A'), (1960, 'B'), (1960, 'C'), (1961, 'A'), (1961, 'B'), + (1961, 'C'), (1962, 'A'), (1962, 'B'), (1962, 'C')], dtype=object) + """ + time, panels = _ensure_like_indices(time, panels) + time_factor = Factor(time) + panel_factor = Factor(panels) + + labels = [time_factor.labels, panel_factor.labels] + levels = [time_factor.levels, panel_factor.levels] + return MultiIndex(levels, labels, sortorder=None, names=names) + class PanelError(Exception): pass @@ -106,6 +166,7 @@ class Panel(NDFrame): # major _default_stat_axis = 1 + _het_axis = 0 items = AxisProperty(0) major_axis = AxisProperty(1) @@ -1041,13 +1102,15 @@ class LongPanel(DataFrame): def consistent(self): offset = max(len(self.major_axis), len(self.minor_axis)) + major_labels = self.major_labels + minor_labels = self.minor_labels + # overflow risk - if (offset + 1) ** 2 > 2**32: - keys = (self.major_labels.astype(np.int64) * offset + - self.minor_labels.astype(np.int64)) - else: - keys = self.major_labels * offset + self.minor_labels + if (offset + 1) ** 2 > 2**32: # pragma: no cover + major_labels = major_labels.astype(np.int64) + minor_labels = minor_labels.astype(np.int64) + keys = major_labels * offset + minor_labels unique_keys = np.unique(keys) if len(unique_keys) < len(keys): @@ -1127,6 +1190,8 @@ def _combine(self, other, func, axis='items'): return self._combine_frame(other, func) elif isinstance(other, DataFrame): return self._combine_panel_frame(other, func, axis=axis) + elif isinstance(other, Series): + return self._combine_series(other, func, axis=axis) elif np.isscalar(other): return LongPanel(func(self.values, other), columns=self.items, index=self.index) @@ -1152,9 +1217,9 @@ def _combine_panel_frame(self, other, func, axis='items'): return result.to_long() add = _panel_arith_method(operator.add, 'add') - subtract = _panel_arith_method(operator.sub, 'subtract') - divide = _panel_arith_method(operator.div, 'divide') - multiply = _panel_arith_method(operator.mul, 'multiply') + subtract = sub = _panel_arith_method(operator.sub, 'subtract') + divide = div = _panel_arith_method(operator.div, 'divide') + multiply = mul = _panel_arith_method(operator.mul, 'multiply') def to_wide(self): """ diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 28944b4db6df4..8c545307a89a1 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -11,14 +11,18 @@ from pandas.core.common import notnull from pandas.core.index import MultiIndex +class ReshapeError(Exception): + pass + + class _Unstacker(object): """ Helper class to unstack data / pivot with multi-level index Parameters ---------- - level : int, default last level - Level to "unstack" + level : int or str, default last level + Level to "unstack". Accepts a name for the level. 
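    (Editorial aside, not part of the patch: unstacking by level name relies on
    ``MultiIndex._get_level_number`` translating the name to a position. A
    minimal sketch, using the 0.4-era ``MultiIndex`` constructor that appears
    elsewhere in this diff:

        import numpy as np
        from pandas import Series
        from pandas.core.index import MultiIndex

        index = MultiIndex(levels=[['one', 'two'], ['a', 'b']],
                           labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                           names=['first', 'second'])
        s = Series(np.arange(4.), index=index)
        s.unstack('second')   # equivalent to s.unstack(1)
    )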
Examples -------- @@ -52,10 +56,7 @@ def __init__(self, values, index, level=-1, value_columns=None): raise ValueError('must pass column labels for multi-column data') self.index = index - - if level < 0: - level += index.nlevels - self.level = level + self.level = self.index._get_level_number(level) self.new_index_levels = list(index.levels) self.removed_level = self.new_index_levels.pop(level) @@ -82,7 +83,8 @@ def _make_selectors(self): # make the mask group_index = self.sorted_labels[0] - prev_stride = np.prod([len(x) for x in new_levels[1:]]) + prev_stride = np.prod([len(x) for x in new_levels[1:]], + dtype=int) for lev, lab in zip(new_levels[1:], self.sorted_labels[1:-1]): group_index = group_index * prev_stride + lab @@ -100,6 +102,10 @@ def _make_selectors(self): unique_groups = np.arange(self.full_shape[0])[group_mask] compressor = group_index.searchsorted(unique_groups) + if mask.sum() < len(self.index): + raise ReshapeError('Index contains duplicate entries, ' + 'cannot reshape') + self.group_mask = group_mask self.group_index = group_index self.mask = mask diff --git a/pandas/core/series.py b/pandas/core/series.py index d074ede1e0e66..2ceb33fd4c4bb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -13,11 +13,11 @@ from numpy import nan, ndarray import numpy as np -from pandas.core.common import (isnull, notnull, _ensure_index, - _is_bool_indexer, _default_index) +from pandas.core.common import (isnull, notnull, _is_bool_indexer, + _default_index) from pandas.core.daterange import DateRange from pandas.core.generic import PandasObject -from pandas.core.index import Index, MultiIndex +from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import _SeriesIndexer, _maybe_droplevels from pandas.util.decorators import deprecate import pandas.core.datetools as datetools @@ -99,8 +99,8 @@ def __new__(cls, data, index=None, dtype=None, name=None, copy=False): except ValueError: if dtype: raise - - subarr = np.array(data, dtype=object) + else: # pragma: no cover + subarr = np.array(data, dtype=object) if subarr.ndim == 0: if isinstance(data, list): # pragma: no cover @@ -493,6 +493,9 @@ def _count_level(self, level): level_index = obj.index.levels[level] + if len(self) == 0: + return Series(0, index=level_index) + n = len(level_index) locs = obj.index.labels[level].searchsorted(np.arange(n)) @@ -1242,9 +1245,7 @@ def map(self, arg): new_values = arg.view(np.ndarray).take(indexer) if notmask.any(): - if issubclass(new_values.dtype.type, np.integer): - new_values = new_values.astype(float) - + new_values = _maybe_upcast(new_values) np.putmask(new_values, notmask, np.nan) newSer = Series(new_values, index=self.index) @@ -1307,11 +1308,7 @@ def reindex(self, index=None, method=None, copy=True): notmask = -mask if notmask.any(): - if issubclass(new_values.dtype.type, np.int_): - new_values = new_values.astype(float) - elif issubclass(new_values.dtype.type, np.bool_): - new_values = new_values.astype(object) - + new_values = _maybe_upcast(new_values) np.putmask(new_values, notmask, nan) return Series(new_values, index=new_index) @@ -1581,6 +1578,7 @@ def shift(self, periods, offset=None, **kwds): if offset is None: new_values = np.empty(len(self), dtype=self.dtype) + new_values = _maybe_upcast(new_values) if periods > 0: new_values[periods:] = self.values[:-periods] @@ -1776,6 +1774,14 @@ def remove_na(arr): """ return arr[notnull(arr)] +def _maybe_upcast(values): + if issubclass(values.dtype.type, np.int_): + values = values.astype(float) + 
elif issubclass(values.dtype.type, np.bool_): + values = values.astype(object) + + return values + def _seriesRepr(index, vals, nanRep='NaN'): string_index = index.format() maxlen = max(len(x) for x in string_index) diff --git a/pandas/core/sparse.py b/pandas/core/sparse.py index b8e389c3aeffc..f9bc1ed81bd0d 100644 --- a/pandas/core/sparse.py +++ b/pandas/core/sparse.py @@ -11,8 +11,8 @@ import operator from pandas.core.common import (isnull, _pickle_array, _unpickle_array, - _mut_exclusive, _ensure_index, _try_sort) -from pandas.core.index import Index, MultiIndex, NULL_INDEX + _mut_exclusive, _try_sort) +from pandas.core.index import Index, MultiIndex, NULL_INDEX, _ensure_index from pandas.core.series import Series, TimeSeries from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray, _default_index) @@ -410,12 +410,15 @@ def astype(self, dtype=None): return self.copy() - def copy(self): + def copy(self, deep=True): """ Make a copy of the SparseSeries. Only the actual sparse values need to be copied """ - values = self.sp_values.copy() + if deep: + values = self.sp_values.copy() + else: + values = self.sp_values return SparseSeries(values, index=self.index, sparse_index=self.sp_index, fill_value=self.fill_value) @@ -624,7 +627,7 @@ class SparseDataFrame(DataFrame): _verbose_info = False _columns = None _series = None - + _is_mixed_type = False ndim = 2 def __init__(self, data=None, index=None, columns=None, @@ -775,12 +778,12 @@ def to_dense(self): data = dict((k, v.to_dense()) for k, v in self.iteritems()) return DataFrame(data, index=self.index) - def copy(self): + def copy(self, deep=True): """ - Make a deep copy of this SparseDataFrame + Make a copy of this SparseDataFrame """ - return SparseDataFrame(self._series, index=self.index, - columns=self.columns, + series = self._series.copy() + return SparseDataFrame(series, index=self.index, columns=self.columns, default_fill_value=self.default_fill_value, default_kind=self.default_kind) @@ -1056,6 +1059,9 @@ def _rename_columns_inplace(self, mapper): self.columns = new_columns self._series = new_series + def _get_raw_column(self, col): + return self._series[col].values + def add_prefix(self, prefix): f = (('%s' % prefix) + '%s').__mod__ return self.rename(columns=f) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 06a0aae6328d9..c4e1c962cebac 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -9,10 +9,10 @@ import numpy as np -from pandas.core.index import Index +from pandas.core.index import Index, MultiIndex from pandas.core.frame import DataFrame -def read_csv(filepath_or_buffer, header=0, skiprows=None, index_col=0, +def read_csv(filepath_or_buffer, sep=None, header=0, skiprows=None, index_col=0, na_values=None, date_parser=None, names=None): """ Read CSV file into DataFrame @@ -20,13 +20,16 @@ def read_csv(filepath_or_buffer, header=0, skiprows=None, index_col=0, Parameters ---------- filepath_or_buffer : string or file handle / StringIO + sep : string, default None + Delimiter to use. By default will try to automatically determine + this header : int, default 0 Row to use for the column labels of the parsed DataFrame skiprows : list-like Row numbers to skip (0-indexed) - index_col : int, default 0 + index_col : int or sequence., default 0 Column to use as the row labels of the DataFrame. Pass None if there is - no such column + no such column. If a sequence is given, a MultiIndex is used. 
na_values : list-like, default None List of additional strings to recognize as NA/NaN date_parser : function @@ -50,7 +53,20 @@ def read_csv(filepath_or_buffer, header=0, skiprows=None, index_col=0, except Exception: # pragma: no cover f = open(filepath_or_buffer, 'r') - reader = csv.reader(f, dialect='excel') + sniff_sep = True + # default dialect + dia = csv.excel + if sep is not None: + sniff_sep = False + dia.delimiter = sep + # attempt to sniff the delimiter + if sniff_sep: + sample = f.readline() + sniffed = csv.Sniffer().sniff(sample) + dia.delimiter = sniffed.delimiter + f.seek(0) + + reader = csv.reader(f, dialect=dia) if skiprows is not None: skiprows = set(skiprows) @@ -63,8 +79,7 @@ def read_csv(filepath_or_buffer, header=0, skiprows=None, index_col=0, date_parser=date_parser) def read_table(filepath_or_buffer, sep='\t', header=0, skiprows=None, - index_col=0, na_values=None, names=None, - date_parser=None): + index_col=0, na_values=None, date_parser=None, names=None): """ Read delimited file into DataFrame @@ -77,9 +92,9 @@ def read_table(filepath_or_buffer, sep='\t', header=0, skiprows=None, Row to use for the column labels of the parsed DataFrame skiprows : list-like Row numbers to skip (0-indexed) - index_col : int, default 0 + index_col : int or sequence, default 0 Column to use as the row labels of the DataFrame. Pass None if there is - no such column + no such column. If a sequence is given, a MultiIndex is used. na_values : list-like, default None List of additional strings to recognize as NA/NaN date_parser : function @@ -92,25 +107,8 @@ def read_table(filepath_or_buffer, sep='\t', header=0, skiprows=None, ------- parsed : DataFrame """ - if hasattr(filepath_or_buffer, 'read'): - reader = filepath_or_buffer - else: - try: - # universal newline mode - reader = open(filepath_or_buffer, 'U') - except Exception: # pragma: no cover - reader = open(filepath_or_buffer, 'r') - - if skiprows is not None: - skiprows = set(skiprows) - lines = [l for i, l in enumerate(reader) if i not in skiprows] - else: - lines = [l for l in reader] - - lines = [re.split(sep, l.rstrip()) for l in lines] - return _simple_parser(lines, header=header, indexCol=index_col, - colNames=names, na_values=na_values, - date_parser=date_parser) + return read_csv(filepath_or_buffer, sep, header, skiprows, + index_col, na_values, date_parser, names) def _simple_parser(lines, colNames=None, header=0, indexCol=0, na_values=None, date_parser=None, parse_dates=True): @@ -151,19 +149,35 @@ def _simple_parser(lines, colNames=None, header=0, indexCol=0, # no index column specified, so infer that's what is wanted if indexCol is not None: - if indexCol == 0 and len(content[0]) == len(columns) + 1: - index = zipped_content[0] - zipped_content = zipped_content[1:] + if np.isscalar(indexCol): + if indexCol == 0 and len(content[0]) == len(columns) + 1: + index = zipped_content[0] + zipped_content = zipped_content[1:] + else: + index = zipped_content.pop(indexCol) + columns.pop(indexCol) + else: # given a list of index + idx_names = [] + index = [] + for idx in indexCol: + idx_names.append(columns[idx]) + index.append(zipped_content[idx]) + #remove index items from content and columns, don't pop in loop + for i in range(len(indexCol)): + columns.remove(idx_names[i]) + zipped_content.remove(index[i]) + + + if np.isscalar(indexCol): + if parse_dates: + index = _try_parse_dates(index, parser=date_parser) + index = Index(_maybe_convert_int(np.array(index, dtype=object))) else: - index = zipped_content.pop(indexCol) - 
columns.pop(indexCol) - - if parse_dates: - index = _try_parse_dates(index, parser=date_parser) - - index = _maybe_convert_int(np.array(index, dtype=object)) + index = MultiIndex.from_arrays(_maybe_convert_int_mindex(index, + parse_dates, date_parser), + names=idx_names) else: - index = np.arange(len(content)) + index = Index(np.arange(len(content))) if len(columns) != len(zipped_content): raise Exception('wrong number of columns') @@ -171,7 +185,7 @@ def _simple_parser(lines, colNames=None, header=0, indexCol=0, data = dict(izip(columns, zipped_content)) data = _floatify(data, na_values=na_values) data = _convert_to_ndarrays(data) - return DataFrame(data=data, columns=columns, index=Index(index)) + return DataFrame(data=data, columns=columns, index=index) def _floatify(data_dict, na_values=None): """ @@ -220,6 +234,20 @@ def _maybe_convert_int(arr): return arr +def _maybe_convert_int_mindex(index, parse_dates, date_parser): + if len(index) == 0: + return index + + for i in range(len(index)): + try: + int(index[i][0]) + index[i] = map(int, index[i]) + except ValueError: + if parse_dates: + index[i] = _try_parse_dates(index[i], date_parser) + + return index + def _convert_to_ndarrays(dct): result = {} for c, values in dct.iteritems(): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 38e6c30a3dbd3..0fa2aeb80d1ce 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -281,8 +281,9 @@ def remove(self, key, where=None): if where is None: self.handle.removeNode(self.handle.root, key, recursive=True) else: - group = getattr(self.handle.root, key) - self._delete_from_table(group, where) + group = getattr(self.handle.root, key,None) + if group is not None: + self._delete_from_table(group, where) def append(self, key, value): """ @@ -566,6 +567,13 @@ def _write_table(self, group, items=None, index=None, columns=None, # add kinds table._v_attrs.index_kind = index_kind table._v_attrs.columns_kind = cols_kind + if append: + existing_fields = getattr(table._v_attrs,'fields',None) + if (existing_fields is not None and + existing_fields != list(items)): + raise Exception("appended items do not match existing items" + " in table!") + # this depends on creation order of the table table._v_attrs.fields = list(items) # add the rows diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index fb809956c3210..c40cc3e15beee 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -15,6 +15,11 @@ except ImportError: raise nose.SkipTest('no pytables') +from distutils.version import LooseVersion + +_default_compressor = LooseVersion(tables.__version__) >= '2.2' \ + and 'blosc' or 'zlib' + class TesttHDFStore(unittest.TestCase): path = '__test__.h5' scratchpath = '__scratch__.h5' @@ -82,16 +87,25 @@ def test_put(self): def test_put_compression(self): df = tm.makeTimeDataFrame() - self.store.put('c', df, table=True, compression='blosc') - tm.assert_frame_equal(self.store['c'], df) self.store.put('c', df, table=True, compression='zlib') tm.assert_frame_equal(self.store['c'], df) + # can't compress if table=False + self.assertRaises(ValueError, self.store.put, 'b', df, + table=False, compression='zlib') + + def test_put_compression_blosc(self): + tm.skip_if_no_package('tables', '2.2', app='blosc support') + df = tm.makeTimeDataFrame() + # can't compress if table=False self.assertRaises(ValueError, self.store.put, 'b', df, table=False, compression='blosc') + self.store.put('c', df, table=True, compression='blosc') + 
tm.assert_frame_equal(self.store['c'], df)
+
     def test_put_integer(self):
         # non-date, non-string index
         df = DataFrame(np.random.randn(50, 100))
@@ -103,6 +117,15 @@ def test_append(self):
         self.store.append('c', df[10:])
         tm.assert_frame_equal(self.store['c'], df)
 
+    def test_append_diff_item_order(self):
+        wp = tm.makePanel()
+        wp1 = wp.ix[:, :10, :]
+        wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :]
+
+        self.store.put('panel', wp1, table=True)
+        self.assertRaises(Exception, self.store.put, 'panel', wp2,
+                          append=True)
+
     def test_remove(self):
         ts = tm.makeTimeSeries()
         df = tm.makeDataFrame()
@@ -115,6 +138,14 @@ def test_remove(self):
         self.store.remove('b')
         self.assertEquals(len(self.store), 0)
 
+    def test_remove_where_not_exist(self):
+        crit1 = {
+            'field' : 'index',
+            'op' : '>',
+            'value' : 'foo'
+        }
+        self.store.remove('a', where=[crit1])
+
     def test_remove_crit(self):
         wp = tm.makePanel()
         self.store.put('wp', wp, table=True)
@@ -346,7 +377,7 @@ def test_select_filter_corner(self):
     def _check_roundtrip(self, obj, comparator, compression=False):
         options = {}
         if compression:
-            options['complib'] = 'blosc'
+            options['complib'] = _default_compressor
 
         store = HDFStore(self.scratchpath, 'w', **options)
         try:
@@ -360,7 +391,7 @@ def _check_roundtrip(self, obj, comparator, compression=False):
     def _check_roundtrip_table(self, obj, comparator, compression=False):
         options = {}
         if compression:
-            options['complib'] = 'blosc'
+            options['complib'] = _default_compressor
 
         store = HDFStore(self.scratchpath, 'w', **options)
         try:
diff --git a/pandas/src/reindex.pyx b/pandas/src/reindex.pyx
index 51fdb54eb789c..731e49a18b482 100644
--- a/pandas/src/reindex.pyx
+++ b/pandas/src/reindex.pyx
@@ -309,3 +309,438 @@ def getMergeVec(ndarray[object] values, dict oldMap):
             fillVec[i] = -1
 
     return fillVec, mask.astype(bool)
+
+def ordered_left_join(ndarray[object] left, ndarray[object] right):
+    # cdef dict right_map = map_indices_buf(right)
+    # return getMergeVec(left, right_map)
+    cdef:
+        Py_ssize_t i, j, k, n
+        ndarray[int32_t] indexer
+        ndarray[uint8_t, cast=True] mask
+        object val
+
+    i = 0
+    j = 0
+    n = len(left)
+    k = len(right)
+
+    indexer = np.zeros(n, dtype=np.int32)
+    mask = np.ones(n, dtype=np.bool)
+
+    for i from 0 <= i < n:
+        val = left[i]
+
+        while j < k and right[j] < val:
+            j += 1
+
+        if j == k:
+            break
+
+        if val == right[j]:
+            indexer[i] = j
+            mask[i] = 0
+
+    return indexer, mask
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def ordered_left_join_int64(ndarray[int64_t] left, ndarray[int64_t] right):
+    cdef:
+        Py_ssize_t i, j, k, n
+        ndarray[int32_t] indexer
+        ndarray[uint8_t, cast=True] mask
+        int64_t val
+
+    i = 0
+    j = 0
+    n = len(left)
+    k = len(right)
+
+    indexer = np.zeros(n, dtype=np.int32)
+    mask = np.ones(n, dtype=np.bool)
+
+    for i from 0 <= i < n:
+        val = left[i]
+
+        while j < k and right[j] < val:
+            j += 1
+
+        if j == k:
+            break
+
+        if val == right[j]:
+            indexer[i] = j
+            mask[i] = 0
+
+    return indexer, mask
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_2d(ndarray[int64_t] left, ndarray[int64_t] right,
+                 ndarray[float64_t, ndim=2] lvalues,
+                 ndarray[float64_t, ndim=2] rvalues,
+                 ndarray[float64_t, ndim=2] out):
+    cdef:
+        Py_ssize_t i, j, k, nright, nleft, kright, kleft
+        int64_t val
+
+    nleft, kleft = (<object> lvalues).shape
+    nright, kright = (<object> rvalues).shape
+
+    j = 0
+    for i from 0 <= i < nleft:
+        for k from 0 <= k < kleft:
+            out[i, k] = lvalues[i, k]
+
+        val = left[i]
+
+        while j < nright and right[j] < val:
+            j += 1
+
+        if j == nright:
+            for k from kleft <= k < kleft + kright:
+                out[i, k] = NaN
+            continue
+
+        if val == right[j]:
+            for k from kleft <= k < kleft + kright:
+                out[i, k] = rvalues[j, k - kleft]
+        else:
+            for k from kleft <= k < kleft + kright:
+                out[i, k] = NaN
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def left_join_1d(ndarray[int64_t] left, ndarray[int64_t] right,
+                 ndarray[float64_t] lvalues,
+                 ndarray[float64_t] rvalues,
+                 ndarray[float64_t, ndim=2] out):
+    cdef:
+        Py_ssize_t i, j, nright, nleft
+        int64_t val
+
+    nleft = len(lvalues)
+    nright = len(rvalues)
+
+    j = 0
+    for i from 0 <= i < nleft:
+        out[i, 0] = lvalues[i]
+
+        val = left[i]
+
+        while j < nright and right[j] < val:
+            j += 1
+
+        if j == nright:
+            out[i, 1] = NaN
+            continue
+
+        if val == right[j]:
+            out[i, 1] = rvalues[j]
+        else:
+            out[i, 1] = NaN
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def inner_join_indexer(ndarray[int64_t] left, ndarray[int64_t] right):
+    '''
+    Two-pass algorithm: the first pass counts the size of the result, the
+    second pass fills in the indexers and the joined values.
+    '''
+    cdef:
+        Py_ssize_t i, j, k, nright, nleft, count
+        int64_t val
+        ndarray[int32_t] lindexer, rindexer
+        ndarray[int64_t] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    assert(left.flags.contiguous)
+    assert(right.flags.contiguous)
+
+    cdef int64_t *lptr = <int64_t*> left.data
+    cdef int64_t *rptr = <int64_t*> right.data
+
+    i = 0
+    j = 0
+    count = 0
+    while i < nleft:
+        while j < nright and rptr[j] < lptr[i]:
+            j += 1
+
+        if j == nright:
+            break
+
+        if lptr[i] == rptr[j]:
+            count += 1
+            i += 1
+            j += 1
+        else:
+            while lptr[i] < rptr[j]:
+                i += 1
+
+    # do it again now that result size is known
+
+    lindexer = np.empty(count, dtype=np.int32)
+    rindexer = np.empty(count, dtype=np.int32)
+    result = np.empty(count, dtype=np.int64)
+
+    cdef int32_t *liptr = <int32_t*> lindexer.data
+    cdef int32_t *riptr = <int32_t*> rindexer.data
+    cdef int64_t *resptr = <int64_t*> result.data
+
+    i = 0
+    j = 0
+    count = 0
+    while i < nleft:
+        val = lptr[i]
+        while j < nright and rptr[j] < val:
+            j += 1
+
+        if j == nright:
+            break
+
+        if val == rptr[j]:
+            liptr[count] = i
+            riptr[count] = j
+            resptr[count] = val
+            count += 1
+            i += 1
+            j += 1
+        else:
+            while lptr[i] < rptr[j]:
+                i += 1
+
+    return result, lindexer, rindexer
+
+def _inner_join_count(ndarray[int64_t] left, ndarray[int64_t] right):
+    pass
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def outer_join_indexer(ndarray[int64_t] left, ndarray[int64_t] right):
+    cdef:
+        Py_ssize_t i, j, nright, nleft, count
+        int64_t lval, rval
+        ndarray[int32_t] lindexer, rindexer
+        ndarray[int64_t] result
+
+    nleft = len(left)
+    nright = len(right)
+
+    i = 0
+    j = 0
+    count = 0
+    while True:
+        if i == nleft:
+            if j == nright:
+                # we are done
+                break
+            else:
+                while j < nright:
+                    j += 1
+                    count += 1
+                break
+        elif j == nright:
+            while i < nleft:
+                i += 1
+                count += 1
+            break
+        else:
+            if left[i] == right[j]:
+                i += 1
+                j += 1
+            elif left[i] < right[j]:
+                i += 1
+            else:
+                j += 1
+
+            count += 1
+
+    lindexer = np.empty(count, dtype=np.int32)
+    rindexer = np.empty(count, dtype=np.int32)
+    result = np.empty(count, dtype=np.int64)
+
+    # do it again, but populate the indexers / result
+
+    i = 0
+    j = 0
+    count = 0
+    while True:
+        if i == nleft:
+            if j == nright:
+                # we are done
+                break
+            else:
+                while j < nright:
+                    lindexer[count] = -1
+                    rindexer[count] = j
+                    result[count] = right[j]
+                    j += 1
+                    count += 1
+                break
+        elif j == nright:
+            while i < nleft:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = left[i]
+                i += 1
+                count += 1
+            break
+        else:
+            lval = left[i]
+            rval = right[j]
+            if lval == rval:
+                lindexer[count] = i
+                rindexer[count] = j
+                result[count] = lval
+                i += 1
+                j += 1
+            elif lval < rval:
+                lindexer[count] = i
+                rindexer[count] = -1
+                result[count] = lval
+                i += 1
+            else:
+                lindexer[count] = -1
+                rindexer[count] = j
+                result[count] = rval
+                j += 1
+
+        count += 1
+
+    return result, lindexer, rindexer
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_axis0(ndarray[float64_t, ndim=2] values,
+               ndarray[int32_t] indexer,
+               out=None):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[float64_t, ndim=2] outbuf
+
+    n = len(indexer)
+    k = values.shape[1]
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    for i from 0 <= i < n:
+        idx = indexer[i]
+
+        if idx == -1:
+            for j from 0 <= j < k:
+                outbuf[i, j] = NaN
+        else:
+            for j from 0 <= j < k:
+                outbuf[i, j] = values[idx, j]
+
+    return outbuf
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_join_contiguous(ndarray[float64_t, ndim=2] lvalues,
+                         ndarray[float64_t, ndim=2] rvalues,
+                         ndarray[int32_t] lindexer,
+                         ndarray[int32_t] rindexer,
+                         ndarray out):
+    cdef:
+        Py_ssize_t i, j, rk, lk, n, lidx, ridx
+        float64_t *outbuf
+
+    assert(out.flags.contiguous)
+
+    outbuf = <float64_t*> out.data
+
+    n = len(lindexer)
+    lk = lvalues.shape[1]
+    rk = rvalues.shape[1]
+
+    for i from 0 <= i < n:
+        lidx = lindexer[i]
+        ridx = rindexer[i]
+
+        # left-hand block of the output row
+        if lidx == -1:
+            for j from 0 <= j < lk:
+                outbuf[0] = NaN
+                outbuf = outbuf + 1
+        else:
+            for j from 0 <= j < lk:
+                outbuf[0] = lvalues[lidx, j]
+                outbuf = outbuf + 1
+
+        # right-hand block of the output row
+        if ridx == -1:
+            for j from 0 <= j < rk:
+                outbuf[0] = NaN
+                outbuf = outbuf + 1
+        else:
+            for j from 0 <= j < rk:
+                outbuf[0] = rvalues[ridx, j]
+                outbuf = outbuf + 1
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_axis1(ndarray[float64_t, ndim=2] values,
+               ndarray[int32_t] indexer,
+               out=None):
+    cdef:
+        Py_ssize_t i, j, k, n, idx
+        ndarray[float64_t, ndim=2] outbuf
+
+    n = values.shape[0]
+    k = len(indexer)
+
+    if out is None:
+        outbuf = np.empty((n, k), dtype=values.dtype)
+    else:
+        outbuf = out
+
+    for j from 0 <= j < k:
+        idx = indexer[j]
+
+        if idx == -1:
+            for i from 0 <= i < n:
+                outbuf[i, j] = NaN
+        else:
+            for i from 0 <= i < n:
+                outbuf[i, j] = values[i, idx]
+
+    return outbuf
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def take_1d(ndarray[float64_t] values, ndarray[int32_t] indexer,
+            out=None):
+    cdef:
+        Py_ssize_t i, n, idx
+        ndarray[float64_t] outbuf
+
+    n = len(indexer)
+
+    if out is None:
+        outbuf = np.empty(n, dtype=values.dtype)
+    else:
+        outbuf = out
+
+    for i from 0 <= i < n:
+        idx = indexer[i]
+        if idx == -1:
+            outbuf[i] = NaN
+        else:
+            outbuf[i] = values[idx]
+
+    return outbuf
+
+def ordered_put_indexer(ndarray[int64_t] left, ndarray[int64_t] right,
+                        ndarray[float64_t, ndim=2] lvalues,
+                        ndarray[float64_t, ndim=2] rvalues,
+                        ndarray[float64_t, ndim=2] out):
+    pass
+
+def ordered_outer_join(ndarray[int64_t] left, ndarray[int64_t] right):
+    cdef:
+        Py_ssize_t i, j, k, nright, nleft, kright, kleft
+        int64_t val
+    pass
+
+
+def ordered_inner_join(ndarray[object] left, ndarray[object] right):
+    pass
+
diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py
index 1a7bfc1154fac..0af72752e17e8 100644
--- a/pandas/stats/tests/test_ols.py
+++ b/pandas/stats/tests/test_ols.py
@@ -53,10 +53,12 @@ def testOLSWithDatasets(self):
         self.checkDataSet(datasets.cpunish.load(), skip_moving=True)
         self.checkDataSet(datasets.longley.load(), skip_moving=True)
         self.checkDataSet(datasets.stackloss.load(), skip_moving=True)
-        self.checkDataSet(datasets.ccard.load(), 39, 49) # one col in X all 0s
        self.checkDataSet(datasets.copper.load())
self.checkDataSet(datasets.scotland.load()) + # degenerate case fails on some platforms + # self.checkDataSet(datasets.ccard.load(), 39, 49) # one col in X all 0s + def checkDataSet(self, dataset, start=None, end=None, skip_moving=False): exog = dataset.exog[start : end] endog = dataset.endog[start : end] diff --git a/pandas/tests/test_daterange.py b/pandas/tests/test_daterange.py index bcb408ab43fc6..b3c4651b03ac1 100644 --- a/pandas/tests/test_daterange.py +++ b/pandas/tests/test_daterange.py @@ -83,6 +83,11 @@ def test_comparison(self): self.assert_(comp[11]) self.assert_(not comp[9]) + def test_copy(self): + cp = self.rng.copy() + repr(cp) + self.assert_(cp.equals(self.rng)) + def test_repr(self): # only really care that it works repr(self.rng) diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 0d4b12cff76fc..1b0625d41c5e5 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -13,7 +13,8 @@ import pandas.core.datetools as datetools from pandas.core.index import NULL_INDEX -from pandas.core.api import (DataFrame, Index, Series, notnull, isnull) +from pandas.core.api import (DataFrame, Index, Series, notnull, isnull, + MultiIndex) from pandas.util.testing import (assert_almost_equal, assert_series_equal, @@ -409,8 +410,15 @@ def test_fancy_index_int_labels_exceptions(self): (slice(None, None), 'E'), 1) def test_setitem_fancy_mixed_2d(self): + self.mixed_frame.ix[:5, ['C', 'B', 'A']] = 5 + result = self.mixed_frame.ix[:5, ['C', 'B', 'A']] + self.assert_((result.values == 5).all()) + + self.mixed_frame.ix[5] = np.nan + self.assert_(isnull(self.mixed_frame.ix[5]).all()) + self.assertRaises(Exception, self.mixed_frame.ix.__setitem__, - (slice(0, 5), ['C', 'B', 'A']), 5) + 5, self.mixed_frame.ix[6]) def test_getitem_fancy_1d(self): f = self.frame @@ -591,6 +599,14 @@ def test_getitem_setitem_fancy_exceptions(self): self.assertRaises(Exception, ix.__getitem__, mask) self.assertRaises(Exception, ix.__setitem__, mask, 1.) 
+ def test_setitem_single_column_mixed(self): + df = DataFrame(randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], + columns=['foo', 'bar', 'baz']) + df['str'] = 'qux' + df.ix[::2, 'str'] = nan + expected = [nan, 'qux', nan, 'qux', nan] + assert_almost_equal(df['str'].values, expected) + def test_setitem_fancy_exceptions(self): pass @@ -686,7 +702,8 @@ def test_add_prefix_suffix(self): self.assert_(np.array_equal(with_suffix.columns, expected)) -class TestDataFrame(unittest.TestCase, CheckIndexing): +class TestDataFrame(unittest.TestCase, CheckIndexing, + SafeForSparse): klass = DataFrame def setUp(self): @@ -1005,6 +1022,10 @@ def test_constructor_scalar(self): expected = DataFrame({"a" : [0, 0, 0]}, index=idx) assert_frame_equal(df, expected) + def test_constructor_Series_copy_bug(self): + df = DataFrame(self.frame['A'], index=self.frame.index, columns=['A']) + df.copy() + def test_astype(self): casted = self.frame.astype(int) expected = DataFrame(self.frame.values.astype(int), @@ -1151,6 +1172,34 @@ def test_repr(self): common.set_printoptions(precision=3, column_space=10) repr(self.frame) + def test_repr_tuples(self): + buf = StringIO() + + arr = np.empty(10, dtype=object) + arr[:] = zip(range(10), range(10)) + df = DataFrame({'tups' : arr}) + repr(df) + df.to_string(colSpace=10, buf=buf) + + def test_to_string_unicode(self): + buf = StringIO() + + unicode_values = [u'\u03c3'] * 10 + unicode_values = np.array(unicode_values, dtype=object) + df = DataFrame({'unicode' : unicode_values}) + df.to_string(colSpace=10, buf=buf) + + def test_to_string_unicode_columns(self): + df = DataFrame({u'\u03c3' : np.arange(10.)}) + + buf = StringIO() + df.to_string(buf=buf) + buf.getvalue() + + buf = StringIO() + df.info(buf=buf) + buf.getvalue() + def test_head_tail(self): assert_frame_equal(self.frame.head(), self.frame[:5]) assert_frame_equal(self.frame.tail(), self.frame[-5:]) @@ -1183,17 +1232,6 @@ def test_to_string(self): frame = DataFrame(index=np.arange(1000)) frame.to_string(buf=buf) - def test_to_string_unicode_columns(self): - df = DataFrame({u'\u03c3' : np.arange(10.)}) - - buf = StringIO() - df.to_string(buf=buf) - buf.getvalue() - - buf = StringIO() - df.info(buf=buf) - buf.getvalue() - def test_insert(self): df = DataFrame(np.random.randn(5, 3), index=np.arange(5), columns=['c', 'b', 'a']) @@ -1462,11 +1500,59 @@ def test_to_csv_from_csv(self): os.remove(path) + def test_to_csv_multiindex(self): + path = '__tmp__' + + frame = self.frame + old_index = frame.index + new_index = MultiIndex.from_arrays(np.arange(len(old_index)*2).reshape(2,-1)) + frame.index = new_index + frame.to_csv(path, header=False) + frame.to_csv(path, cols=['A', 'B']) + + + # round trip + frame.to_csv(path) + + df = DataFrame.from_csv(path, index_col=[0,1]) + + assert_frame_equal(frame, df) + self.frame.index = old_index # needed if setUP becomes a classmethod + + # try multiindex with dates + tsframe = self.tsframe + old_index = tsframe.index + new_index = [old_index, np.arange(len(old_index))] + tsframe.index = MultiIndex.from_arrays(new_index) + + tsframe.to_csv(path, index_label = ['time','foo']) + recons = DataFrame.from_csv(path, index_col=[0,1]) + assert_frame_equal(tsframe, recons) + + # do not load index + tsframe.to_csv(path) + recons = DataFrame.from_csv(path, index_col=None) + np.testing.assert_equal(len(recons.columns), len(tsframe.columns) + 2) + + # no index + tsframe.to_csv(path, index=False) + recons = DataFrame.from_csv(path, index_col=None) + assert_almost_equal(recons.values, self.tsframe.values) + 
self.tsframe.index = old_index # needed if setUP becomes classmethod + + os.remove(path) + def test_info(self): io = StringIO() self.frame.info(buf=io) self.tsframe.info(buf=io) + def test_dtypes(self): + self.mixed_frame['bool'] = self.mixed_frame['A'] > 0 + result = self.mixed_frame.dtypes + expected = self.mixed_frame.dtypes + assert_series_equal(result, expected) + def test_append(self): begin_index = self.frame.index[:5] end_index = self.frame.index[5:] @@ -1505,6 +1591,22 @@ def test_append(self): assert_frame_equal(self.frame, appended) self.assert_(appended is not self.frame) + def test_append_records(self): + arr1 = np.zeros((2,),dtype=('i4,f4,a10')) + arr1[:] = [(1,2.,'Hello'),(2,3.,"World")] + + arr2 = np.zeros((3,),dtype=('i4,f4,a10')) + arr2[:] = [(3, 4.,'foo'), + (5, 6.,"bar"), + (7., 8., 'baz')] + + df1 = DataFrame(arr1) + df2 = DataFrame(arr2) + + result = df1.append(df2, ignore_index=True) + expected = DataFrame(np.concatenate((arr1, arr2))) + assert_frame_equal(result, expected) + def test_asfreq(self): offset_monthly = self.tsframe.asfreq(datetools.bmonthEnd) rule_monthly = self.tsframe.asfreq('EOM') @@ -1606,6 +1708,22 @@ def test_corrwith(self): for row in index[:4]: assert_almost_equal(correls[row], df1.ix[row].corr(df2.ix[row])) + def test_corrwith_with_objects(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame() + cols = ['A', 'B', 'C', 'D'] + + df1['obj'] = 'foo' + df2['obj'] = 'bar' + + result = df1.corrwith(df2) + expected = df1.ix[:, cols].corrwith(df2.ix[:, cols]) + assert_series_equal(result, expected) + + result = df1.corrwith(df2, axis=1) + expected = df1.ix[:, cols].corrwith(df2.ix[:, cols], axis=1) + assert_series_equal(result, expected) + def test_dropEmptyRows(self): N = len(self.frame.index) mat = randn(N) @@ -1820,6 +1938,17 @@ def test_pivot(self): df = DataFrame.from_records(lp.toRecords()) assert_frame_equal(df.pivot('major', 'minor'), lp.unstack()) + def test_pivot_duplicates(self): + data = DataFrame({'a' : ['bar', 'bar', 'foo', 'foo', 'foo'], + 'b' : ['one', 'two', 'one', 'one', 'two'], + 'c' : [1., 2., 3., 3., 4.]}) + # expected = DataFrame([[1., 2.], [3., 4.]], index=['bar', 'foo'], + # columns=['one', 'two']) + # result = data.pivot('a', 'b', 'c') + # assert_frame_equal(result, expected) + + self.assertRaises(Exception, data.pivot, 'a', 'b', 'c') + def test_reindex(self): newFrame = self.frame.reindex(self.ts1.index) @@ -1986,6 +2115,11 @@ def test_rename(self): renamed = self.frame.T.rename(index={'C' : 'foo', 'D' : 'bar'}) self.assert_(np.array_equal(renamed.index, ['A', 'B', 'foo', 'bar'])) + def test_rename_nocopy(self): + renamed = self.frame.rename(columns={'C' : 'foo'}, copy=False) + renamed['foo'] = 1. + self.assert_((self.frame['C'] == 1.).all()) + #---------------------------------------------------------------------- # Time series related @@ -2362,6 +2496,19 @@ def test_get_X_columns(self): self.assertEquals(df._get_numeric_columns(), ['a', 'e']) # self.assertEquals(df._get_object_columns(), ['c', 'd']) + def test_get_numeric_data(self): + df = DataFrame({'a' : 1., 'b' : 2, 'c' : 'foo'}, + index=np.arange(10)) + + result = df._get_numeric_data() + expected = df.ix[:, ['a', 'b']] + assert_frame_equal(result, expected) + + only_obj = df.ix[:, ['c']] + result = only_obj._get_numeric_data() + expected = df.ix[:, []] + assert_frame_equal(result, expected) + def test_statistics(self): # unnecessary? 
sumFrame = self.frame.apply(np.sum) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index aa1cdd61e012b..d9418c2d85688 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -519,12 +519,15 @@ def _check_op(op): def test_groupby_as_index(self): data = self.df - grouped = data.groupby(['A'], as_index=False) + # single-key + grouped = data.groupby('A', as_index=False) result = grouped.mean() expected = data.groupby(['A']).mean() expected.insert(0, 'A', expected.index) expected.index = np.arange(len(expected)) + assert_frame_equal(result, expected) + # multi-key grouped = data.groupby(['A', 'B'], as_index=False) result = grouped.mean() expected = data.groupby(['A', 'B']).mean() @@ -533,6 +536,7 @@ def test_groupby_as_index(self): expected.insert(0, 'A', arrays[0]) expected.insert(1, 'B', arrays[1]) expected.index = np.arange(len(expected)) + assert_frame_equal(result, expected) def test_groupby_multiple_key(self): df = tm.makeTimeDataFrame() diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 7ccdc5cc5ef4b..c1ecd405d08c0 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -9,7 +9,7 @@ import pandas.core.datetools as datetools from pandas.core.index import MultiIndex, NULL_INDEX -from pandas.core.api import (DataFrame, Index, Series, notnull, isnull) +from pandas import Panel, DataFrame, Index, Series, notnull, isnull from pandas.util.testing import (assert_almost_equal, assert_series_equal, @@ -220,6 +220,12 @@ def test_sortlevel(self): self.assertRaises(Exception, self.frame.delevel()['A'].sortlevel) + def test_sortlevel_by_name(self): + self.frame.index.names = ['first', 'second'] + result = self.frame.sortlevel(level='second') + expected = self.frame.sortlevel(level=1) + assert_frame_equal(result, expected) + def test_sortlevel_mixed(self): sorted_before = self.frame.sortlevel(1) @@ -252,6 +258,18 @@ def _check_counts(frame, axis=0): df = tm.makeTimeDataFrame() self.assertRaises(Exception, df.count, level=0) + def test_count_level_corner(self): + s = self.frame['A'][:0] + result = s.count(level=0) + expected = Series(0, index=s.index.levels[0]) + assert_series_equal(result, expected) + + df = self.frame[:0] + result = df.count(level=0) + expected = DataFrame({}, index=s.index.levels[0], + columns=df.columns).fillna(0).astype(int) + assert_frame_equal(result, expected) + def test_unstack(self): # just check that it works for now unstacked = self.ymd.unstack() @@ -325,6 +343,14 @@ def test_swaplevel(self): back = swapped.swaplevel(0, 1) self.assert_(back.index.equals(self.frame.index)) + def test_swaplevel_panel(self): + panel = Panel({'ItemA' : self.frame, + 'ItemB' : self.frame * 2}) + + result = panel.swaplevel(0, 1, axis='major') + expected = panel.copy() + expected.major_axis = expected.major_axis.swaplevel(0, 1) + def test_insert_index(self): df = self.ymd[:5].T df[2000, 1, 10] = df[2000, 1, 7] diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 31421aacccd6d..b4cafb42bf485 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -8,7 +8,7 @@ import numpy as np -from pandas.core.api import DataFrame, Index, isnull, notnull, pivot +from pandas import DataFrame, Index, isnull, notnull, pivot, MultiIndex from pandas.core.datetools import bday from pandas.core.frame import group_agg from pandas.core.panel import Panel, LongPanel @@ -925,6 +925,11 @@ def test_ops_differently_indexed(self): 
assert_series_equal(self.panel['foo'].reindex(lp2.index), lp2['ItemA']) + def test_ops_scalar(self): + result = self.panel.mul(2) + expected = DataFrame.__mul__(self.panel, 2) + assert_frame_equal(result, expected) + def test_combineFrame(self): wp = self.panel.to_wide() result = self.panel.add(wp['ItemA']) @@ -1107,22 +1112,22 @@ def test_add_prefix(self): def test_pivot(self): from pandas.core.reshape import _slow_pivot - df = pivot(np.array([1, 2, 3, 4, 5]), - np.array(['a', 'b', 'c', 'd', 'e']), - np.array([1, 2, 3, 5, 4.])) + one, two, three = (np.array([1, 2, 3, 4, 5]), + np.array(['a', 'b', 'c', 'd', 'e']), + np.array([1, 2, 3, 5, 4.])) + df = pivot(one, two, three) self.assertEqual(df['a'][1], 1) self.assertEqual(df['b'][2], 2) self.assertEqual(df['c'][3], 3) self.assertEqual(df['d'][4], 5) self.assertEqual(df['e'][5], 4) + assert_frame_equal(df, _slow_pivot(one, two, three)) # weird overlap, TODO: test? a, b, c = (np.array([1, 2, 3, 4, 4]), np.array(['a', 'a', 'a', 'a', 'a']), np.array([1., 2., 3., 4., 5.])) - df = pivot(a, b, c) - expected = _slow_pivot(a, b, c) - assert_frame_equal(df, expected) + self.assertRaises(Exception, pivot, a, b, c) # corner case, empty df = pivot(np.array([]), np.array([]), np.array([])) @@ -1150,6 +1155,12 @@ def test_monotonic(): assert not panelm._monotonic(neg2) +def test_panel_index(): + index = panelm.panel_index([1,2,3,4], [1,2,3]) + expected = MultiIndex.from_arrays([np.tile([1,2,3,4], 3), + np.repeat([1,2,3], 4)]) + assert(index.equals(expected)) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 97c61dc7f789a..cda6ecd98b44d 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -752,6 +752,12 @@ def test_shift(self): unshifted = self.ts.shift(0) assert_series_equal(unshifted, self.ts) + def test_shift_int(self): + ts = self.ts.astype(int) + shifted = ts.shift(1) + expected = ts.astype(float).shift(1) + assert_series_equal(shifted, expected) + def test_truncate(self): offset = datetools.bday diff --git a/pandas/tests/test_sparse.py b/pandas/tests/test_sparse.py index ec948729266ea..025ec042d10af 100644 --- a/pandas/tests/test_sparse.py +++ b/pandas/tests/test_sparse.py @@ -233,6 +233,11 @@ def test_copy_astype(self): assert_sp_series_equal(zbcop, self.zbseries) assert_sp_series_equal(zicop, self.ziseries) + # no deep copy + view = self.bseries.copy(deep=False) + view.sp_values[:5] = 5 + self.assert_((self.bseries.sp_values[:5] == 5).all()) + def test_astype(self): self.assertRaises(Exception, self.bseries.astype, np.int_) @@ -909,6 +914,11 @@ def test_append(self): appended = a.append(b) assert_sp_frame_equal(appended, self.frame) + a = self.frame.ix[:5, :3] + b = self.frame.ix[5:] + appended = a.append(b) + assert_sp_frame_equal(appended.ix[:, :3], self.frame.ix[:, :3]) + def test_apply(self): applied = self.frame.apply(np.sqrt) self.assert_(isinstance(applied, SparseDataFrame)) diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py index d2cb8119ac566..73227e624baa5 100644 --- a/pandas/tests/test_tseries.py +++ b/pandas/tests/test_tseries.py @@ -2,8 +2,9 @@ import numpy as np from pandas import Index +from pandas.util.testing import assert_almost_equal import pandas.util.testing as common -import pandas._tseries as tseries +import pandas._tseries as lib class TestTseriesUtil(unittest.TestCase): @@ -26,7 +27,7 @@ def test_getMergeVec(self): old = Index([1, 5, 10]) new 
= Index(range(12)) - filler, mask = tseries.getFillVec(old, new, old.indexMap, + filler, mask = lib.getFillVec(old, new, old.indexMap, new.indexMap, None) expect_filler = [-1, 0, -1, -1, -1, 1, -1, -1, -1, -1, 2, -1] @@ -39,7 +40,7 @@ def test_getMergeVec(self): # corner case old = Index([1, 4]) new = Index(range(5, 10)) - filler, mask = tseries.getFillVec(old, new, old.indexMap, + filler, mask = lib.getFillVec(old, new, old.indexMap, new.indexMap, None) expect_filler = [-1, -1, -1, -1, -1] @@ -51,7 +52,7 @@ def test_backfill(self): old = Index([1, 5, 10]) new = Index(range(12)) - filler, mask = tseries.getFillVec(old, new, old.indexMap, + filler, mask = lib.getFillVec(old, new, old.indexMap, new.indexMap, 'BACKFILL') expect_filler = [0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1] @@ -64,7 +65,7 @@ def test_backfill(self): # corner case old = Index([1, 4]) new = Index(range(5, 10)) - filler, mask = tseries.getFillVec(old, new, old.indexMap, + filler, mask = lib.getFillVec(old, new, old.indexMap, new.indexMap, 'BACKFILL') expect_filler = [-1, -1, -1, -1, -1] @@ -76,7 +77,7 @@ def test_pad(self): old = Index([1, 5, 10]) new = Index(range(12)) - filler, mask = tseries.getFillVec(old, new, old.indexMap, + filler, mask = lib.getFillVec(old, new, old.indexMap, new.indexMap, 'PAD') expect_filler = [-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2] @@ -89,7 +90,7 @@ def test_pad(self): # corner case old = Index([5, 10]) new = Index(range(5)) - filler, mask = tseries.getFillVec(old, new, old.indexMap, + filler, mask = lib.getFillVec(old, new, old.indexMap, new.indexMap, 'PAD') expect_filler = [-1, -1, -1, -1, -1] @@ -97,5 +98,39 @@ def test_pad(self): self.assert_(np.array_equal(filler, expect_filler)) self.assert_(np.array_equal(mask, expect_mask)) +def test_inner_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = lib.inner_join_indexer(a, b) + + index_exp = np.array([3, 5], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([2, 4]) + bexp = np.array([1, 2]) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + +def test_outer_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = lib.outer_join_indexer(a, b) + + index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int32) + bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4]) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + class TestMoments(unittest.TestCase): pass + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/util/testing.py b/pandas/util/testing.py index db314f6eb7cd2..1fae8fed79ea2 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -5,6 +5,8 @@ import string import sys +from distutils.version import LooseVersion + from numpy.random import randn import numpy as np @@ -72,8 +74,11 @@ def assert_almost_equal(a, b): if isiterable(a): np.testing.assert_(isiterable(b)) np.testing.assert_equal(len(a), len(b)) - for i in xrange(len(a)): - assert_almost_equal(a[i], b[i]) + if np.array_equal(a, b): + return True + else: + for i in xrange(len(a)): + assert_almost_equal(a[i], b[i]) return True err_msg = lambda a, b: 'expected %.5f but got %.5f' % (a, b) @@ -216,3 +221,66 @@ def makeLongPanel(): return wp.to_long() +# Dependency checks. 
Copied this from Nipy/Nipype (Copyright of +# respective developers, license: BSD-3) +def package_check(pkg_name, version=None, app='pandas', checker=LooseVersion, + exc_failed_import=ImportError, + exc_failed_check=RuntimeError): + """Check that the minimal version of the required package is installed. + + Parameters + ---------- + pkg_name : string + Name of the required package. + version : string, optional + Minimal version number for required package. + app : string, optional + Application that is performing the check. For instance, the + name of the tutorial being executed that depends on specific + packages. + checker : object, optional + The class that will perform the version checking. Default is + distutils.version.LooseVersion. + exc_failed_import : Exception, optional + Class of the exception to be thrown if import failed. + exc_failed_check : Exception, optional + Class of the exception to be thrown if version check failed. + + Examples + -------- + package_check('numpy', '1.3') + package_check('networkx', '1.0', 'tutorial1') + + """ + + if app: + msg = '%s requires %s' % (app, pkg_name) + else: + msg = 'module requires %s' % pkg_name + if version: + msg += ' with version >= %s' % (version,) + try: + mod = __import__(pkg_name) + except ImportError: + raise exc_failed_import(msg) + if not version: + return + try: + have_version = mod.__version__ + except AttributeError: + raise exc_failed_check('Cannot find version for %s' % pkg_name) + if checker(have_version) < checker(version): + raise exc_failed_check(msg) + +def skip_if_no_package(*args, **kwargs): + """Raise SkipTest if package_check fails + + Parameters + ---------- + *args Positional parameters passed to `package_check` + *kwargs Keyword parameters passed to `package_check` + """ + from nose import SkipTest + package_check(exc_failed_import=SkipTest, + exc_failed_check=SkipTest, + *args, **kwargs) diff --git a/pandas/version.py b/pandas/version.py deleted file mode 100644 index 9baf1668d4acc..0000000000000 --- a/pandas/version.py +++ /dev/null @@ -1 +0,0 @@ -version = '0.4.0' diff --git a/scripts/bench_join.R b/scripts/bench_join.R new file mode 100644 index 0000000000000..dfbbcea86d587 --- /dev/null +++ b/scripts/bench_join.R @@ -0,0 +1,39 @@ +library(xts) + +iterations <- 50 + +ns = c(100, 1000, 10000, 100000, 1000000) +kinds = c("outer", "left", "inner") + +result = matrix(0, nrow=3, ncol=length(ns)) +n <- 100000 +pct.overlap <- 0.2 + +k <- 1 + +for (ni in 1:length(ns)){ + n <- ns[ni] + rng1 <- 1:n + offset <- as.integer(n * pct.overlap) + rng2 <- rng1 + offset + x <- xts(matrix(rnorm(n * k), nrow=n, ncol=k), + as.POSIXct(Sys.Date()) + rng1) + y <- xts(matrix(rnorm(n * k), nrow=n, ncol=k), + as.POSIXct(Sys.Date()) + rng2) + timing <- numeric() + for (i in 1:3) { + kind = kinds[i] + for(j in 1:iterations) { + gc() # just to be sure + timing[j] <- system.time(merge(x,y,join=kind))[3] + } + #timing <- system.time(for (j in 1:iterations) merge.xts(x, y, join=kind), + # gcFirst=F) + #timing <- as.list(timing) + result[i, ni] <- mean(timing) * 1000 + #result[i, ni] = (timing$elapsed / iterations) * 1000 + } +} + +rownames(result) <- kinds +colnames(result) <- log10(ns) diff --git a/scripts/bench_join.py b/scripts/bench_join.py new file mode 100644 index 0000000000000..4ab81067c2349 --- /dev/null +++ b/scripts/bench_join.py @@ -0,0 +1,189 @@ +import numpy as np +import pandas._tseries as lib +from pandas import * +from copy import deepcopy +import time + +n = 1000000 +K = 1 +pct_overlap = 0.2 + +a = np.arange(n, 
dtype=np.int64) +b = np.arange(n * pct_overlap, n*(1+pct_overlap), dtype=np.int64) + +dr1 = DateRange('1/1/2000', periods=n, offset=datetools.Minute()) +dr2 = DateRange(dr1[int(pct_overlap*n)], periods=n, offset=datetools.Minute(2)) + +aobj = a.astype(object) +bobj = b.astype(object) + +av = np.random.randn(n) +bv = np.random.randn(n) + +avf = np.random.randn(n, K) +bvf = np.random.randn(n, K) + +a_series = Series(av, index=a) +b_series = Series(bv, index=b) + +a_frame = DataFrame(avf, index=dr1, columns=range(K)) +b_frame = DataFrame(bvf, index=dr2, columns=range(K, 2 * K)) + +def do_left_join(a, b, av, bv): + out = np.empty((len(a), 2)) + lib.left_join_1d(a, b, av, bv, out) + return out + +def do_outer_join(a, b, av, bv): + result_index, aindexer, bindexer = lib.outer_join_indexer(a, b) + result = np.empty((2, len(result_index))) + lib.take_1d(av, aindexer, result[0]) + lib.take_1d(bv, bindexer, result[1]) + return result_index, result + +def do_inner_join(a, b, av, bv): + result_index, aindexer, bindexer = lib.inner_join_indexer(a, b) + result = np.empty((2, len(result_index))) + lib.take_1d(av, aindexer, result[0]) + lib.take_1d(bv, bindexer, result[1]) + return result_index, result + +from line_profiler import LineProfiler +prof = LineProfiler() + +from pandas.util.testing import set_trace + +def do_left_join_python(a, b, av, bv): + indexer, mask = lib.ordered_left_join_int64(a, b) + + n, ak = av.shape + _, bk = bv.shape + result_width = ak + bk + + result = np.empty((result_width, n), dtype=np.float64) + result[:ak] = av.T + + bchunk = result[ak:] + _take_multi(bv.T, indexer, bchunk) + np.putmask(bchunk, np.tile(mask, bk), np.nan) + return result + +def _take_multi(data, indexer, out): + if not data.flags.c_contiguous: + data = data.copy() + for i in xrange(data.shape[0]): + data[i].take(indexer, out=out[i]) + +def do_left_join_multi(a, b, av, bv): + n, ak = av.shape + _, bk = bv.shape + result = np.empty((n, ak + bk), dtype=np.float64) + lib.left_join_2d(a, b, av, bv, result) + return result + +def do_outer_join_multi(a, b, av, bv): + n, ak = av.shape + _, bk = bv.shape + result_index, rindexer, lindexer = lib.outer_join_indexer(a, b) + result = np.empty((len(result_index), ak + bk), dtype=np.float64) + lib.take_join_contiguous(av, bv, lindexer, rindexer, result) + # result = np.empty((ak + bk, len(result_index)), dtype=np.float64) + # lib.take_axis0(av, rindexer, out=result[:ak].T) + # lib.take_axis0(bv, lindexer, out=result[ak:].T) + return result_index, result + +def do_inner_join_multi(a, b, av, bv): + n, ak = av.shape + _, bk = bv.shape + result_index, rindexer, lindexer = lib.inner_join_indexer(a, b) + result = np.empty((len(result_index), ak + bk), dtype=np.float64) + lib.take_join_contiguous(av, bv, lindexer, rindexer, result) + # result = np.empty((ak + bk, len(result_index)), dtype=np.float64) + # lib.take_axis0(av, rindexer, out=result[:ak].T) + # lib.take_axis0(bv, lindexer, out=result[ak:].T) + return result_index, result + +def do_left_join_multi_v2(a, b, av, bv): + indexer, mask = lib.ordered_left_join_int64(a, b) + bv_taken = bv.take(indexer, axis=0) + np.putmask(bv_taken, mask.repeat(bv.shape[1]), np.nan) + return np.concatenate((av, bv_taken), axis=1) + + +def do_left_join_series(a, b): + return b.reindex(a.index) + +def do_left_join_frame(a, b): + a.index._indexMap = None + b.index._indexMap = None + return a.join(b, how='left') + + +# a = np.array([1, 2, 3, 4, 5], dtype=np.int64) +# b = np.array([0, 3, 5, 7, 9], dtype=np.int64) +# print lib.inner_join_indexer(a, 
b) + +out = np.empty((10, 120000)) + +def join(a, b, av, bv, how="left"): + func_dict = {'left' : do_left_join_multi, + 'outer' : do_outer_join_multi, + 'inner' : do_inner_join_multi} + + f = func_dict[how] + return f(a, b, av, bv) + +def bench_python(n=100000, pct_overlap=0.20): + import gc + ns = [2, 3, 4, 5, 6] + iterations = 50 + K = 5 + pct_overlap = 0.2 + kinds = ['outer', 'left', 'inner'] + + all_results = {} + for logn in ns: + n = 10**logn + a = np.arange(n, dtype=np.int64) + b = np.arange(n * pct_overlap, n * pct_overlap + n, dtype=np.int64) + + avf = np.random.randn(n, K) + bvf = np.random.randn(n, K) + + all_results[logn] = result = {} + + for kind in kinds: + gc.disable() + _s = time.clock() + for _ in range(iterations): + join(a, b, avf, bvf, how=kind) + elapsed = time.clock() - _s + gc.enable() + result[kind] = (elapsed / iterations) * 1000 + + return DataFrame(all_results, index=kinds) + +def bench_xts(n=100000, pct_overlap=0.20): + from pandas.rpy.common import r + r('a <- 5') + + xrng = '1:%d' % n + + start = n * pct_overlap + 1 + end = n + start - 1 + yrng = '%d:%d' % (start, end) + + r('library(xts)') + + iterations = 500 + + kinds = ['left', 'outer', 'inner'] + result = {} + for kind in kinds: + r('x <- xts(rnorm(%d), as.POSIXct(Sys.Date()) + %s)' % (n, xrng)) + r('y <- xts(rnorm(%d), as.POSIXct(Sys.Date()) + %s)' % (n, yrng)) + stmt = 'for (i in 1:%d) merge(x, y, join="%s")' % (iterations, kind) + elapsed = r('as.list(system.time(%s, gcFirst=F))$elapsed' % stmt)[0] + result[kind] = (elapsed / iterations) * 1000 + return Series(result) + diff --git a/setup.py b/setup.py index f384deb7cd730..a350ba5ac8aa1 100755 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ AUTHOR = "AQR Capital Management, LLC" MAINTAINER = "Wes McKinney" MAINTAINER_EMAIL = "wesmckinn@gmail.com" -URL = "http://github.com/wesm/pandas" +URL = "http://pandas.sourceforge.net" DOWNLOAD_URL = '' CLASSIFIERS = [ 'Development Status :: 4 - Beta', @@ -117,7 +117,7 @@ MAJOR = 0 MINOR = 4 -MICRO = 0 +MICRO = 1 ISRELEASED = True VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) @@ -135,8 +135,6 @@ def write_version_py(filename='pandas/version.py'): cnt = """\ -from datetime import datetime - version = '%s' """ a = open(filename, 'w')
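For reference, the joining primitives added above compose into a complete
ordered join in a few lines. The sketch below mirrors `do_inner_join` /
`do_outer_join` from `scripts/bench_join.py`; the helper name `join_sorted`
is illustrative only (not part of the patch), and it assumes
`pandas._tseries` has been rebuilt with the new Cython routines::

    import numpy as np
    import pandas._tseries as lib

    def join_sorted(a, b, av, bv, how='inner'):
        # a, b: sorted int64 key arrays; av, bv: float64 values aligned
        # to those keys
        indexer_func = {'inner': lib.inner_join_indexer,
                        'outer': lib.outer_join_indexer}[how]
        result_index, aindexer, bindexer = indexer_func(a, b)

        # each indexer maps an output row back to a source row; -1 marks
        # keys missing on that side, which take_1d fills with NaN
        out = np.empty((2, len(result_index)))
        lib.take_1d(av, aindexer, out[0])
        lib.take_1d(bv, bindexer, out[1])
        return result_index, out

    a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
    b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
    keys, joined = join_sorted(a, b, np.random.randn(5),
                               np.random.randn(5), how='outer')
    # keys -> [0 1 2 3 4 5 7 9]; joined carries NaN wherever a key is
    # absent from one side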