From 5b861c210bf300bd2572cf77e6ef6de7917db6d5 Mon Sep 17 00:00:00 2001 From: Devin Petersohn Date: Thu, 21 Feb 2019 09:14:51 -0800 Subject: [PATCH] Update pandas version to 0.24 (#451) * Update pandas version to 0.24 * pandas release notes: http://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.24.0.html * Update imports to match changes in pandas * Add functionality for list of functions on `axis=1` for `apply` * Remove `pd.match` from API * Small regression in pandas requires regression in Modin * pandas-dev/pandas#25101 reports this issue * pandas-dev/pandas#25102 resolves this issue * TODO: Expose `pandas.Array` once we properly test * Finishing regression update in `all`/`any` * Update to pandas 0.24 in setup.py and requirements.txt * Bump to 0.24.1 * Update API and add a test for the API * Add test for API, update API * Update API test and finalize compatibility updates * Revert bug * Cleanup and add tests * Fix bug in test * Lint * Lint * Remove print * Fix transform tests and bug in transform * Add list test for test_rename * Fix transform bug --- .coveragerc | 4 +- .travis.yml | 9 + .../query_compiler/pandas_query_compiler.py | 42 +- modin/engines/base/io.py | 2 + modin/pandas/__init__.py | 6 +- modin/pandas/dataframe.py | 426 +++++++++++------- modin/pandas/test/test_api.py | 56 +++ modin/pandas/test/test_dataframe.py | 139 +++++- modin/pandas/test/utils.py | 1 + requirements.txt | 2 +- setup.py | 2 +- 11 files changed, 493 insertions(+), 196 deletions(-) create mode 100644 modin/pandas/test/test_api.py diff --git a/.coveragerc b/.coveragerc index a118eb40b7f..259f0080456 100644 --- a/.coveragerc +++ b/.coveragerc @@ -20,4 +20,6 @@ exclude_lines = pragma: no cover # Don't complain if tests don't hit defensive assertion code: raise AssertionError - raise NotImplementedError \ No newline at end of file + raise NotImplementedError + raise ImportError + assert diff --git a/.travis.yml b/.travis.yml index eeb03dc47dd..5d7c1aace7d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -40,6 +40,15 @@ matrix: - black --check modin/ - flake8 . 
+ - os: linux + dist: trusty + env: + - PYTHON=3.6 + - API_COMPAT=1 + script: + - export PATH="$HOME/miniconda/bin:$PATH" + - python -m pytest modin/pandas/test/test_api.py + install: - ./.travis/install-dependencies.sh diff --git a/modin/data_management/query_compiler/pandas_query_compiler.py b/modin/data_management/query_compiler/pandas_query_compiler.py index cb27ef7f8c8..e9957ba8c41 100644 --- a/modin/data_management/query_compiler/pandas_query_compiler.py +++ b/modin/data_management/query_compiler/pandas_query_compiler.py @@ -13,7 +13,7 @@ is_datetime_or_timedelta_dtype, is_bool_dtype, ) -from pandas.core.index import _ensure_index +from pandas.core.index import ensure_index from pandas.core.base import DataError from modin.engines.base.block_partitions import BaseBlockPartitions @@ -97,7 +97,7 @@ def pandas_index_extraction(df, axis): return index_obj[new_indices] if compute_diff else new_indices def _validate_set_axis(self, new_labels, old_labels): - new_labels = _ensure_index(new_labels) + new_labels = ensure_index(new_labels) old_len = len(old_labels) new_len = len(new_labels) if old_len != new_len: @@ -118,14 +118,14 @@ def _get_columns(self): def _set_index(self, new_index): if self._index_cache is None: - self._index_cache = _ensure_index(new_index) + self._index_cache = ensure_index(new_index) else: new_index = self._validate_set_axis(new_index, self._index_cache) self._index_cache = new_index def _set_columns(self, new_columns): if self._columns_cache is None: - self._columns_cache = _ensure_index(new_columns) + self._columns_cache = ensure_index(new_columns) else: new_columns = self._validate_set_axis(new_columns, self._columns_cache) self._columns_cache = new_columns @@ -1388,11 +1388,16 @@ def _process_all_any(self, func, **kwargs): if bool_only: if axis == 0 and not axis_none and len(not_bool_col) == len(self.columns): - return pandas.Series(dtype=bool) - if len(not_bool_col) == len(self.columns): - query_compiler = self - else: - query_compiler = self.drop(columns=not_bool_col) + # TODO add this line back once pandas-dev/pandas#25101 is resolved + # return pandas.Series(dtype=bool) + pass + # See note above about pandas-dev/pandas#25101 + # TODO remove this when pandas 0.24.2 is released. + query_compiler = self + # if len(not_bool_col) == len(self.columns): + # query_compiler = self + # else: + # query_compiler = self.drop(columns=not_bool_col) else: if ( bool_only is False @@ -2492,11 +2497,22 @@ def _list_like_func(self, func, axis, *args, **kwargs): Returns: A new PandasQueryCompiler. """ - func_prepared = self._prepare_method(lambda df: df.apply(func, *args, **kwargs)) + func_prepared = self._prepare_method( + lambda df: df.apply(func, axis, *args, **kwargs) + ) new_data = self._map_across_full_axis(axis, func_prepared) - # When the function is list-like, the function names become the index - new_index = [f if isinstance(f, string_types) else f.__name__ for f in func] - return self.__constructor__(new_data, new_index, self.columns) + # When the function is list-like, the function names become the index/columns + new_index = ( + [f if isinstance(f, string_types) else f.__name__ for f in func] + if axis == 0 + else self.index + ) + new_columns = ( + [f if isinstance(f, string_types) else f.__name__ for f in func] + if axis == 1 + else self.columns + ) + return self.__constructor__(new_data, new_index, new_columns) def _callable_func(self, func, axis, *args, **kwargs): """Apply callable functions across given axis. 
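# ---------------------------------------------------------------------------
# Illustrative sketch (plain pandas, not part of the diff): the
# `_list_like_func` change above matches pandas' convention for a list of
# functions -- the function names become the index on `axis=0` and the
# columns on `axis=1`. Assuming the pandas 0.24.1 pinned by this patch:
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# axis=0: the function names become the resulting index.
df.apply(["sum", "min"])          # expected index: ["sum", "min"]

# axis=1 (the case this patch adds support for): names become the columns.
df.apply(["sum", "min"], axis=1)  # expected columns: ["sum", "min"]
# ---------------------------------------------------------------------------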
diff --git a/modin/engines/base/io.py b/modin/engines/base/io.py index 3eca588fe50..7cb6ec6d724 100644 --- a/modin/engines/base/io.py +++ b/modin/engines/base/io.py @@ -435,6 +435,7 @@ def to_sql( index_label=None, chunksize=None, dtype=None, + method=None, ): ErrorMessage.default_to_pandas("`to_sql`") df = qc.to_pandas() @@ -447,4 +448,5 @@ def to_sql( index_label=index_label, chunksize=chunksize, dtype=dtype, + method=method, ) diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index 01c44b511ac..4f880c0b24e 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -14,7 +14,6 @@ factorize, test, qcut, - match, Panel, date_range, period_range, @@ -64,7 +63,7 @@ from .plotting import Plotting as plotting from .. import __execution_engine__ as execution_engine -__pandas_version__ = "0.23.4" +__pandas_version__ = "0.24.1" if pandas.__version__ != __pandas_version__: raise ImportError( @@ -131,7 +130,7 @@ def initialize_ray(): if execution_engine == "Ray": initialize_ray() num_cpus = ray.global_state.cluster_resources()["CPU"] -elif execution_engine == "Dask": +elif execution_engine == "Dask": # pragma: no cover from distributed.client import _get_global_client if threading.current_thread().name == "MainThread": @@ -174,7 +173,6 @@ def initialize_ray(): "factorize", "test", "qcut", - "match", "to_datetime", "get_dummies", "isna", diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 572bff589da..7ef686e9d76 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -5,23 +5,31 @@ import pandas from pandas.api.types import is_scalar from pandas.compat import to_str, string_types, numpy as numpy_compat, cPickle as pkl -import pandas.core.common as com +from pandas.core.common import ( + count_not_none, + _pipe, + apply_if_callable, + is_bool_indexer, + _get_rename_function, +) from pandas.core.dtypes.common import ( - _get_dtype_from_object, + infer_dtype_from_object, is_list_like, + is_dict_like, is_numeric_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, is_object_dtype, is_integer_dtype, ) -from pandas.core.index import _ensure_index_from_sequences +from pandas.core.index import ensure_index_from_sequences from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable from pandas.util._validators import validate_bool_kwarg import itertools import functools import numpy as np +from numpy import nan import re import sys import warnings @@ -31,6 +39,10 @@ from .iterator import PartitionIterator from .series import SeriesView +# Similar to pandas, sentinel value to use as kwarg in place of None when None has +# special meaning and needs to be distinguished from a user explicitly passing None. +sentinel = object() + @_inherit_docstrings( pandas.DataFrame, excluded=[pandas.DataFrame, pandas.DataFrame.__init__] @@ -206,7 +218,9 @@ def _validate_eval_query(self, expr, **kwargs): if isinstance(expr, str) and "not" in expr: if "parser" in kwargs and kwargs["parser"] == "python": - ErrorMessage.not_implemented("'Not' nodes are not implemented.") + ErrorMessage.not_implemented( + "'Not' nodes are not implemented." + ) # pragma: no cover @property def size(self): @@ -341,6 +355,7 @@ def groupby( sort=True, group_keys=True, squeeze=False, + observed=False, **kwargs ): """Apply a groupby to this DataFrame. See _groupby() remote task. 
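# ---------------------------------------------------------------------------
# Illustrative sketch (plain pandas, not part of the diff): the groupby hunk
# below forwards the `observed` keyword, which only matters for categorical
# groupers. Assuming pandas >= 0.24:
import pandas as pd

cat = pd.Categorical(["a", "a"], categories=["a", "b"])
df = pd.DataFrame({"key": cat, "val": [1, 2]})

df.groupby("key", observed=False).sum()  # unobserved category "b" kept as a group
df.groupby("key", observed=True).sum()   # only categories present in the data
# ---------------------------------------------------------------------------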
@@ -390,13 +405,14 @@ def groupby(
             group_keys,
             squeeze,
             idx_name,
+            observed=observed,
             **kwargs
         )
 
     def sum(
         self,
         axis=None,
-        skipna=True,
+        skipna=None,
         level=None,
         numeric_only=None,
         min_count=0,
@@ -567,7 +583,7 @@ def aggregate(self, func, axis=0, *args, **kwargs):
 
         if axis == 0:
             try:
-                result = self._aggregate(func, axis=axis, *args, **kwargs)
+                result = self._aggregate(func, _axis=axis, *args, **kwargs)
             except TypeError:
                 pass
 
@@ -578,22 +594,21 @@ def aggregate(self, func, axis=0, *args, **kwargs):
         return result
 
     def _aggregate(self, arg, *args, **kwargs):
-        _axis = kwargs.pop("_axis", None)
-        if _axis is None:
-            _axis = getattr(self, "axis", 0)
+        _axis = kwargs.pop("_axis", 0)
         kwargs.pop("_level", None)
 
         if isinstance(arg, string_types):
+            kwargs.pop("is_transform", None)
             return self._string_function(arg, *args, **kwargs)
 
         # Dictionaries have complex behavior because they can be renamed here.
         elif isinstance(arg, dict):
             return self._default_to_pandas(pandas.DataFrame.agg, arg, *args, **kwargs)
         elif is_list_like(arg) or callable(arg):
+            kwargs.pop("is_transform", None)
             return self.apply(arg, axis=_axis, args=args, **kwargs)
         else:
-            # TODO Make pandas error
-            raise ValueError("type {} is not callable".format(type(arg)))
+            raise TypeError("type {} is not callable".format(type(arg)))
 
     def _string_function(self, func, *args, **kwargs):
         assert isinstance(func, string_types)
@@ -645,7 +660,7 @@ def align(
             broadcast_axis=broadcast_axis,
         )
 
-    def all(self, axis=0, bool_only=None, skipna=None, level=None, **kwargs):
+    def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
         """Return whether all elements are True over requested axis
 
         Note:
@@ -661,7 +676,7 @@
             axis=axis, bool_only=bool_only, skipna=skipna, level=level, **kwargs
         )
 
-    def any(self, axis=0, bool_only=None, skipna=None, level=None, **kwargs):
+    def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
         """Return whether any elements are True over requested axis
 
         Note:
@@ -736,7 +751,15 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=None):
         return DataFrame(query_compiler=query_compiler)
 
     def apply(
-        self, func, axis=0, broadcast=False, raw=False, reduce=None, args=(), **kwds
+        self,
+        func,
+        axis=0,
+        broadcast=None,
+        raw=False,
+        reduce=None,
+        result_type=None,
+        args=(),
+        **kwds
     ):
         """Apply a function along input axis of DataFrame.
 
@@ -769,14 +792,8 @@ def apply(
                 FutureWarning,
                 stacklevel=2,
             )
-        elif is_list_like(func):
-            if axis == 1:
-                raise TypeError(
-                    "(\"'list' object is not callable\", "
-                    "'occurred at index {0}'".format(self.index[0])
-                )
-        elif not callable(func):
-            return
+        elif not callable(func) and not is_list_like(func):
+            raise TypeError("{} object is not callable".format(type(func)))
 
         query_compiler = self._query_compiler.apply(func, axis, *args, **kwds)
         if isinstance(query_compiler, pandas.Series):
@@ -799,6 +816,21 @@ def as_matrix(self, columns=None):
         # TODO this is very inefficient, also see __array__
         return to_pandas(self).as_matrix(columns)
 
+    def to_numpy(self, dtype=None, copy=False):
+        """Convert the DataFrame to a NumPy array.
+
+        Args:
+            dtype: The dtype to pass to numpy.asarray()
+            copy: Whether to ensure that the returned value is not a view on another
+                array.
+
+        Returns:
+            A numpy array.
+        """
+        return self._default_to_pandas(
+            pandas.DataFrame.to_numpy, dtype=dtype, copy=copy
+        )
+
     def asfreq(self, freq, method=None, how=None, normalize=False, fill_value=None):
         return self._default_to_pandas(
             pandas.DataFrame.asfreq,
@@ -832,16 +864,21 @@ def astype(self, dtype, copy=True, errors="raise", **kwargs):
         new_query_compiler = self._query_compiler.astype(col_dtypes, **kwargs)
         return self._create_dataframe_from_compiler(new_query_compiler, not copy)
 
-    def at_time(self, time, asof=False):
-        return self._default_to_pandas(pandas.DataFrame.at_time, time, asof=asof)
+    def at_time(self, time, asof=False, axis=None):
+        return self._default_to_pandas(
+            pandas.DataFrame.at_time, time, asof=asof, axis=axis
+        )
 
-    def between_time(self, start_time, end_time, include_start=True, include_end=True):
+    def between_time(
+        self, start_time, end_time, include_start=True, include_end=True, axis=None
+    ):
         return self._default_to_pandas(
             pandas.DataFrame.between_time,
             start_time,
             end_time,
             include_start=include_start,
             include_end=include_end,
+            axis=axis,
         )
 
     def bfill(self, axis=None, inplace=False, limit=None, downcast=None):
@@ -882,7 +919,7 @@ def boxplot(
         figsize=None,
         layout=None,
         return_type=None,
-        **kwargs
+        **kwds
     ):
         return to_pandas(self).boxplot(
             column=column,
@@ -894,7 +931,7 @@
             figsize=figsize,
             layout=layout,
             return_type=return_type,
-            **kwargs
+            **kwds
         )
 
     def clip(self, lower=None, upper=None, axis=None, inplace=False, *args, **kwargs):
@@ -946,9 +983,6 @@ def compound(self, axis=None, skipna=None, level=None):
             pandas.DataFrame.compound, axis=axis, skipna=skipna, level=level
         )
 
-    def consolidate(self, inplace=False):
-        return self._default_to_pandas(pandas.DataFrame.consolidate, inplace=inplace)
-
     def convert_objects(
         self,
         convert_dates=True,
@@ -969,11 +1003,11 @@ def corr(self, method="pearson", min_periods=1):
             pandas.DataFrame.corr, method=method, min_periods=min_periods
         )
 
-    def corrwith(self, other, axis=0, drop=False):
+    def corrwith(self, other, axis=0, drop=False, method="pearson"):
         if isinstance(other, DataFrame):
             other = other._query_compiler.to_pandas()
         return self._default_to_pandas(
-            pandas.DataFrame.corrwith, other, axis=axis, drop=drop
+            pandas.DataFrame.corrwith, other, axis=axis, drop=drop, method=method
        )
 
     def count(self, axis=0, level=None, numeric_only=False):
@@ -1258,6 +1292,17 @@ def drop(
         )
         return self._create_dataframe_from_compiler(new_query_compiler, inplace)
 
+    def droplevel(self, level, axis=0):
+        """Return DataFrame with the requested index / column level(s) removed.
+
+        Args:
+            level: The index or column level(s) to drop
+
+        Returns:
+            A DataFrame with the level(s) removed
+        """
+        return self._default_to_pandas(pandas.DataFrame.droplevel, level, axis=axis)
+
     def drop_duplicates(self, subset=None, keep="first", inplace=False):
         """Return DataFrame with duplicate rows removed, optionally only
        considering certain columns
@@ -1512,11 +1557,10 @@ def filter(self, items=None, like=None, regex=None, axis=None):
 
         Returns:
             A new DataFrame with the filter applied.
""" - nkw = com._count_not_none(items, like, regex) + nkw = count_not_none(items, like, regex) if nkw > 1: raise TypeError( - "Keyword arguments `items`, `like`, or `regex` " - "are mutually exclusive" + "Keyword arguments `items`, `like`, or `regex` are mutually exclusive" ) if nkw == 0: raise TypeError("Must pass either `items`, `like`, or `regex`") @@ -1590,7 +1634,7 @@ def from_csv( cls, path, header=0, - sep=", ", + sep=",", index_col=0, parse_dates=True, encoding=None, @@ -1611,9 +1655,13 @@ def from_csv( ) @classmethod - def from_dict(cls, data, orient="columns", dtype=None): + def from_dict(cls, data, orient="columns", dtype=None, columns=None): ErrorMessage.default_to_pandas("`from_dict`") - return from_pandas(pandas.DataFrame.from_dict(data, orient=orient, dtype=dtype)) + return from_pandas( + pandas.DataFrame.from_dict( + data, orient=orient, dtype=dtype, columns=columns + ) + ) @classmethod def from_items(cls, items, columns=None, orient="columns"): @@ -1761,8 +1809,8 @@ def hist( figsize=None, layout=None, bins=10, - **kwargs - ): + **kwds + ): # pragma: no cover return self._default_to_pandas( pandas.DataFrame.hist, column=column, @@ -1778,7 +1826,7 @@ def hist( figsize=figsize, layout=layout, bins=bins, - **kwargs + **kwds ) def idxmax(self, axis=0, skipna=True): @@ -1925,6 +1973,7 @@ def interpolate( limit=None, inplace=False, limit_direction="forward", + limit_area=None, downcast=None, **kwargs ): @@ -1935,6 +1984,7 @@ def interpolate( limit=limit, inplace=inplace, limit_direction=limit_direction, + limit_area=limit_area, downcast=downcast, **kwargs ) @@ -2184,7 +2234,7 @@ def mad(self, axis=None, skipna=None, level=None): def mask( self, cond, - other=np.nan, + other=nan, inplace=False, axis=None, level=None, @@ -2409,7 +2459,7 @@ def mod(self, other, axis="columns", level=None, fill_value=None): ) return self._create_dataframe_from_compiler(new_query_compiler) - def mode(self, axis=0, numeric_only=False): + def mode(self, axis=0, numeric_only=False, dropna=True): """Perform mode across the DataFrame. Args: @@ -2422,7 +2472,7 @@ def mode(self, axis=0, numeric_only=False): axis = pandas.DataFrame()._get_axis_number(axis) return DataFrame( query_compiler=self._query_compiler.mode( - axis=axis, numeric_only=numeric_only + axis=axis, numeric_only=numeric_only, dropna=dropna ) ) @@ -2553,7 +2603,7 @@ def pipe(self, func, *args, **kwargs): Returns: object: the return type of ``func``. 
""" - return com._pipe(self, func, *args, **kwargs) + return _pipe(self, func, *args, **kwargs) def pivot(self, index=None, columns=None, values=None): return self._default_to_pandas( @@ -2668,7 +2718,7 @@ def prod( skipna=None, level=None, numeric_only=None, - min_count=1, + min_count=0, **kwargs ): """Return the product of the values for the requested axis @@ -2678,7 +2728,7 @@ def prod( skipna : boolean, default True level : int or level name, default None numeric_only : boolean, default None - min_count : int, default 1 + min_count : int, default 0 Returns: prod : Series or DataFrame (if level specified) @@ -2700,7 +2750,7 @@ def product( skipna=None, level=None, numeric_only=None, - min_count=1, + min_count=0, **kwargs ): """Return the product of the values for the requested axis @@ -2710,7 +2760,7 @@ def product( skipna : boolean, default True level : int or level name, default None numeric_only : boolean, default None - min_count : int, default 1 + min_count : int, default 0 Returns: product : Series or DataFrame (if level specified) @@ -3001,13 +3051,12 @@ def rename( If inplace is False, a new DataFrame with the updated axes. """ inplace = validate_bool_kwarg(inplace, "inplace") - # We have to do this with the args because of how rename handles - # kwargs. It doesn't ignore None values passed in, so we have to filter - # them ourselves. + # We have to do this with the args because of how rename handles kwargs. It + # doesn't ignore None values passed in, so we have to filter them ourselves. args = locals() kwargs = {k: v for k, v in args.items() if v is not None and k != "self"} - # inplace should always be true because this is just a copy, and we - # will use the results after. + # inplace should always be true because this is just a copy, and we will use the + # results after. kwargs["inplace"] = True df_to_rename = pandas.DataFrame(index=self.index, columns=self.columns) df_to_rename.rename(**kwargs) @@ -3022,15 +3071,62 @@ def rename( if not inplace: return obj - def rename_axis(self, mapper, axis=0, copy=True, inplace=False): - axes_is_columns = axis == 1 or axis == "columns" - renamed = self if inplace else self.copy() - if axes_is_columns: - renamed.columns.name = mapper + def rename_axis( + self, mapper=None, index=None, columns=None, axis=None, copy=True, inplace=False + ): + kwargs = { + "index": index, + "columns": columns, + "axis": axis, + "copy": copy, + "inplace": inplace, + } + axes, kwargs = pandas.DataFrame()._construct_axes_from_arguments( + (), kwargs, sentinel=sentinel + ) + if axis is not None: + axis = pandas.DataFrame()._get_axis_number(axis) else: - renamed.index.name = mapper - if not inplace: - return renamed + axis = 0 + inplace = validate_bool_kwarg(inplace, "inplace") + + if mapper is not None: + # Use v0.23 behavior if a scalar or list + non_mapper = is_scalar(mapper) or ( + is_list_like(mapper) and not is_dict_like(mapper) + ) + if non_mapper: + return self._set_axis_name(mapper, axis=axis, inplace=inplace) + else: + # Deprecated (v0.21) behavior is if mapper is specified, + # and not a list or scalar, then call rename + msg = ( + "Using 'rename_axis' to alter labels is deprecated. " + "Use '.rename' instead" + ) + warnings.warn(msg, FutureWarning, stacklevel=3) + axis = pandas.DataFrame()._get_axis_name(axis) + d = {"copy": copy, "inplace": inplace, axis: mapper} + return self.rename(**d) + else: + # Use new behavior. 
Means that index and/or columns is specified + result = self if inplace else self.copy(deep=copy) + + for axis in axes: + if axes[axis] is None: + continue + v = axes[axis] + axis = pandas.DataFrame()._get_axis_number(axis) + non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v)) + if non_mapper: + newnames = v + else: + f = _get_rename_function(v) + curnames = self.index.names if axis == 0 else self.columns.names + newnames = [f(name) for name in curnames] + result._set_axis_name(newnames, axis=axis, inplace=True) + if not inplace: + return result def _set_axis_name(self, name, axis=0, inplace=False): """Alter the name or names of the axis. @@ -3043,12 +3139,12 @@ def _set_axis_name(self, name, axis=0, inplace=False): Returns: Type of caller or None if inplace=True. """ - axes_is_columns = axis == 1 or axis == "columns" + axis = pandas.DataFrame()._get_axis_number(axis) if axis is not None else 0 renamed = self if inplace else self.copy() - if axes_is_columns: - renamed.columns.set_names(name) + if axis == 0: + renamed.index = renamed.index.set_names(name) else: - renamed.index.set_names(name) + renamed.columns = renamed.columns.set_names(name) if not inplace: return renamed @@ -3465,7 +3561,7 @@ def select_dtypes(self, include=None, exclude=None): exclude = [] sel = tuple(map(set, (include, exclude))) - include, exclude = map(lambda x: set(map(_get_dtype_from_object, x)), sel) + include, exclude = map(lambda x: set(map(infer_dtype_from_object, x)), sel) include_these = pandas.Series(not bool(include), index=self.columns) exclude_these = pandas.Series(not bool(exclude), index=self.columns) @@ -3595,7 +3691,7 @@ def set_index( if drop: to_remove.append(col) arrays.append(level) - index = _ensure_index_from_sequences(arrays, names) + index = ensure_index_from_sequences(arrays, names) if verify_integrity and not index.is_unique: duplicates = index.get_duplicates() @@ -3615,9 +3711,13 @@ def set_value(self, index, col, value, takeable=False): pandas.DataFrame.set_value, index, col, value, takeable=takeable ) - def shift(self, periods=1, freq=None, axis=0): + def shift(self, periods=1, freq=None, axis=0, fill_value=None): return self._default_to_pandas( - pandas.DataFrame.shift, periods=periods, freq=freq, axis=axis + pandas.DataFrame.shift, + periods=periods, + freq=freq, + axis=axis, + fill_value=fill_value, ) def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs): @@ -3759,18 +3859,6 @@ def sort_values( ).columns return self.reindex(columns=new_columns, copy=not inplace) - def sortlevel( - self, level=0, axis=0, ascending=True, inplace=False, sort_remaining=True - ): - return self._default_to_pandas( - pandas.DataFrame.sortlevel, - level=level, - axis=axis, - ascending=ascending, - inplace=inplace, - sort_remaining=sort_remaining, - ) - def squeeze(self, axis=None): # Checks for 1x1 DF, passes into squeeze with approproate ndim if ( @@ -3890,7 +3978,7 @@ def take(self, indices, axis=0, convert=None, is_copy=True, **kwargs): **kwargs ) - def to_clipboard(self, excel=None, sep=None, **kwargs): # pragma: no cover + def to_clipboard(self, excel=True, sep=None, **kwargs): # pragma: no cover return self._default_to_pandas( pandas.DataFrame.to_clipboard, excel=excel, sep=sep, **kwargs ) @@ -3907,10 +3995,10 @@ def to_csv( index_label=None, mode="w", encoding=None, - compression=None, + compression="infer", quoting=None, quotechar='"', - line_terminator="\n", + line_terminator=None, chunksize=None, tupleize_cols=None, date_format=None, @@ -3996,21 +4084,31 
@@ def to_feather(self, fname): # pragma: no cover def to_gbq( self, destination_table, - project_id, - chunksize=10000, - verbose=True, + project_id=None, + chunksize=None, reauth=False, if_exists="fail", + auth_local_webserver=False, + table_schema=None, + location=None, + progress_bar=True, + credentials=None, + verbose=None, private_key=None, ): # pragma: no cover return self._default_to_pandas( pandas.DataFrame.to_gbq, destination_table, - project_id, + project_id=project_id, chunksize=chunksize, - verbose=verbose, reauth=reauth, if_exists=if_exists, + auth_local_webserver=auth_local_webserver, + table_schema=table_schema, + location=location, + progress_bar=progress_bar, + credentials=credentials, + verbose=verbose, private_key=private_key, ) @@ -4026,44 +4124,48 @@ def to_html( col_space=None, header=True, index=True, - na_rep="np.NaN", + na_rep="NaN", formatters=None, float_format=None, sparsify=None, index_names=True, justify=None, - bold_rows=True, - classes=None, - escape=True, max_rows=None, max_cols=None, show_dimensions=False, - notebook=False, decimal=".", + bold_rows=True, + classes=None, + escape=True, + notebook=False, border=None, - ): # pragma: no cover + table_id=None, + render_links=False, + ): return self._default_to_pandas( pandas.DataFrame.to_html, - buf, - columns, - col_space, - header, - index, - na_rep, - formatters, - float_format, - sparsify, - index_names, - justify, - bold_rows, - classes, - escape, - max_rows, - max_cols, - show_dimensions, - notebook, - decimal, - border, + buf=buf, + columns=columns, + col_space=col_space, + header=header, + index=index, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + index_names=index_names, + justify=justify, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + bold_rows=bold_rows, + classes=classes, + escape=escape, + notebook=notebook, + border=border, + table_id=table_id, + render_links=render_links, ) def to_json( @@ -4076,19 +4178,21 @@ def to_json( date_unit="ms", default_handler=None, lines=False, - compression=None, + compression="infer", + index=True, ): # pragma: no cover return self._default_to_pandas( pandas.DataFrame.to_json, path_or_buf, - orient, - date_format, - double_precision, - force_ascii, - date_unit, - default_handler, - lines, - compression, + orient=orient, + date_format=date_format, + double_precision=double_precision, + force_ascii=force_ascii, + date_unit=date_unit, + default_handler=default_handler, + lines=lines, + compression=compression, + index=index, ) def to_latex( @@ -4098,7 +4202,7 @@ def to_latex( col_space=None, header=True, index=True, - na_rep="np.NaN", + na_rep="NaN", formatters=None, float_format=None, sparsify=None, @@ -4150,13 +4254,21 @@ def to_panel(self): # pragma: no cover return self._default_to_pandas(pandas.DataFrame.to_panel) def to_parquet( - self, fname, engine="auto", compression="snappy", **kwargs + self, + fname, + engine="auto", + compression="snappy", + index=None, + partition_cols=None, + **kwargs ): # pragma: no cover return self._default_to_pandas( pandas.DataFrame.to_parquet, fname, engine=engine, compression=compression, + index=index, + partition_cols=partition_cols, **kwargs ) @@ -4172,11 +4284,15 @@ def to_pickle( pandas.DataFrame.to_pickle, path, compression=compression, protocol=protocol ) - def to_records(self, index=True, convert_datetime64=True): + def to_records( + self, index=True, convert_datetime64=None, column_dtypes=None, index_dtypes=None + ): return 
self._default_to_pandas( pandas.DataFrame.to_records, index=index, convert_datetime64=convert_datetime64, + column_dtypes=column_dtypes, + index_dtypes=index_dtypes, ) def to_sparse(self, fill_value=None, kind="block"): @@ -4194,6 +4310,7 @@ def to_sql( index_label=None, chunksize=None, dtype=None, + method=None, ): new_query_compiler = self._query_compiler # writing the index to the database by inserting it to the DF @@ -4216,6 +4333,7 @@ def to_sql( index_label=index_label, chunksize=chunksize, dtype=dtype, + method=method, ) def to_stata( @@ -4228,17 +4346,21 @@ def to_stata( time_stamp=None, data_label=None, variable_labels=None, + version=114, + convert_strl=None, ): # pragma: no cover return self._default_to_pandas( pandas.DataFrame.to_stata, fname, - convert_dates, - write_index, - encoding, - byteorder, - time_stamp, - data_label, - variable_labels, + convert_dates=convert_dates, + write_index=write_index, + encoding=encoding, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + variable_labels=variable_labels, + version=version, + convert_strl=convert_strl, ) def to_string( @@ -4248,16 +4370,17 @@ def to_string( col_space=None, header=True, index=True, - na_rep="np.NaN", + na_rep="NaN", formatters=None, float_format=None, sparsify=None, index_names=True, justify=None, - line_width=None, max_rows=None, max_cols=None, show_dimensions=False, + decimal=".", + line_width=None, ): return self._default_to_pandas( pandas.DataFrame.to_string, @@ -4272,10 +4395,11 @@ def to_string( sparsify=sparsify, index_names=index_names, justify=justify, - line_width=line_width, max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions, + decimal=decimal, + line_width=line_width, ) def to_timestamp(self, freq=None, how="start", axis=0, copy=True): @@ -4286,13 +4410,10 @@ def to_timestamp(self, freq=None, how="start", axis=0, copy=True): def to_xarray(self): return self._default_to_pandas(pandas.DataFrame.to_xarray) - def transform(self, func, *args, **kwargs): + def transform(self, func, axis=0, *args, **kwargs): kwargs["is_transform"] = True - result = self.agg(func, *args, **kwargs) - try: - result.columns = self.columns - result.index = self.index - except ValueError: + result = self.agg(func, axis=axis, *args, **kwargs) + if len(result) != len(self): raise ValueError("transforms cannot produce aggregated results") return result @@ -4340,7 +4461,9 @@ def tz_convert(self, tz, axis=0, level=None, copy=True): pandas.DataFrame.tz_convert, tz, axis=axis, level=level, copy=copy ) - def tz_localize(self, tz, axis=0, level=None, copy=True, ambiguous="raise"): + def tz_localize( + self, tz, axis=0, level=None, copy=True, ambiguous="raise", nonexistent="raise" + ): return self._default_to_pandas( pandas.DataFrame.tz_localize, tz, @@ -4348,6 +4471,7 @@ def tz_localize(self, tz, axis=0, level=None, copy=True, ambiguous="raise"): level=level, copy=copy, ambiguous=ambiguous, + nonexistent=nonexistent, ) def unstack(self, level=-1, fill_value=None): @@ -4356,7 +4480,7 @@ def unstack(self, level=-1, fill_value=None): ) def update( - self, other, join="left", overwrite=True, filter_func=None, raise_conflict=False + self, other, join="left", overwrite=True, filter_func=None, errors="ignore" ): """Modify DataFrame in place using non-NA values from other. 
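# ---------------------------------------------------------------------------
# Illustrative sketch (plain pandas, not part of the diff): the `update`
# hunks around this point replace the removed `raise_conflict` flag with the
# pandas 0.24 `errors` keyword. Assuming pandas >= 0.24:
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0]})
other = pd.DataFrame({"a": [np.nan, 9.0]})

df.update(other, errors="ignore")  # default: non-NA values in `other` win
# df["a"] is now [1.0, 9.0]

df2 = pd.DataFrame({"a": [1.0, 2.0]})
try:
    df2.update(other, errors="raise")  # both frames non-NA at row 1 -> conflict
except ValueError as err:
    print(err)
# ---------------------------------------------------------------------------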
@@ -4371,14 +4495,14 @@ def update( Returns: None """ - if raise_conflict: + if errors == "raise": return self._default_to_pandas( pandas.DataFrame.update, other, join=join, overwrite=overwrite, filter_func=filter_func, - raise_conflict=raise_conflict, + errors=errors, ) if not isinstance(other, DataFrame): other = DataFrame(other) @@ -4387,7 +4511,7 @@ def update( join=join, overwrite=overwrite, filter_func=filter_func, - raise_conflict=raise_conflict, + errors=errors, ) self._update_inplace(new_query_compiler=query_compiler) @@ -4500,7 +4624,7 @@ def __getitem__(self, key): Returns: A Pandas Series representing the value for the column. """ - key = com._apply_if_callable(key, self) + key = apply_if_callable(key, self) # Shortcut if key is an actual column is_mi_columns = isinstance(self.columns, pandas.MultiIndex) try: @@ -4529,7 +4653,7 @@ def _getitem_column(self, key): ) def _getitem_array(self, key): - if com.is_bool_indexer(key): + if is_bool_indexer(key): if isinstance(key, pandas.Series) and not key.index.equals(self.index): warnings.warn( "Boolean Series key will be reindexed to match DataFrame index.", diff --git a/modin/pandas/test/test_api.py b/modin/pandas/test/test_api.py new file mode 100644 index 00000000000..b218bc1a1d4 --- /dev/null +++ b/modin/pandas/test/test_api.py @@ -0,0 +1,56 @@ +import modin.pandas as pd +import pandas +import inspect +import numpy as np + + +def test_api_equality(): + modin_dir = [obj for obj in dir(pd.DataFrame) if obj[0] != "_"] + pandas_dir = [obj for obj in dir(pandas.DataFrame) if obj[0] != "_"] + + ignore = ["timetuple"] + missing_from_modin = set(pandas_dir) - set(modin_dir) + assert not len(missing_from_modin - set(ignore)) + + assert not len(set(modin_dir) - set(pandas_dir)) + + # These have to be checked manually + allowed_different = ["to_hdf", "hist"] + difference = [] + + for m in modin_dir: + if m in allowed_different: + continue + try: + pandas_sig = dict( + inspect.signature(getattr(pandas.DataFrame, m)).parameters + ) + except TypeError: + continue + try: + modin_sig = dict(inspect.signature(getattr(pd.DataFrame, m)).parameters) + except TypeError: + continue + + if not pandas_sig == modin_sig: + append_val = ( + m, + { + i: pandas_sig[i] + for i in pandas_sig.keys() + if pandas_sig[i] != modin_sig[i] + and not ( + pandas_sig[i].default is np.nan + and modin_sig[i].default is np.nan + ) + }, + ) + try: + # This validates that there are actually values to add to the difference + # based on the condition above. 
+ if len(list(append_val[-1])[-1]) > 0: + difference.append(append_val) + except IndexError: + pass + + assert not len(difference), "Differences found in API: {}".format(difference) diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index c969aa34465..3a8e67a0bbc 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -122,6 +122,18 @@ def inter_df_math_helper(modin_df, pandas_df, op): modin_result = getattr(modin_df, op)(list_test, axis=0) df_equals(modin_result, pandas_result) + # Level test + new_idx = pandas.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in modin_df.index] + ) + modin_df_multi_level = modin_df.copy() + modin_df_multi_level.index = new_idx + + # Defaults to pandas + with pytest.warns(UserWarning): + # Operation against self for sanity check + getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_add(data): @@ -388,6 +400,17 @@ def comparison_inter_ops_helper(modin_df, pandas_df, op): modin_result = getattr(modin_df, op)(modin_df2) df_equals(modin_result, pandas_result) + new_idx = pandas.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in modin_df.index] + ) + modin_df_multi_level = modin_df.copy() + modin_df_multi_level.index = new_idx + + # Defaults to pandas + with pytest.warns(UserWarning): + # Operation against self for sanity check + getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_eq(data): @@ -460,6 +483,17 @@ def inter_df_math_right_ops_helper(modin_df, pandas_df, op): modin_result = getattr(modin_df, op)(4.0) df_equals(modin_result, pandas_result) + new_idx = pandas.MultiIndex.from_tuples( + [(i // 4, i // 2, i) for i in modin_df.index] + ) + modin_df_multi_level = modin_df.copy() + modin_df_multi_level.index = new_idx + + # Defaults to pandas + with pytest.warns(UserWarning): + # Operation against self for sanity check + getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1) + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_radd(data): @@ -525,7 +559,7 @@ def test_rtruediv(data): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) - inter_df_math_right_ops_helper(modin_df, pandas_df, "rtrudiv") + inter_df_math_right_ops_helper(modin_df, pandas_df, "rtruediv") @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -911,7 +945,7 @@ def test_apply(request, data, func, axis): pandas_result = pandas_df.apply(func, axis) except Exception as e: with pytest.raises(type(e)): - modin_result = modin_df.apply(func, axis) + modin_df.apply(func, axis) else: modin_result = modin_df.apply(func, axis) df_equals(modin_result, pandas_result) @@ -978,6 +1012,11 @@ def test_as_matrix(): tm.assert_almost_equal(mat, expected) +def test_to_numpy(): + with pytest.warns(UserWarning): + pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy() + + def test_asfreq(): index = pd.date_range("1/1/2000", periods=4, freq="T") series = pd.Series([0.0, None, 2.0, 3.0], index=index) @@ -1213,12 +1252,6 @@ def test_compound(): pd.DataFrame(data).compound() -def test_consolidate(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).consolidate() - - def test_convert_objects(): data = test_data_values[0] with pytest.warns(UserWarning): @@ -1448,6 +1481,23 @@ def test_drop_api_equivalence(): 
modin_df.drop(axis=1) +def test_droplevel(): + df = ( + pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) + .set_index([0, 1]) + .rename_axis(["a", "b"]) + ) + df.columns = pd.MultiIndex.from_tuples( + [("c", "e"), ("d", "f")], names=["level_1", "level_2"] + ) + + with pytest.warns(UserWarning): + df.droplevel("a") + + with pytest.warns(UserWarning): + df.droplevel("level_2", axis=1) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_drop_duplicates(data): modin_df = pd.DataFrame(data) @@ -1758,7 +1808,7 @@ def test_equals(): df_equals(modin_df1, pd.DataFrame(modin_df1)) frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 5, 1]} - modin_df3 = pd.DataFrame(frame_data) + modin_df3 = pd.DataFrame(frame_data, index=list("abcd")) with pytest.raises(AssertionError): df_equals(modin_df3, modin_df1) @@ -1766,6 +1816,8 @@ def test_equals(): with pytest.raises(AssertionError): df_equals(modin_df3, modin_df2) + assert modin_df1.equals(modin_df2._query_compiler.to_pandas()) + def test_eval_df_use_case(): frame_data = {"a": random_state.randn(10), "b": random_state.randn(10)} @@ -2175,6 +2227,12 @@ def test_filter(data): df_equals(modin_df.filter(like=by["like"]), pandas_df.filter(like=by["like"])) + with pytest.raises(TypeError): + modin_df.filter(items=by["items"], regex=by["regex"]) + + with pytest.raises(TypeError): + modin_df.filter() + def test_first(): i = pd.date_range("2018-04-09", periods=4, freq="2D") @@ -2242,7 +2300,6 @@ def test_head(data, n): df_equals(modin_df.head(n), pandas_df.head(n)) -@pytest.mark.skip(reason="Skip plotting") def test_hist(): data = test_data_values[0] with pytest.warns(UserWarning): @@ -3362,6 +3419,47 @@ def test_rename_bug(): df_equals(modin_df, df) +def test_rename_axis(): + data = {"num_legs": [4, 4, 2], "num_arms": [0, 0, 2]} + index = ["dog", "cat", "monkey"] + modin_df = pd.DataFrame(data, index) + pandas_df = pandas.DataFrame(data, index) + df_equals(modin_df.rename_axis("animal"), pandas_df.rename_axis("animal")) + df_equals( + modin_df.rename_axis("limbs", axis="columns"), + pandas_df.rename_axis("limbs", axis="columns"), + ) + + modin_df.rename_axis("limbs", axis="columns", inplace=True) + pandas_df.rename_axis("limbs", axis="columns", inplace=True) + df_equals(modin_df, pandas_df) + + new_index = pd.MultiIndex.from_product( + [["mammal"], ["dog", "cat", "monkey"]], names=["type", "name"] + ) + modin_df.index = new_index + pandas_df.index = new_index + + df_equals( + modin_df.rename_axis(index={"type": "class"}), + pandas_df.rename_axis(index={"type": "class"}), + ) + df_equals( + modin_df.rename_axis(columns=str.upper), + pandas_df.rename_axis(columns=str.upper), + ) + df_equals( + modin_df.rename_axis(columns=[str.upper(o) for o in modin_df.columns.names]), + pandas_df.rename_axis(columns=[str.upper(o) for o in pandas_df.columns.names]), + ) + + with pytest.warns(FutureWarning): + df_equals( + modin_df.rename_axis(str.upper, axis=1), + pandas_df.rename_axis(str.upper, axis=1), + ) + + def test_rename_axis_inplace(): test_frame = TestData().frame modin_df = pd.DataFrame(test_frame) @@ -3789,12 +3887,6 @@ def test_sort_values(request, data, axis, ascending, na_position): df_equals(modin_df_cp, pandas_df_cp) -def test_sortlevel(): - data = test_data_values[0] - with pytest.warns(UserWarning): - pd.DataFrame(data).sortlevel() - - def test_squeeze(): frame_data = { "col1": [0, 1, 2, 3], @@ -3995,9 +4087,6 @@ def test_to_xarray(): pd.DataFrame(data).to_xarray() -@pytest.mark.skip( - reason="We do not have support 
to check if a UDF can only take in numeric functions" -) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("func", agg_func_values, ids=agg_func_keys) def test_transform(request, data, func): @@ -4005,12 +4094,12 @@ def test_transform(request, data, func): pandas_df = pandas.DataFrame(data) try: - pandas_result = pandas_df.agg(func) + pandas_result = pandas_df.transform(func) except Exception as e: with pytest.raises(type(e)): - modin_df.agg(func) + modin_df.transform(func) else: - modin_result = modin_df.agg(func) + modin_result = modin_df.transform(func) df_equals(modin_result, pandas_result) @@ -4024,12 +4113,12 @@ def test_transform_numeric(request, data, func): pandas_df = pandas.DataFrame(data) try: - pandas_result = pandas_df.agg(func) + pandas_result = pandas_df.transform(func) except Exception as e: with pytest.raises(type(e)): - modin_df.agg(func) + modin_df.transform(func) else: - modin_result = modin_df.agg(func) + modin_result = modin_df.transform(func) df_equals(modin_result, pandas_result) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index 16bedbbde6c..9663a29986c 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -141,6 +141,7 @@ "sum mean": ["sum", "mean"], "sum sum": ["sum", "sum"], "sum df sum": ["sum", lambda df: df.sum()], + "should raise TypeError": 1, } agg_func_keys = list(agg_func.keys()) agg_func_values = list(agg_func.values()) diff --git a/requirements.txt b/requirements.txt index 35cce4e4c22..893153188b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -pandas==0.23.4 +pandas==0.24.1 numpy <= 1.15.0 dask[complete]==1.0.0 distributed==1.25.0 diff --git a/setup.py b/setup.py index f43cbd611c4..73d05a04652 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ url="https://github.com/modin-project/modin", long_description=long_description, long_description_content_type="text/markdown", - install_requires=["pandas==0.23.4", "ray==0.6.2", "numpy<=1.15.0", "typing"], + install_requires=["pandas==0.24.1", "ray==0.6.2", "numpy<=1.15.0", "typing"], extras_require={ # can be installed by pip install modin[dask] "dask": ["dask==1.0.0", "distributed==1.25.0"],
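# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the diff): the `transform` rework in the
# dataframe.py hunk, exercised by the tests above, enforces a length check
# instead of relying on a ValueError from assigning mismatched axes. The
# contract is the same as plain pandas':
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

df.transform(lambda x: x * 2)  # OK: output has the same length as the input

try:
    df.transform("sum")        # aggregates to one row, so the check fails
except ValueError as err:
    print(err)                 # -> transforms cannot produce aggregated results
# ---------------------------------------------------------------------------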