diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fac5c211cdad8..f0772f72d63d4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -122,6 +122,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then ! grep -R --include="*.py" --include="*.pyx" --include="*.rst" -E "\.\. (autosummary|contents|currentmodule|deprecated|function|image|important|include|ipython|literalinclude|math|module|note|raw|seealso|toctree|versionadded|versionchanged|warning):[^:]" ./pandas ./doc/source RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check that the deprecated `assert_raises_regex` is not used (`pytest.raises(match=pattern)` should be used instead)' ; echo $MSG + ! grep -R --exclude=*.pyc --exclude=testing.py --exclude=test_testing.py assert_raises_regex pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for modules that pandas should not import' ; echo $MSG python -c " import sys diff --git a/doc/source/io.rst b/doc/source/io.rst index beb1c1daba962..34dc185c200e6 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2854,6 +2854,11 @@ It is often the case that users will insert columns to do temporary computations in Excel and you may not want to read in those columns. ``read_excel`` takes a ``usecols`` keyword to allow you to specify a subset of columns to parse. +.. deprecated:: 0.24.0 + +   Passing in an integer for ``usecols`` has been deprecated. Please pass in a list +   of ints from 0 to ``usecols`` inclusive instead. + If ``usecols`` is an integer, then it is assumed to indicate the last column to be parsed. diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 34921505a46bf..2445daebb580a 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -266,7 +266,7 @@ These changes conform sparse handling to return the correct types and work to ma ``SparseArray.take`` now returns a scalar for scalar input, ``SparseArray`` for others. Furthermore, it handles a negative indexer with the same rule as ``Index`` (:issue:`10560`, :issue:`12796`) -.. ipython:: python +.. code-block:: python s = pd.SparseArray([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) s.take(0) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 73fd526640212..20496c9fb3f31 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -970,6 +970,7 @@ Deprecations - The class ``FrozenNDArray`` has been deprecated. When unpickling, ``FrozenNDArray`` will be unpickled to ``np.ndarray`` once this class is removed (:issue:`9031`) - Deprecated the `nthreads` keyword of :func:`pandas.read_feather` in favor of `use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`) +- :func:`pandas.read_excel` has deprecated accepting ``usecols`` as an integer. Please pass in a list of ints from 0 to ``usecols`` inclusive instead (:issue:`23527`) - Constructing a :class:`TimedeltaIndex` from data with ``datetime64``-dtyped data is deprecated, will raise ``TypeError`` in a future version (:issue:`23539`) ..
_whatsnew_0240.deprecations.datetimelike_int_ops: @@ -1298,6 +1299,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) - :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not core dump on badly encoded strings (:issue:`22748`) - :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) +- Bug in :func:`read_csv()` in which memory management was prematurely optimized for the C engine when the data was being read in chunks (:issue:`23509`) - :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) - :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) - :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 3870a55c22fd6..40aa03caa56eb 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -132,6 +132,7 @@ cdef extern from "parser/tokenizer.h": int64_t *word_starts # where we are in the stream int64_t words_len int64_t words_cap + int64_t max_words_cap # maximum word cap encountered char *pword_start # pointer to stream start of current field int64_t word_start # position start of current field diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 2fce241027d56..e46e1e85f1c81 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -197,6 +197,7 @@ int parser_init(parser_t *self) { sz = sz ? sz : 1; self->words = (char **)malloc(sz * sizeof(char *)); self->word_starts = (int64_t *)malloc(sz * sizeof(int64_t)); + self->max_words_cap = sz; self->words_cap = sz; self->words_len = 0; @@ -247,7 +248,7 @@ void parser_del(parser_t *self) { } static int make_stream_space(parser_t *self, size_t nbytes) { - int64_t i, cap; + int64_t i, cap, length; int status; void *orig_ptr, *newptr; @@ -287,8 +288,23 @@ static int make_stream_space(parser_t *self, size_t nbytes) { */ cap = self->words_cap; + + /** + * If we are reading in chunks, we need to be aware of the maximum number + * of words we have seen in previous chunks (self->max_words_cap), so + * that we can properly allocate when reading subsequent ones. + * + * Otherwise, we risk a buffer overflow if we mistakenly under-allocate + * just because a recent chunk did not have as many words. + */ + if (self->words_len + nbytes < self->max_words_cap) { + length = self->max_words_cap - nbytes; + } else { + length = self->words_len; + } + self->words = - (char **)grow_buffer((void *)self->words, self->words_len, + (char **)grow_buffer((void *)self->words, length, (int64_t*)&self->words_cap, nbytes, sizeof(char *), &status); TRACE( @@ -1241,6 +1257,19 @@ int parser_trim_buffers(parser_t *self) { int64_t i; + /** + * Before we free up space and trim, we should + * save how many words we saw when parsing, if + * it exceeds the maximum number we saw before. + * + * This is important for when we read in chunks, + * so that we can inform subsequent chunk parsing + * as to how many words we could possibly see.
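+ * + * For example (illustrative numbers): if an earlier chunk grew + * words_cap to 1024, max_words_cap is raised to 1024 here, so the + * next make_stream_space call sizes the words buffer against that + * bound rather than the smaller words_len of a shorter chunk.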
+ */ + if (self->words_cap > self->max_words_cap) { + self->max_words_cap = self->words_cap; + } + /* trim words, word_starts */ new_cap = _next_pow2(self->words_len) + 1; if (new_cap < self->words_cap) { diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 9fc3593aaaf5b..c32c061c7fa89 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -142,6 +142,7 @@ typedef struct parser_t { int64_t *word_starts; // where we are in the stream int64_t words_len; int64_t words_cap; + int64_t max_words_cap; // maximum word cap encountered char *pword_start; // pointer to stream start of current field int64_t word_start; // position start of current field diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 3fa4f503d2dd5..daf2dcccd284b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -124,8 +124,12 @@ def asi8(self): # do not cache or you'll create a memory leak return self._data.view('i8') - # ------------------------------------------------------------------ - # Array-like Methods + # ---------------------------------------------------------------- + # Array-Like / EA-Interface Methods + + @property + def nbytes(self): + return self._data.nbytes @property def shape(self): diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b0485cc82f07f..a6f688fb0cf7a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -385,7 +385,7 @@ def _resolution(self): return libresolution.resolution(self.asi8, self.tz) # ---------------------------------------------------------------- - # Array-like Methods + # Array-Like / EA-Interface Methods def __array__(self, dtype=None): if is_object_dtype(dtype): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 482968fdb4766..b343d42ef3b7c 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -272,10 +272,6 @@ def _concat_same_type(cls, to_concat): # -------------------------------------------------------------------- # Data / Attributes - @property - def nbytes(self): - # TODO(DatetimeArray): remove - return self._data.nbytes @cache_readonly def dtype(self): @@ -286,10 +282,6 @@ def _ndarray_values(self): # Ordinals return self._data - @property - def asi8(self): - return self._data - @property def freq(self): """Return the frequency object for this PeriodArray.""" @@ -330,6 +322,50 @@ def start_time(self): def end_time(self): return self.to_timestamp(how='end') + def to_timestamp(self, freq=None, how='start'): + """ + Cast to DatetimeArray/Index. + + Parameters + ---------- + freq : string or DateOffset, optional + Target frequency. 
The default is 'D' for week or longer, + 'S' otherwise + how : {'s', 'e', 'start', 'end'} + + Returns + ------- + DatetimeArray/Index + """ + from pandas.core.arrays import DatetimeArrayMixin + + how = libperiod._validate_end_alias(how) + + end = how == 'E' + if end: + if freq == 'B': + # roll forward to ensure we land on B date + adjust = Timedelta(1, 'D') - Timedelta(1, 'ns') + return self.to_timestamp(how='start') + adjust + else: + adjust = Timedelta(1, 'ns') + return (self + self.freq).to_timestamp(how='start') - adjust + + if freq is None: + base, mult = frequencies.get_freq_code(self.freq) + freq = frequencies.get_to_timestamp_base(base) + else: + freq = Period._maybe_convert_freq(freq) + + base, mult = frequencies.get_freq_code(freq) + new_data = self.asfreq(freq, how=how) + + new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) + return DatetimeArrayMixin(new_data, freq='infer') + + # -------------------------------------------------------------------- + # Array-like / EA-Interface Methods + def __repr__(self): return '<{}>\n{}\nLength: {}, dtype: {}'.format( self.__class__.__name__, @@ -456,6 +492,8 @@ def value_counts(self, dropna=False): name=result.index.name) return Series(result.values, index=index, name=result.name) + # -------------------------------------------------------------------- + def shift(self, periods=1): """ Shift values by desired number. @@ -567,49 +605,9 @@ def asfreq(self, freq=None, how='E'): return type(self)(new_data, freq=freq) - def to_timestamp(self, freq=None, how='start'): - """ - Cast to DatetimeArray/Index - - Parameters - ---------- - freq : string or DateOffset, optional - Target frequency. The default is 'D' for week or longer, - 'S' otherwise - how : {'s', 'e', 'start', 'end'} - - Returns - ------- - DatetimeArray/Index - """ - from pandas.core.arrays import DatetimeArrayMixin - - how = libperiod._validate_end_alias(how) - - end = how == 'E' - if end: - if freq == 'B': - # roll forward to ensure we land on B date - adjust = Timedelta(1, 'D') - Timedelta(1, 'ns') - return self.to_timestamp(how='start') + adjust - else: - adjust = Timedelta(1, 'ns') - return (self + self.freq).to_timestamp(how='start') - adjust - - if freq is None: - base, mult = frequencies.get_freq_code(self.freq) - freq = frequencies.get_to_timestamp_base(base) - else: - freq = Period._maybe_convert_freq(freq) - - base, mult = frequencies.get_freq_code(freq) - new_data = self.asfreq(freq, how=how) - - new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) - return DatetimeArrayMixin(new_data, freq='infer') - # ------------------------------------------------------------------ # Formatting + def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): """ actually format my specific types """ # TODO(DatetimeArray): remove @@ -630,9 +628,13 @@ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): values = np.array([formatter(dt) for dt in values]) return values + # Delegation... + def strftime(self, date_format): + return self._format_native_types(date_format=date_format) + def repeat(self, repeats, *args, **kwargs): """ - Repeat elements of a Categorical. + Repeat elements of a PeriodArray. See also -------- @@ -643,10 +645,6 @@ def repeat(self, repeats, *args, **kwargs): values = self._data.repeat(repeats) return type(self)(values, self.freq) - # Delegation... 
- def strftime(self, date_format): - return self._format_native_types(date_format=date_format) - def astype(self, dtype, copy=True): # TODO: Figure out something better here... # We have DatetimeLikeArrayMixin -> diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index a63b3fb53625f..672261c2a407e 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -284,6 +284,83 @@ def is_dtype(cls, dtype): return True return isinstance(dtype, np.dtype) or dtype == 'Sparse' + def update_dtype(self, dtype): + """Convert the SparseDtype to a new dtype. + + This takes care of converting the ``fill_value``. + + Parameters + ---------- + dtype : Union[str, numpy.dtype, SparseDtype] + The new dtype to use. + + * For a SparseDtype, it is simply returned + * For a NumPy dtype (or str), the current fill value + is converted to the new dtype, and a SparseDtype + with `dtype` and the new fill value is returned. + + Returns + ------- + SparseDtype + A new SparseDtype with the correct `dtype` and fill value + for that `dtype`. + + Raises + ------ + ValueError + When the current fill value cannot be converted to the + new `dtype` (e.g. trying to convert ``np.nan`` to an + integer dtype). + + + Examples + -------- + >>> SparseDtype(int, 0).update_dtype(float) + Sparse[float64, 0.0] + + >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan)) + Sparse[float64, nan] + """ + cls = type(self) + dtype = pandas_dtype(dtype) + + if not isinstance(dtype, cls): + fill_value = astype_nansafe(np.array(self.fill_value), + dtype).item() + dtype = cls(dtype, fill_value=fill_value) + + return dtype + + @property + def _subtype_with_str(self): + """ + Whether the SparseDtype's subtype should be considered ``str``. + + Typically, pandas will store string data in an object-dtype array. + When converting values to a dtype, e.g. in ``.astype``, we need to + be more specific: we need the actual underlying type. + + Examples + -------- + + >>> SparseDtype(int, 1)._subtype_with_str + dtype('int64') + + >>> SparseDtype(object, 1)._subtype_with_str + dtype('O') + + >>> dtype = SparseDtype(str, '') + >>> dtype.subtype + dtype('O') + + >>> dtype._subtype_with_str + str + """ + if isinstance(self.fill_value, compat.string_types): + return type(self.fill_value) + return self.subtype + + # ---------------------------------------------------------------------------- # Array @@ -614,7 +691,7 @@ def __array__(self, dtype=None, copy=True): # Can't put pd.NaT in a datetime64[ns] fill_value = np.datetime64('NaT') try: - dtype = np.result_type(self.sp_values.dtype, fill_value) + dtype = np.result_type(self.sp_values.dtype, type(fill_value)) except TypeError: dtype = object @@ -996,7 +1073,7 @@ def _take_with_fill(self, indices, fill_value=None): if len(self) == 0: # Empty...
Allow taking only if all empty if (indices == -1).all(): - dtype = np.result_type(self.sp_values, fill_value) + dtype = np.result_type(self.sp_values, type(fill_value)) taken = np.empty_like(indices, dtype=dtype) taken.fill(fill_value) return taken @@ -1009,7 +1086,7 @@ def _take_with_fill(self, indices, fill_value=None): if self.sp_index.npoints == 0: # Avoid taking from the empty self.sp_values taken = np.full(sp_indexer.shape, fill_value=fill_value, - dtype=np.result_type(fill_value)) + dtype=np.result_type(type(fill_value))) else: taken = self.sp_values.take(sp_indexer) @@ -1030,12 +1107,13 @@ def _take_with_fill(self, indices, fill_value=None): result_type = taken.dtype if m0.any(): - result_type = np.result_type(result_type, self.fill_value) + result_type = np.result_type(result_type, + type(self.fill_value)) taken = taken.astype(result_type) taken[old_fill_indices] = self.fill_value if m1.any(): - result_type = np.result_type(result_type, fill_value) + result_type = np.result_type(result_type, type(fill_value)) taken = taken.astype(result_type) taken[new_fill_indices] = fill_value @@ -1061,7 +1139,7 @@ def _take_without_fill(self, indices): # edge case in take... # I think just return out = np.full(indices.shape, self.fill_value, - dtype=np.result_type(self.fill_value)) + dtype=np.result_type(type(self.fill_value))) arr, sp_index, fill_value = make_sparse(out, fill_value=self.fill_value) return type(self)(arr, sparse_index=sp_index, @@ -1073,7 +1151,7 @@ def _take_without_fill(self, indices): if fillable.any(): # TODO: may need to coerce array to fill value - result_type = np.result_type(taken, self.fill_value) + result_type = np.result_type(taken, type(self.fill_value)) taken = taken.astype(result_type) taken[fillable] = self.fill_value @@ -1093,7 +1171,9 @@ def _concat_same_type(cls, to_concat): fill_value = fill_values[0] - if len(set(fill_values)) > 1: + # np.nan isn't a singleton, so we may end up with multiple + # NaNs here, so we ignore the all-NA case too. + if not (len(set(fill_values)) == 1 or isna(fill_values).all()): warnings.warn("Concatenating sparse arrays with multiple fill " "values: '{}'. Picking the first and " "converting the rest.".format(fill_values), @@ -1212,13 +1292,10 @@ def astype(self, dtype=None, copy=True): IntIndex Indices: array([2, 3], dtype=int32) """ - dtype = pandas_dtype(dtype) - - if not isinstance(dtype, SparseDtype): - dtype = SparseDtype(dtype, fill_value=self.fill_value) - + dtype = self.dtype.update_dtype(dtype) + subtype = dtype._subtype_with_str sp_values = astype_nansafe(self.sp_values, - dtype.subtype, + subtype, copy=copy) if sp_values is self.sp_values and copy: sp_values = sp_values.copy() diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 1f78e0c00bf00..9dbdd6ff8b562 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -190,6 +190,9 @@ def _generate_range(cls, start, end, periods, freq, closed=None): return cls._simple_new(index, freq=freq) + # ---------------------------------------------------------------- + # Array-Like / EA-Interface Methods + # ---------------------------------------------------------------- # Arithmetic Methods @@ -412,20 +415,25 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): array : list-like copy : bool, default False unit : str, default "ns" + The timedelta unit to treat integers as multiples of.
errors : {"raise", "coerce", "ignore"}, default "raise" + How to handle elements that cannot be converted to timedelta64[ns]. + See ``pandas.to_timedelta`` for details. Returns ------- - ndarray[timedelta64[ns]] + converted : numpy.ndarray + The sequence converted to a numpy array with dtype ``timedelta64[ns]``. inferred_freq : Tick or None + The inferred frequency of the sequence. Raises ------ - ValueError : data cannot be converted to timedelta64[ns] + ValueError : Data cannot be converted to timedelta64[ns]. Notes ----- - Unlike `pandas.to_timedelta`, if setting `errors=ignore` will not cause + Unlike `pandas.to_timedelta`, if setting ``errors=ignore`` will not cause errors to be ignored; they are caught and subsequently ignored at a higher level. """ @@ -497,12 +505,13 @@ def ints_to_td64ns(data, unit="ns"): Parameters ---------- - data : np.ndarray with integer-dtype + data : numpy.ndarray with integer-dtype unit : str, default "ns" + The timedelta unit to treat integers as multiples of. Returns ------- - ndarray[timedelta64[ns]] + numpy.ndarray : timedelta64[ns] array converted from data bool : whether a copy was made """ copy_made = False @@ -538,15 +547,18 @@ def objects_to_td64ns(data, unit="ns", errors="raise"): ---------- data : ndarray or Index unit : str, default "ns" + The timedelta unit to treat integers as multiples of. errors : {"raise", "coerce", "ignore"}, default "raise" + How to handle elements that cannot be converted to timedelta64[ns]. + See ``pandas.to_timedelta`` for details. Returns ------- - ndarray[timedelta64[ns]] + numpy.ndarray : timedelta64[ns] array converted from data Raises ------ - ValueError : data cannot be converted to timedelta64[ns] + ValueError : Data cannot be converted to timedelta64[ns]. Notes ----- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 34f25c5634d5b..2c7f6ae8e3533 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11,6 +11,8 @@ import pandas as pd from pandas._libs import properties, Timestamp, iNaT +from pandas.errors import AbstractMethodError + from pandas.core.dtypes.common import ( ensure_int64, ensure_object, @@ -200,7 +202,7 @@ def _constructor(self): """Used when a manipulation result has the same dimensions as the original. """ - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def __unicode__(self): # unicode representation based upon iterating over self @@ -221,7 +223,7 @@ def _constructor_sliced(self): """Used when a manipulation result has one lower dimension(s) as the original, such as DataFrame single columns slicing. """ - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) @property def _constructor_expanddim(self): @@ -2884,7 +2886,7 @@ def _iget_item_cache(self, item): return lower def _box_item_values(self, key, values): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _maybe_cache_changed(self, item, value): """The object has called back to us saying maybe it has changed. 
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 451f1199ac8e6..b0477c7d3a8ad 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -18,6 +18,7 @@ import pandas.compat as compat from pandas.compat import lzip, map from pandas.compat.numpy import _np_version_under1p13 +from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import maybe_downcast_to_dtype @@ -240,7 +241,7 @@ def _aggregate_generic(self, func, *args, **kwargs): return self._wrap_generic_output(result, obj) def _wrap_aggregated_output(self, output, names=None): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _aggregate_item_by_item(self, func, *args, **kwargs): # only for axis==0 @@ -1659,4 +1660,4 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): raise ValueError("axis value must be greater than 0") def _wrap_aggregated_output(self, output, names=None): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ea7507799fa9a..12327e1cf148e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -20,6 +20,7 @@ class providing the base-class of operations. import pandas.compat as compat from pandas.compat import callable, range, set_function_name, zip from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.util._validators import validate_kwargs @@ -706,7 +707,7 @@ def _iterate_slices(self): yield self._selection_name, self._selected_obj def transform(self, func, *args, **kwargs): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _cumcount_array(self, ascending=True): """ @@ -861,7 +862,7 @@ def _python_agg_general(self, func, *args, **kwargs): return self._wrap_aggregated_output(output) def _wrap_applied_output(self, *args, **kwargs): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _concat_objects(self, keys, values, not_indexed_same=False): from pandas.core.reshape.concat import concat diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 390334a89cbfe..125bd9a5e855d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -13,6 +13,7 @@ from pandas._libs import NaT, groupby as libgroupby, iNaT, lib, reduction from pandas.compat import lzip, range, zip +from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( @@ -841,7 +842,7 @@ def _chop(self, sdata, slice_obj): return sdata.iloc[slice_obj] def apply(self, f): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) class SeriesSplitter(DataSplitter): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 59429488a7c2f..4547f47314bad 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -4,44 +4,33 @@ """ import warnings -from pandas import compat -from pandas.compat.numpy import function as nv -from pandas.core.tools.timedeltas import to_timedelta - import numpy as np -from pandas._libs import lib, iNaT, NaT -from pandas._libs.tslibs.timestamps import round_nsint64, RoundTo +from pandas._libs import NaT, iNaT, lib +from pandas._libs.tslibs.timestamps import RoundTo, round_nsint64 +import 
pandas.compat as compat +from pandas.compat.numpy import function as nv +from pandas.errors import AbstractMethodError +from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - ensure_int64, - is_dtype_equal, - is_float, - is_integer, - is_list_like, - is_scalar, - is_bool_dtype, - is_period_dtype, - is_categorical_dtype, - is_datetime_or_timedelta_dtype, - is_float_dtype, - is_integer_dtype, - is_object_dtype, - is_string_dtype) -from pandas.core.dtypes.generic import ( - ABCIndex, ABCSeries, ABCIndexClass) + ensure_int64, is_bool_dtype, is_categorical_dtype, + is_datetime_or_timedelta_dtype, is_dtype_equal, is_float, is_float_dtype, + is_integer, is_integer_dtype, is_list_like, is_object_dtype, + is_period_dtype, is_scalar, is_string_dtype) +import pandas.core.dtypes.concat as _concat +from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna -from pandas.core import common as com, algorithms, ops - -import pandas.io.formats.printing as printing +from pandas.core import algorithms, ops from pandas.core.arrays import PeriodArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs -from pandas.util._decorators import Appender, cache_readonly -import pandas.core.dtypes.concat as _concat +from pandas.core.tools.timedeltas import to_timedelta + +import pandas.io.formats.printing as printing -import pandas.core.indexes.base as ibase _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -543,7 +532,7 @@ def argmax(self, axis=None, *args, **kwargs): @property def _formatter_func(self): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _format_attrs(self): """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index c82cff19573e3..b754b2705d034 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1,57 +1,45 @@ # pylint: disable=E1101 from __future__ import division + +from datetime import datetime, time, timedelta import operator import warnings -from datetime import time, datetime, timedelta import numpy as np from pytz import utc -from pandas.core.base import _shared_docs +from pandas._libs import ( + Timestamp, index as libindex, join as libjoin, lib, tslib as libts) +from pandas._libs.tslibs import ( + ccalendar, conversion, fields, parsing, timezones) +import pandas.compat as compat +from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.core.dtypes.common import ( - _INT64_DTYPE, - _NS_DTYPE, - is_datetime64_dtype, - is_datetimetz, - is_dtype_equal, - is_integer, - is_float, - is_integer_dtype, - is_datetime64_ns_dtype, - is_period_dtype, - is_string_like, - is_list_like, - is_scalar, - pandas_dtype, - ensure_int64) + _INT64_DTYPE, _NS_DTYPE, ensure_int64, is_datetime64_dtype, + is_datetime64_ns_dtype, is_datetimetz, is_dtype_equal, is_float, + is_integer, is_integer_dtype, is_list_like, is_period_dtype, is_scalar, + is_string_like, pandas_dtype) +import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna -import pandas.core.dtypes.concat as _concat -from pandas.core.arrays.datetimes import DatetimeArrayMixin, _to_m8 from pandas.core.arrays import datetimelike as dtl - +from pandas.core.arrays.datetimes import ( + DatetimeArrayMixin as DatetimeArray, _to_m8) +from 
pandas.core.base import _shared_docs +import pandas.core.common as com from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.datetimelike import ( + DatelikeOps, DatetimeIndexOpsMixin, TimelikeOps, wrap_array_method, + wrap_field_accessor) from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name -import pandas.compat as compat -from pandas.tseries.frequencies import to_offset, Resolution -from pandas.core.indexes.datetimelike import ( - DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin, - wrap_field_accessor, wrap_array_method) -from pandas.tseries.offsets import ( - CDay, prefix_mapping) - -from pandas.util._decorators import Appender, cache_readonly, Substitution -import pandas.core.common as com -import pandas.tseries.offsets as offsets import pandas.core.tools.datetimes as tools -from pandas._libs import (lib, index as libindex, tslib as libts, - join as libjoin, Timestamp) -from pandas._libs.tslibs import (timezones, conversion, fields, parsing, - ccalendar) +from pandas.tseries import offsets +from pandas.tseries.frequencies import Resolution, to_offset +from pandas.tseries.offsets import CDay, prefix_mapping def _new_DatetimeIndex(cls, d): @@ -68,7 +56,7 @@ def _new_DatetimeIndex(cls, d): return result -class DatetimeIndex(DatetimeArrayMixin, DatelikeOps, TimelikeOps, +class DatetimeIndex(DatetimeArray, DatelikeOps, TimelikeOps, DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray of datetime64 data, represented internally as int64, and @@ -182,8 +170,6 @@ class DatetimeIndex(DatetimeArrayMixin, DatelikeOps, TimelikeOps, pandas.to_datetime : Convert argument to datetime """ - _resolution = cache_readonly(DatetimeArrayMixin._resolution.fget) - _typ = 'datetimeindex' _join_precedence = 10 @@ -227,8 +213,6 @@ def _join_i8_wrapper(joinf, **kwargs): _is_numeric_dtype = False _infer_as_myclass = True - _timezone = cache_readonly(DatetimeArrayMixin._timezone.fget) - is_normalized = cache_readonly(DatetimeArrayMixin.is_normalized.fget) # -------------------------------------------------------------------- # Constructors @@ -268,8 +252,7 @@ def __new__(cls, data=None, # if dtype has an embedded tz, capture it tz = dtl.validate_tz_from_dtype(dtype, tz) - if not isinstance(data, (np.ndarray, Index, ABCSeries, - DatetimeArrayMixin)): + if not isinstance(data, (np.ndarray, Index, ABCSeries, DatetimeArray)): # other iterable of some kind if not isinstance(data, (list, tuple)): data = list(data) @@ -283,7 +266,7 @@ def __new__(cls, data=None, data = tools.to_datetime(data, dayfirst=dayfirst, yearfirst=yearfirst) - if isinstance(data, DatetimeArrayMixin): + if isinstance(data, DatetimeArray): if tz is None: tz = data.tz elif data.tz is None: @@ -1125,43 +1108,47 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # -------------------------------------------------------------------- # Wrapping DatetimeArray - year = wrap_field_accessor(DatetimeArrayMixin.year) - month = wrap_field_accessor(DatetimeArrayMixin.month) - day = wrap_field_accessor(DatetimeArrayMixin.day) - hour = wrap_field_accessor(DatetimeArrayMixin.hour) - minute = wrap_field_accessor(DatetimeArrayMixin.minute) - second = wrap_field_accessor(DatetimeArrayMixin.second) - microsecond = wrap_field_accessor(DatetimeArrayMixin.microsecond) - nanosecond = wrap_field_accessor(DatetimeArrayMixin.nanosecond) - weekofyear = wrap_field_accessor(DatetimeArrayMixin.weekofyear) + _timezone = cache_readonly(DatetimeArray._timezone.fget) + 
is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) + _resolution = cache_readonly(DatetimeArray._resolution.fget) + + year = wrap_field_accessor(DatetimeArray.year) + month = wrap_field_accessor(DatetimeArray.month) + day = wrap_field_accessor(DatetimeArray.day) + hour = wrap_field_accessor(DatetimeArray.hour) + minute = wrap_field_accessor(DatetimeArray.minute) + second = wrap_field_accessor(DatetimeArray.second) + microsecond = wrap_field_accessor(DatetimeArray.microsecond) + nanosecond = wrap_field_accessor(DatetimeArray.nanosecond) + weekofyear = wrap_field_accessor(DatetimeArray.weekofyear) week = weekofyear - dayofweek = wrap_field_accessor(DatetimeArrayMixin.dayofweek) + dayofweek = wrap_field_accessor(DatetimeArray.dayofweek) weekday = dayofweek - weekday_name = wrap_field_accessor(DatetimeArrayMixin.weekday_name) + weekday_name = wrap_field_accessor(DatetimeArray.weekday_name) - dayofyear = wrap_field_accessor(DatetimeArrayMixin.dayofyear) - quarter = wrap_field_accessor(DatetimeArrayMixin.quarter) - days_in_month = wrap_field_accessor(DatetimeArrayMixin.days_in_month) + dayofyear = wrap_field_accessor(DatetimeArray.dayofyear) + quarter = wrap_field_accessor(DatetimeArray.quarter) + days_in_month = wrap_field_accessor(DatetimeArray.days_in_month) daysinmonth = days_in_month - is_month_start = wrap_field_accessor(DatetimeArrayMixin.is_month_start) - is_month_end = wrap_field_accessor(DatetimeArrayMixin.is_month_end) - is_quarter_start = wrap_field_accessor(DatetimeArrayMixin.is_quarter_start) - is_quarter_end = wrap_field_accessor(DatetimeArrayMixin.is_quarter_end) - is_year_start = wrap_field_accessor(DatetimeArrayMixin.is_year_start) - is_year_end = wrap_field_accessor(DatetimeArrayMixin.is_year_end) - is_leap_year = wrap_field_accessor(DatetimeArrayMixin.is_leap_year) - - tz_localize = wrap_array_method(DatetimeArrayMixin.tz_localize, True) - tz_convert = wrap_array_method(DatetimeArrayMixin.tz_convert, True) - to_perioddelta = wrap_array_method(DatetimeArrayMixin.to_perioddelta, + is_month_start = wrap_field_accessor(DatetimeArray.is_month_start) + is_month_end = wrap_field_accessor(DatetimeArray.is_month_end) + is_quarter_start = wrap_field_accessor(DatetimeArray.is_quarter_start) + is_quarter_end = wrap_field_accessor(DatetimeArray.is_quarter_end) + is_year_start = wrap_field_accessor(DatetimeArray.is_year_start) + is_year_end = wrap_field_accessor(DatetimeArray.is_year_end) + is_leap_year = wrap_field_accessor(DatetimeArray.is_leap_year) + + tz_localize = wrap_array_method(DatetimeArray.tz_localize, True) + tz_convert = wrap_array_method(DatetimeArray.tz_convert, True) + to_perioddelta = wrap_array_method(DatetimeArray.to_perioddelta, False) - to_period = wrap_array_method(DatetimeArrayMixin.to_period, True) - normalize = wrap_array_method(DatetimeArrayMixin.normalize, True) - to_julian_date = wrap_array_method(DatetimeArrayMixin.to_julian_date, + to_period = wrap_array_method(DatetimeArray.to_period, True) + normalize = wrap_array_method(DatetimeArray.normalize, True) + to_julian_date = wrap_array_method(DatetimeArray.to_julian_date, False) - month_name = wrap_array_method(DatetimeArrayMixin.month_name, True) - day_name = wrap_array_method(DatetimeArrayMixin.day_name, True) + month_name = wrap_array_method(DatetimeArray.month_name, True) + day_name = wrap_array_method(DatetimeArray.day_name, True) # -------------------------------------------------------------------- diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 
128068959ebd3..7890f03a1eba7 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -256,8 +256,12 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): result._reset_identity() return result + # ------------------------------------------------------------------------ + # Wrapping PeriodArray + # ------------------------------------------------------------------------ # Data + @property def _ndarray_values(self): return self._data._ndarray_values @@ -361,13 +365,6 @@ def asfreq(self, freq=None, how='E'): result = self._data.asfreq(freq=freq, how=how) return self._simple_new(result, name=self.name) - def _nat_new(self, box=True): - # TODO(DatetimeArray): remove this - result = self._data._nat_new(box=box) - if box: - result = self._simple_new(result, name=self.name) - return result - def to_timestamp(self, freq=None, how='start'): from pandas import DatetimeIndex result = self._data.to_timestamp(freq=freq, how=how) @@ -425,6 +422,7 @@ def _maybe_convert_timedelta(self, other): # ------------------------------------------------------------------------ # Indexing + @cache_readonly def _engine(self): return self._engine_type(lambda: self, len(self)) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 35e17c7400892..d9625d38b85de 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -2,44 +2,36 @@ from datetime import datetime import numpy as np + +from pandas._libs import ( + NaT, Timedelta, index as libindex, join as libjoin, lib) +import pandas.compat as compat +from pandas.util._decorators import Appender, Substitution + from pandas.core.dtypes.common import ( - _TD_DTYPE, - is_integer, - is_float, - is_list_like, - is_scalar, - is_timedelta64_dtype, - is_timedelta64_ns_dtype, - pandas_dtype, - ensure_int64) + _TD_DTYPE, ensure_int64, is_float, is_integer, is_list_like, is_scalar, + is_timedelta64_dtype, is_timedelta64_ns_dtype, pandas_dtype) +import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna +from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays.timedeltas import ( - TimedeltaArrayMixin, _is_convertible_to_td, _to_m8, + TimedeltaArrayMixin as TimedeltaArray, _is_convertible_to_td, _to_m8, sequence_to_td64ns) -from pandas.core.arrays import datetimelike as dtl - -from pandas.core.indexes.base import Index -from pandas.core.indexes.numeric import Int64Index -import pandas.compat as compat - -from pandas.tseries.frequencies import to_offset from pandas.core.base import _shared_docs -from pandas.core.indexes.base import _index_shared_docs import pandas.core.common as com -from pandas.core.ops import get_op_result_name -import pandas.core.dtypes.concat as _concat -from pandas.util._decorators import Appender, Substitution +from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.datetimelike import ( - TimelikeOps, DatetimeIndexOpsMixin, wrap_arithmetic_op, - wrap_array_method, wrap_field_accessor) -from pandas.core.tools.timedeltas import ( - _coerce_scalar_to_timedelta_type) -from pandas._libs import (lib, index as libindex, - join as libjoin, Timedelta, NaT) + DatetimeIndexOpsMixin, TimelikeOps, wrap_arithmetic_op, wrap_array_method, + wrap_field_accessor) +from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops import get_op_result_name +from pandas.core.tools.timedeltas import _coerce_scalar_to_timedelta_type + +from pandas.tseries.frequencies import to_offset -class 
TimedeltaIndex(TimedeltaArrayMixin, DatetimeIndexOpsMixin, +class TimedeltaIndex(TimedeltaArray, DatetimeIndexOpsMixin, TimelikeOps, Int64Index): """ Immutable ndarray of timedelta64 data, represented internally as int64, and @@ -223,8 +215,7 @@ def _maybe_update_attributes(self, attrs): return attrs def _evaluate_with_timedelta_like(self, other, op): - result = TimedeltaArrayMixin._evaluate_with_timedelta_like(self, other, - op) + result = TimedeltaArray._evaluate_with_timedelta_like(self, other, op) return wrap_arithmetic_op(self, other, result) def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): @@ -236,12 +227,12 @@ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): # ------------------------------------------------------------------- # Wrapping TimedeltaArray - days = wrap_field_accessor(TimedeltaArrayMixin.days) - seconds = wrap_field_accessor(TimedeltaArrayMixin.seconds) - microseconds = wrap_field_accessor(TimedeltaArrayMixin.microseconds) - nanoseconds = wrap_field_accessor(TimedeltaArrayMixin.nanoseconds) + days = wrap_field_accessor(TimedeltaArray.days) + seconds = wrap_field_accessor(TimedeltaArray.seconds) + microseconds = wrap_field_accessor(TimedeltaArray.microseconds) + nanoseconds = wrap_field_accessor(TimedeltaArray.nanoseconds) - total_seconds = wrap_array_method(TimedeltaArrayMixin.total_seconds, True) + total_seconds = wrap_array_method(TimedeltaArray.total_seconds, True) # ------------------------------------------------------------------- diff --git a/pandas/io/common.py b/pandas/io/common.py index 155cf566b4c40..3a67238a66450 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -10,12 +10,11 @@ import pandas.compat as compat from pandas.compat import BytesIO, StringIO, string_types, text_type from pandas.errors import ( # noqa - DtypeWarning, EmptyDataError, ParserError, ParserWarning) + AbstractMethodError, DtypeWarning, EmptyDataError, ParserError, + ParserWarning) from pandas.core.dtypes.common import is_file_like, is_number -import pandas.core.common as com - from pandas.io.formats.printing import pprint_thing # gh-12665: Alias for now and remove later. @@ -67,7 +66,7 @@ def __iter__(self): return self def __next__(self): - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) if not compat.PY3: diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 2e93c237bb7ea..c25a7670cce44 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -95,6 +95,10 @@ usecols : int, str, list-like, or callable default None * If None, then parse all columns, * If int, then indicates last column to be parsed + + .. deprecated:: 0.24.0 + Pass in a list of ints from 0 to `usecols` inclusive instead. + * If string, then indicates comma separated list of Excel column letters and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of both sides. @@ -778,6 +782,10 @@ def _maybe_convert_usecols(usecols): return usecols if is_integer(usecols): + warnings.warn(("Passing in an integer for `usecols` has been " "deprecated.
Please pass in a list of ints from " + "0 to `usecols` inclusive instead."), + FutureWarning, stacklevel=2) return lrange(usecols + 1) if isinstance(usecols, compat.string_types): diff --git a/pandas/io/html.py b/pandas/io/html.py index bcbb07c6dddfb..c967bdd29df1f 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -12,12 +12,11 @@ from pandas.compat import ( binary_type, iteritems, lmap, lrange, raise_with_traceback, string_types, u) -from pandas.errors import EmptyDataError +from pandas.errors import AbstractMethodError, EmptyDataError from pandas.core.dtypes.common import is_list_like from pandas import Series -import pandas.core.common as com from pandas.io.common import _is_url, _validate_header_arg, urlopen from pandas.io.formats.printing import pprint_thing @@ -256,7 +255,7 @@ def _text_getter(self, obj): text : str or unicode The text from an individual DOM node. """ - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _parse_td(self, obj): """Return the td elements from a row element. @@ -271,7 +270,7 @@ def _parse_td(self, obj): list of node-like These are the elements of each row, i.e., the columns. """ - raise com.AbstractMethodError(self) + raise AbstractMethodError(self) def _parse_thead_tr(self, table): """ @@ -286,7 +285,7 @@ def _parse_thead_tr(self, table): list of node-like These are the