diff --git a/.gitignore b/.gitignore index 4bbbcad0c97adb..96b1f945870de0 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,7 @@ dist .coverage coverage.xml coverage_html_report +*.pytest_cache # OS generated files # ###################### @@ -90,7 +91,6 @@ scikits # Unit / Performance Testing # ############################## -.pytest_cache/ asv_bench/env/ asv_bench/html/ asv_bench/results/ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf index 0492805a1408b1..696ed288cf7a60 100644 Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf and b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx index 6cca9ac4647f7e..f8b98a6f1f8e4a 100644 Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx and b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx differ diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 9a5e184884e76c..f7fc9575566b78 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -246,16 +246,16 @@ changes in this branch specific to one bug or feature so it is clear what the branch brings to *pandas*. You can have many shiny-new-features and switch in between them using the git checkout command. -To update this branch, you need to retrieve the changes from the master branch:: +When creating this branch, make sure your master branch is up to date with +the latest upstream master version. To update your local master branch, you +can do:: - git fetch upstream - git rebase upstream/master + git checkout master + git pull upstream master --ff-only -This will replay your commits on top of the latest pandas git master. If this -leads to merge conflicts, you must resolve these before submitting your pull -request. If you have uncommitted changes, you will need to ``stash`` them prior -to updating. This will effectively store your changes and they can be reapplied -after updating. +When you want to update the feature branch with changes in master after +you created the branch, check the section on +:ref:`updating a PR `. .. _contributing.documentation: @@ -964,32 +964,6 @@ Now you can commit your changes in your local repository:: git commit -m -Combining commits ------------------ - -If you have multiple commits, you may want to combine them into one commit, often -referred to as "squashing" or "rebasing". This is a common request by package maintainers -when submitting a pull request as it maintains a more compact commit history. To rebase -your commits:: - - git rebase -i HEAD~# - -Where # is the number of commits you want to combine. Then you can pick the relevant -commit message and discard others. - -To squash to the master branch do:: - - git rebase -i master - -Use the ``s`` option on a commit to ``squash``, meaning to keep the commit messages, -or ``f`` to ``fixup``, meaning to merge the commit messages. - -Then you will need to push the branch (see below) forcefully to replace the current -commits with the new ones:: - - git push origin shiny-new-feature -f - - Pushing your changes -------------------- @@ -1045,15 +1019,51 @@ release. To submit a pull request: #. Click ``Send Pull Request``. This request then goes to the repository maintainers, and they will review -the code. If you need to make more changes, you can make them in -your branch, push them to GitHub, and the pull request will be automatically -updated. Pushing them to GitHub again is done by:: +the code. + +.. 
_contributing.update-pr: - git push -f origin shiny-new-feature +Updating your pull request +-------------------------- + +Based on the review you get on your pull request, you will probably need to make +some changes to the code. In that case, you can make them in your branch, +add a new commit to that branch, push it to GitHub, and the pull request will be +automatically updated. Pushing them to GitHub again is done by:: + + git push origin shiny-new-feature This will automatically update your pull request with the latest code and restart the :ref:`Continuous Integration ` tests. +Another reason you might need to update your pull request is to solve conflicts +with changes that have been merged into the master branch since you opened your +pull request. + +To do this, you need to "merge upstream master" in your branch:: + + git checkout shiny-new-feature + git fetch upstream + git merge upstream/master + +If there are no conflicts (or they could be fixed automatically), a file with a +default commit message will open, and you can simply save and quit this file. + +If there are merge conflicts, you need to solve those conflicts. See for +example at https://help.github.com/articles/resolving-a-merge-conflict-using-the-command-line/ +for an explanation on how to do this. +Once the conflicts are merged and the files where the conflicts were solved are +added, you can run ``git commit`` to save those fixes. + +If you have uncommitted changes at the moment you want to update the branch with +master, you will need to ``stash`` them prior to updating (see the +`stash docs `__). +This will effectively store your changes and they can be reapplied after updating. + +After the feature branch has been update locally, you can now update your pull +request by pushing to the branch on GitHub:: + + git push origin shiny-new-feature Delete your merged branch (optional) ------------------------------------ diff --git a/doc/source/merging.rst b/doc/source/merging.rst index cfd3f9e88e4eaf..b17261f54f1867 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -31,7 +31,7 @@ operations. Concatenating objects --------------------- -The :func:`~pandas.concat` function (in the main pandas namespace) does all of +The :func:`~pandas.concat` function (in the main pandas namespace) does all of the heavy lifting of performing concatenation operations along an axis while performing optional set logic (union or intersection) of the indexes (if any) on the other axes. Note that I say "if any" because there is only a single possible @@ -109,9 +109,9 @@ some configurable handling of "what to do with the other axes": to the actual data concatenation. - ``copy`` : boolean, default True. If False, do not copy data unnecessarily. -Without a little bit of context many of these arguments don't make much sense. -Let's revisit the above example. Suppose we wanted to associate specific keys -with each of the pieces of the chopped up DataFrame. We can do this using the +Without a little bit of context many of these arguments don't make much sense. +Let's revisit the above example. Suppose we wanted to associate specific keys +with each of the pieces of the chopped up DataFrame. We can do this using the ``keys`` argument: .. ipython:: python @@ -138,9 +138,9 @@ It's not a stretch to see how this can be very useful. More detail on this functionality below. .. 
note:: - It is worth noting that :func:`~pandas.concat` (and therefore - :func:`~pandas.append`) makes a full copy of the data, and that constantly - reusing this function can create a significant performance hit. If you need + It is worth noting that :func:`~pandas.concat` (and therefore + :func:`~pandas.append`) makes a full copy of the data, and that constantly + reusing this function can create a significant performance hit. If you need to use the operation over several datasets, use a list comprehension. :: @@ -153,7 +153,7 @@ Set logic on the other axes ~~~~~~~~~~~~~~~~~~~~~~~~~~~ When gluing together multiple DataFrames, you have a choice of how to handle -the other axes (other than the one being concatenated). This can be done in +the other axes (other than the one being concatenated). This can be done in the following three ways: - Take the (sorted) union of them all, ``join='outer'``. This is the default @@ -216,8 +216,8 @@ DataFrame: Concatenating using ``append`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A useful shortcut to :func:`~pandas.concat` are the :meth:`~DataFrame.append` -instance methods on ``Series`` and ``DataFrame``. These methods actually predated +A useful shortcut to :func:`~pandas.concat` are the :meth:`~DataFrame.append` +instance methods on ``Series`` and ``DataFrame``. These methods actually predated ``concat``. They concatenate along ``axis=0``, namely the index: .. ipython:: python @@ -263,8 +263,8 @@ need to be: .. note:: - Unlike the :py:meth:`~list.append` method, which appends to the original list - and returns ``None``, :meth:`~DataFrame.append` here **does not** modify + Unlike the :py:meth:`~list.append` method, which appends to the original list + and returns ``None``, :meth:`~DataFrame.append` here **does not** modify ``df1`` and returns its copy with ``df2`` appended. .. _merging.ignore_index: @@ -362,9 +362,9 @@ Passing ``ignore_index=True`` will drop all name references. More concatenating with group keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A fairly common use of the ``keys`` argument is to override the column names +A fairly common use of the ``keys`` argument is to override the column names when creating a new ``DataFrame`` based on existing ``Series``. -Notice how the default behaviour consists on letting the resulting ``DataFrame`` +Notice how the default behaviour consists on letting the resulting ``DataFrame`` inherit the parent ``Series``' name, when these existed. .. ipython:: python @@ -460,7 +460,7 @@ Appending rows to a DataFrame ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ While not especially efficient (since a new object must be created), you can -append a single row to a ``DataFrame`` by passing a ``Series`` or dict to +append a single row to a ``DataFrame`` by passing a ``Series`` or dict to ``append``, which returns a new ``DataFrame`` as above. .. ipython:: python @@ -505,7 +505,7 @@ pandas has full-featured, **high performance** in-memory join operations idiomatically very similar to relational databases like SQL. These methods perform significantly better (in some cases well over an order of magnitude better) than other open source implementations (like ``base::merge.data.frame`` -in R). The reason for this is careful algorithmic design and the internal layout +in R). The reason for this is careful algorithmic design and the internal layout of the data in ``DataFrame``. See the :ref:`cookbook` for some advanced strategies. @@ -513,7 +513,7 @@ See the :ref:`cookbook` for some advanced strategies. 
Users who are familiar with SQL but new to pandas might be interested in a :ref:`comparison with SQL`. -pandas provides a single function, :func:`~pandas.merge`, as the entry point for +pandas provides a single function, :func:`~pandas.merge`, as the entry point for all standard database join operations between ``DataFrame`` objects: :: @@ -582,7 +582,7 @@ and ``right`` is a subclass of DataFrame, the return type will still be ``DataFrame``. ``merge`` is a function in the pandas namespace, and it is also available as a -``DataFrame`` instance method :meth:`~DataFrame.merge`, with the calling +``DataFrame`` instance method :meth:`~DataFrame.merge`, with the calling ``DataFrame `` being implicitly considered the left object in the join. The related :meth:`~DataFrame.join` method, uses ``merge`` internally for the @@ -594,7 +594,7 @@ Brief primer on merge methods (relational algebra) Experienced users of relational databases like SQL will be familiar with the terminology used to describe join operations between two SQL-table like -structures (``DataFrame`` objects). There are several cases to consider which +structures (``DataFrame`` objects). There are several cases to consider which are very important to understand: - **one-to-one** joins: for example when joining two ``DataFrame`` objects on @@ -634,8 +634,8 @@ key combination: labels=['left', 'right'], vertical=False); plt.close('all'); -Here is a more complicated example with multiple join keys. Only the keys -appearing in ``left`` and ``right`` are present (the intersection), since +Here is a more complicated example with multiple join keys. Only the keys +appearing in ``left`` and ``right`` are present (the intersection), since ``how='inner'`` by default. .. ipython:: python @@ -751,13 +751,13 @@ Checking for duplicate keys .. versionadded:: 0.21.0 -Users can use the ``validate`` argument to automatically check whether there -are unexpected duplicates in their merge keys. Key uniqueness is checked before -merge operations and so should protect against memory overflows. Checking key -uniqueness is also a good way to ensure user data structures are as expected. +Users can use the ``validate`` argument to automatically check whether there +are unexpected duplicates in their merge keys. Key uniqueness is checked before +merge operations and so should protect against memory overflows. Checking key +uniqueness is also a good way to ensure user data structures are as expected. -In the following example, there are duplicate values of ``B`` in the right -``DataFrame``. As this is not a one-to-one merge -- as specified in the +In the following example, there are duplicate values of ``B`` in the right +``DataFrame``. As this is not a one-to-one merge -- as specified in the ``validate`` argument -- an exception will be raised. @@ -770,11 +770,11 @@ In the following example, there are duplicate values of ``B`` in the right In [53]: result = pd.merge(left, right, on='B', how='outer', validate="one_to_one") ... - MergeError: Merge keys are not unique in right dataset; not a one-to-one merge + MergeError: Merge keys are not unique in right dataset; not a one-to-one merge -If the user is aware of the duplicates in the right ``DataFrame`` but wants to -ensure there are no duplicates in the left DataFrame, one can use the -``validate='one_to_many'`` argument instead, which will not raise an exception. 
+If the user is aware of the duplicates in the right ``DataFrame`` but wants to +ensure there are no duplicates in the left DataFrame, one can use the +``validate='one_to_many'`` argument instead, which will not raise an exception. .. ipython:: python @@ -786,8 +786,8 @@ ensure there are no duplicates in the left DataFrame, one can use the The merge indicator ~~~~~~~~~~~~~~~~~~~ -:func:`~pandas.merge` accepts the argument ``indicator``. If ``True``, a -Categorical-type column called ``_merge`` will be added to the output object +:func:`~pandas.merge` accepts the argument ``indicator``. If ``True``, a +Categorical-type column called ``_merge`` will be added to the output object that takes on values: =================================== ================ @@ -895,7 +895,7 @@ Joining on index ~~~~~~~~~~~~~~~~ :meth:`DataFrame.join` is a convenient method for combining the columns of two -potentially differently-indexed ``DataFrames`` into a single result +potentially differently-indexed ``DataFrames`` into a single result ``DataFrame``. Here is a very basic example: .. ipython:: python @@ -975,7 +975,7 @@ indexes: Joining key columns on an index ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:meth:`~DataFrame.join` takes an optional ``on`` argument which may be a column +:meth:`~DataFrame.join` takes an optional ``on`` argument which may be a column or multiple column names, which specifies that the passed ``DataFrame`` is to be aligned on that column in the ``DataFrame``. These two function calls are completely equivalent: @@ -987,7 +987,7 @@ completely equivalent: how='left', sort=False) Obviously you can choose whichever form you find more convenient. For -many-to-one joins (where one of the ``DataFrame``'s is already indexed by the +many-to-one joins (where one of the ``DataFrame``'s is already indexed by the join key), using ``join`` may be more convenient. Here is a simple example: .. ipython:: python @@ -1125,20 +1125,29 @@ This is equivalent but less verbose and more memory efficient / faster than this Joining with two multi-indexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This is not implemented via ``join`` at-the-moment, however it can be done using -the following code. +.. versionadded:: 0.23.1 + +You can join a multi-indexed ``Dataframe`` on the overlapping names of another multi-indexed ``Dataframe`` .. ipython:: python - index = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), - ('K1', 'X2')], - names=['key', 'X']) + index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), + ('K1', 'X2')], + names=['key', 'X']) + left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']}, - index=index) + index=index_left) - result = pd.merge(left.reset_index(), right.reset_index(), - on=['key'], how='inner').set_index(['key','X','Y']) + index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), + ('K2', 'Y2'), ('K2', 'Y3')], + names=['key', 'Y']) + + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3']}, + index=index_right) + + left.join(right) .. ipython:: python :suppress: @@ -1148,6 +1157,13 @@ the following code. labels=['left', 'right'], vertical=False); plt.close('all'); +For earlier versions it can be done using the following. + +.. ipython:: python + + pd.merge(left.reset_index(), right.reset_index(), + on=['key'], how='inner').set_index(['key','X','Y']) + .. _merging.merge_on_columns_and_levels: Merging on a combination of columns and index levels @@ -1254,7 +1270,7 @@ similarly. 
Joining multiple DataFrame or Panel objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A list or tuple of ``DataFrames`` can also be passed to :meth:`~DataFrame.join` +A list or tuple of ``DataFrames`` can also be passed to :meth:`~DataFrame.join` to join them together on their indexes. .. ipython:: python @@ -1276,7 +1292,7 @@ Merging together values within Series or DataFrame columns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Another fairly common situation is to have two like-indexed (or similarly -indexed) ``Series`` or ``DataFrame`` objects and wanting to "patch" values in +indexed) ``Series`` or ``DataFrame`` objects and wanting to "patch" values in one object from values for matching indices in the other. Here is an example: .. ipython:: python @@ -1301,7 +1317,7 @@ For this, use the :meth:`~DataFrame.combine_first` method: plt.close('all'); Note that this method only takes values from the right ``DataFrame`` if they are -missing in the left ``DataFrame``. A related method, :meth:`~DataFrame.update`, +missing in the left ``DataFrame``. A related method, :meth:`~DataFrame.update`, alters non-NA values inplace: .. ipython:: python @@ -1353,15 +1369,15 @@ Merging AsOf .. versionadded:: 0.19.0 -A :func:`merge_asof` is similar to an ordered left-join except that we match on -nearest key rather than equal keys. For each row in the ``left`` ``DataFrame``, -we select the last row in the ``right`` ``DataFrame`` whose ``on`` key is less +A :func:`merge_asof` is similar to an ordered left-join except that we match on +nearest key rather than equal keys. For each row in the ``left`` ``DataFrame``, +we select the last row in the ``right`` ``DataFrame`` whose ``on`` key is less than the left's key. Both DataFrames must be sorted by the key. -Optionally an asof merge can perform a group-wise merge. This matches the +Optionally an asof merge can perform a group-wise merge. This matches the ``by`` key equally, in addition to the nearest match on the ``on`` key. -For example; we might have ``trades`` and ``quotes`` and we want to ``asof`` +For example; we might have ``trades`` and ``quotes`` and we want to ``asof`` merge them. .. ipython:: python @@ -1420,8 +1436,8 @@ We only asof within ``2ms`` between the quote time and the trade time. by='ticker', tolerance=pd.Timedelta('2ms')) -We only asof within ``10ms`` between the quote time and the trade time and we -exclude exact matches on time. Note that though we exclude the exact matches +We only asof within ``10ms`` between the quote time and the trade time and we +exclude exact matches on time. Note that though we exclude the exact matches (of the quotes), prior quotes **do** propagate to that point in time. .. ipython:: python diff --git a/doc/source/tutorials.rst b/doc/source/tutorials.rst index 85e455de7d246e..895fe595de2057 100644 --- a/doc/source/tutorials.rst +++ b/doc/source/tutorials.rst @@ -13,6 +13,8 @@ pandas' own :ref:`10 Minutes to pandas<10min>`. More complex recipes are in the :ref:`Cookbook`. +A handy pandas `cheat sheet `_. + pandas Cookbook --------------- diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index c08e22af295f43..3dd23c087d5963 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -59,6 +59,40 @@ The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtyp pd.get_dummies(df, columns=['c']).dtypes pd.get_dummies(df, columns=['c'], dtype=bool).dtypes +.. 
_whatsnew_0230.enhancements.join_with_two_multiindexes: + +Joining with two multi-indexes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +As of Pandas 0.23.1 the :func:`Dataframe.join` can be used to join multi-indexed ``Dataframe`` instances on the overlaping index levels + +See the :ref:`Merge, join, and concatenate +` documentation section. + +.. ipython:: python + + index_left = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), + ('K1', 'X2')], + names=['key', 'X']) + left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], + 'B': ['B0', 'B1', 'B2']}, + index=index_left) + + index_right = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), + ('K2', 'Y2'), ('K2', 'Y3')], + names=['key', 'Y']) + right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], + 'D': ['D0', 'D1', 'D2', 'D3']}, + index=index_right) + + left.join(right) + +For earlier versions it can be done using the following. + +.. ipython:: python + + pd.merge(left.reset_index(), right.reset_index(), + on=['key'], how='inner').set_index(['key','X','Y']) .. _whatsnew_0230.enhancements.merge_on_columns_and_levels: @@ -713,6 +747,7 @@ Other API Changes - ``Categorical.fillna`` now validates its ``value`` and ``method`` keyword arguments. It now raises when both or none are specified, matching the behavior of :meth:`Series.fillna` (:issue:`19682`) - ``pd.to_datetime('today')`` now returns a datetime, consistent with ``pd.Timestamp('today')``; previously ``pd.to_datetime('today')`` returned a ``.normalized()`` datetime (:issue:`19935`) - :func:`Series.str.replace` now takes an optional `regex` keyword which, when set to ``False``, uses literal string replacement rather than regex replacement (:issue:`16808`) +- :func:`DatetimeIndex.strftime` and :func:`PeriodIndex.strftime` now return an ``Index`` instead of a numpy array to be consistent with similar accessors (:issue:`20127`) .. _whatsnew_0230.deprecations: @@ -738,6 +773,7 @@ Deprecations - :attr:`Timestamp.weekday_name`, :attr:`DatetimeIndex.weekday_name`, and :attr:`Series.dt.weekday_name` are deprecated in favor of :meth:`Timestamp.day_name`, :meth:`DatetimeIndex.day_name`, and :meth:`Series.dt.day_name` (:issue:`12806`) - ``pandas.tseries.plotting.tsplot`` is deprecated. Use :func:`Series.plot` instead (:issue:`18627`) +- ``Index.summary()`` is deprecated and will be removed in a future version (:issue:`18217`) .. _whatsnew_0230.prior_deprecations: @@ -843,6 +879,9 @@ Categorical - Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`) - Bug in :class:`Index` constructor with ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19032`) - Bug in :class:`Series` constructor with scalar and ``dtype=CategoricalDtype(...)`` where ``categories`` and ``ordered`` are not maintained (issue:`19565`) +- Bug in ``Categorical.__iter__`` not converting to Python types (:issue:`19909`) +- Bug in :func:`pandas.factorize` returning the unique codes for the ``uniques``. 
This now returns a ``Categorical`` with the same dtype as the input (:issue:`19721`) +- Bug in :func:`pandas.factorize` including an item for missing values in the ``uniques`` return value (:issue:`19721`) Datetimelike ^^^^^^^^^^^^ @@ -939,6 +978,7 @@ Indexing - Bug in :class:`IntervalIndex` where set operations that returned an empty ``IntervalIndex`` had the wrong dtype (:issue:`19101`) - Bug in :meth:`DataFrame.drop_duplicates` where no ``KeyError`` is raised when passing in columns that don't exist on the ``DataFrame`` (issue:`19726`) - Bug in ``Index`` subclasses constructors that ignore unexpected keyword arguments (:issue:`19348`) +- Bug in :meth:`Index.difference` when taking difference of an ``Index`` with itself (:issue:`20040`) MultiIndex diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 2baf8c47ad7e35..008747c0a9e789 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1350,6 +1350,32 @@ cdef class _Period(object): @property def week(self): + """ + Get the week of the year on the given Period. + + Returns + ------- + int + + See Also + -------- + Period.dayofweek : Get the day component of the Period. + Period.weekday : Get the day component of the Period. + + Examples + -------- + >>> p = pd.Period("2018-03-11", "H") + >>> p.week + 10 + + >>> p = pd.Period("2018-02-01", "D") + >>> p.week + 5 + + >>> p = pd.Period("2018-01-06", "D") + >>> p.week + 1 + """ return self.weekofyear @property diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9101fca58d5fa5..de2e638265f1eb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -435,6 +435,35 @@ def isin(comps, values): return f(comps, values) +def _factorize_array(values, check_nulls, na_sentinel=-1, size_hint=None): + """Factorize an array-like to labels and uniques. + + This doesn't do any coercion of types or unboxing before factorization. + + Parameters + ---------- + values : ndarray + check_nulls : bool + Whether to check for nulls in the hashtable's 'get_labels' method. + na_sentinel : int, default -1 + size_hint : int, optional + Passsed through to the hashtable's 'get_labels' method + + Returns + ------- + labels, uniques : ndarray + """ + (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) + + table = hash_klass(size_hint or len(values)) + uniques = vec_klass() + labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) + + labels = _ensure_platform_int(labels) + uniques = uniques.to_array() + return labels, uniques + + @deprecate_kwarg(old_arg_name='order', new_arg_name=None) def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ @@ -442,8 +471,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): Parameters ---------- - values : ndarray (1-d) - Sequence + values : Sequence + ndarrays must be 1-D. Sequences that aren't pandas objects are + coereced to ndarrays before factorization. sort : boolean, default False Sort by values na_sentinel : int, default -1 @@ -458,26 +488,43 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): Series note: an array of Periods will ignore sort as it returns an always sorted - PeriodIndex + PeriodIndex. """ + # Implementation notes: This method is responsible for 3 things + # 1.) coercing data to array-like (ndarray, Index, extension array) + # 2.) factorizing labels and uniques + # 3.) 
Maybe boxing the output in an Index + # + # Step 2 is dispatched to extension types (like Categorical). They are + # responsible only for factorization. All data coercion, sorting and boxing + # should happen here. values = _ensure_arraylike(values) original = values - values, dtype, _ = _ensure_data(values) - (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) - - table = hash_klass(size_hint or len(values)) - uniques = vec_klass() - check_nulls = not is_integer_dtype(original) - labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls) - labels = _ensure_platform_int(labels) - uniques = uniques.to_array() + if is_categorical_dtype(values): + values = getattr(values, '_values', values) + labels, uniques = values.factorize() + dtype = original.dtype + else: + values, dtype, _ = _ensure_data(values) + check_nulls = not is_integer_dtype(original) + labels, uniques = _factorize_array(values, check_nulls, + na_sentinel=na_sentinel, + size_hint=size_hint) if sort and len(uniques) > 0: from pandas.core.sorting import safe_sort - uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, - assume_unique=True) + try: + order = uniques.argsort() + order2 = order.argsort() + labels = take_1d(order2, labels, fill_value=na_sentinel) + uniques = uniques.take(order) + except TypeError: + # Mixed types, where uniques.argsort fails. + uniques, labels = safe_sort(uniques, labels, + na_sentinel=na_sentinel, + assume_unique=True) uniques = _reconstruct_data(uniques, dtype, original) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1f33081a5f610e..fa565aa802faf0 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -236,6 +236,59 @@ def isna(self): """ raise AbstractMethodError(self) + def fillna(self, value=None, method=None, limit=None): + """ Fill NA/NaN values using the specified method. + + Parameters + ---------- + value : scalar, array-like + If a scalar value is passed it is used to fill all missing values. + Alternatively, an array-like 'value' can be given. It's expected + that the array-like have the same length as 'self'. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + limit : int, default None + If method is specified, this is the maximum number of consecutive + NaN values to forward/backward fill. In other words, if there is + a gap with more than this number of consecutive NaNs, it will only + be partially filled. If method is not specified, this is the + maximum number of entries along the entire axis where NaNs will be + filled. + + Returns + ------- + filled : ExtensionArray with NA/NaN filled + """ + from pandas.api.types import is_scalar + from pandas.util._validators import validate_fillna_kwargs + from pandas.core.missing import pad_1d, backfill_1d + + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + if not is_scalar(value): + if len(value) != len(self): + raise ValueError("Length of 'value' does not match. 
Got ({}) " + " expected {}".format(len(value), len(self))) + value = value[mask] + + if mask.any(): + if method is not None: + func = pad_1d if method == 'pad' else backfill_1d + new_values = func(self.astype(object), limit=limit, + mask=mask) + new_values = self._constructor_from_sequence(new_values) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values + def unique(self): """Compute the ExtensionArray of unique values. @@ -285,6 +338,7 @@ def take(self, indexer, allow_fill=True, fill_value=None): .. code-block:: python def take(self, indexer, allow_fill=True, fill_value=None): + indexer = np.asarray(indexer) mask = indexer == -1 result = self.data.take(indexer) result[mask] = np.nan # NA for this type diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e23dc3b3e5b89f..e7d414f9de5449 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -7,6 +7,7 @@ from pandas import compat from pandas.compat import u, lzip from pandas._libs import lib, algos as libalgos +from pandas._libs.tslib import iNaT from pandas.core.dtypes.generic import ( ABCSeries, ABCIndexClass, ABCCategoricalIndex) @@ -364,10 +365,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, self._dtype = self._dtype.update_dtype(dtype) self._codes = coerce_indexer_dtype(codes, dtype.categories) - @classmethod - def _constructor_from_sequence(cls, scalars): - return cls(scalars) - @property def categories(self): """The categories of this categorical. @@ -425,6 +422,10 @@ def _ndarray_values(self): def _constructor(self): return Categorical + @classmethod + def _constructor_from_sequence(cls, scalars): + return Categorical(scalars) + def copy(self): """ Copy constructor. """ return self._constructor(values=self._codes.copy(), @@ -479,9 +480,7 @@ def tolist(self): (for str, int, float) or a pandas scalar (for Timestamp/Timedelta/Interval/Period) """ - if is_datetimelike(self.categories): - return [com._maybe_box_datetimelike(x) for x in self] - return np.array(self).tolist() + return list(self) @property def base(self): @@ -1257,7 +1256,7 @@ def isna(self): """ Detect missing values - Both missing values (-1 in .codes) and NA as a category are detected. + Missing values (-1 in .codes) are detected. Returns ------- @@ -1272,13 +1271,6 @@ def isna(self): """ ret = self._codes == -1 - - # String/object and float categories can hold np.nan - if self.categories.dtype.kind in ['S', 'O', 'f']: - if np.nan in self.categories: - nan_pos = np.where(isna(self.categories))[0] - # we only have one NA in categories - ret = np.logical_or(ret, self._codes == nan_pos) return ret isnull = isna @@ -1314,16 +1306,14 @@ def dropna(self): """ Return the Categorical without null values. - Both missing values (-1 in .codes) and NA as a category are detected. - NA is removed from the categories if present. + Missing values (-1 in .codes) are detected. Returns ------- valid : Categorical """ result = self[self.notna()] - if isna(result.categories).any(): - result = result.remove_categories([np.nan]) + return result def value_counts(self, dropna=True): @@ -1335,7 +1325,7 @@ def value_counts(self, dropna=True): Parameters ---------- dropna : boolean, default True - Don't include counts of NaN, even if NaN is a category. + Don't include counts of NaN. 
Returns ------- @@ -1347,11 +1337,9 @@ def value_counts(self, dropna=True): """ from numpy import bincount - from pandas import isna, Series, CategoricalIndex + from pandas import Series, CategoricalIndex - obj = (self.remove_categories([np.nan]) if dropna and - isna(self.categories).any() else self) - code, cat = obj._codes, obj.categories + code, cat = self._codes, self.categories ncat, mask = len(cat), 0 <= code ix, clean = np.arange(ncat), mask.all() @@ -1591,16 +1579,16 @@ def fillna(self, value=None, method=None, limit=None): Parameters ---------- - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - Method to use for filling holes in reindexed Series - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap value : scalar, dict, Series If a scalar value is passed it is used to fill all missing values. Alternatively, a Series or dict can be used to fill in different values for each index. The value should not be a list. The value(s) passed should either be in the categories or should be NaN. + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap limit : int, default None (Not implemented yet for Categorical!) If method is specified, this is the maximum number of consecutive @@ -1626,14 +1614,6 @@ def fillna(self, value=None, method=None, limit=None): values = self._codes - # Make sure that we also get NA in categories - if self.categories.dtype.kind in ['S', 'O', 'f']: - if np.nan in self.categories: - values = values.copy() - nan_pos = np.where(isna(self.categories))[0] - # we only have one NA in categories - values[values == nan_pos] = -1 - # pad / bfill if method is not None: @@ -1716,7 +1696,7 @@ def __len__(self): def __iter__(self): """Returns an Iterator over the values of this Categorical.""" - return iter(self.get_values()) + return iter(self.get_values().tolist()) def _tidy_repr(self, max_vals=10, footer=True): """ a short repr displaying only max_vals and an optional (but default @@ -1887,15 +1867,6 @@ def __setitem__(self, key, value): key = np.asarray(key) lindexer = self.categories.get_indexer(rvalue) - - # FIXME: the following can be removed after GH7820 is fixed: - # https://github.com/pandas-dev/pandas/issues/7820 - # float categories do currently return -1 for np.nan, even if np.nan is - # included in the index -> "repair" this here - if isna(rvalue).any() and isna(self.categories).any(): - nan_pos = np.where(isna(self.categories))[0] - lindexer[lindexer == -1] = nan_pos - lindexer = self._maybe_coerce_indexer(lindexer) self._codes[key] = lindexer @@ -2072,6 +2043,60 @@ def unique(self): take_codes = sorted(take_codes) return cat.set_categories(cat.categories.take(take_codes)) + def factorize(self, na_sentinel=-1): + """Encode the Categorical as an enumerated type. + + Parameters + ---------- + sort : boolean, default False + Sort by values + na_sentinel: int, default -1 + Value to mark "not found" + + Returns + ------- + labels : ndarray + An integer NumPy array that's an indexer into the original + Categorical + uniques : Categorical + A Categorical whose values are the unique values and + whose dtype matches the original CategoricalDtype. Note that if + there any unobserved categories in ``self`` will not be present + in ``uniques.values``. 
They will be present in + ``uniques.categories`` + + Examples + -------- + >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c']) + >>> labels, uniques = cat.factorize() + >>> labels + (array([0, 0, 1]), + >>> uniques + [a, c] + Categories (3, object): [a, b, c]) + + Missing values are handled + + >>> labels, uniques = pd.factorize(pd.Categorical(['a', 'b', None])) + >>> labels + array([ 0, 1, -1]) + >>> uniques + [a, b] + Categories (2, object): [a, b] + """ + from pandas.core.algorithms import _factorize_array + + codes = self.codes.astype('int64') + codes[codes == -1] = iNaT + # We set missing codes, normally -1, to iNaT so that the + # Int64HashTable treats them as missing values. + labels, uniques = _factorize_array(codes, check_nulls=True, + na_sentinel=na_sentinel) + uniques = self._constructor(self.categories.take(uniques), + categories=self.categories, + ordered=self.ordered) + return labels, uniques + def equals(self, other): """ Returns True if categorical arrays are equal. diff --git a/pandas/core/base.py b/pandas/core/base.py index 257b26b64e6421..f6869753664196 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -9,10 +9,10 @@ from pandas.core.dtypes.missing import isna from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass from pandas.core.dtypes.common import ( + is_datetimelike, is_object_dtype, is_list_like, is_scalar, - is_datetimelike, is_extension_type, is_extension_array_dtype) @@ -787,7 +787,36 @@ def empty(self): return not self.size def max(self): - """ The maximum value of the object """ + """ + Return the maximum value of the Index. + + Returns + ------- + scalar + Maximum value. + + See Also + -------- + Index.min : Return the minimum value in an Index. + Series.max : Return the maximum value in a Series. + DataFrame.max : Return the maximum values in a DataFrame. + + Examples + -------- + >>> idx = pd.Index([3, 2, 1]) + >>> idx.max() + 3 + + >>> idx = pd.Index(['c', 'b', 'a']) + >>> idx.max() + 'c' + + For a MultiIndex, the maximum is determined lexicographically. + + >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)]) + >>> idx.max() + ('b', 2) + """ return nanops.nanmax(self.values) def argmax(self, axis=None): @@ -801,7 +830,35 @@ def argmax(self, axis=None): return nanops.nanargmax(self.values) def min(self): - """ The minimum value of the object """ + """ + Return the minimum value of the Index. + + Returns + ------- + scalar + Minimum value. + + See Also + -------- + Index.max : Return the maximum value of the object. + Series.min : Return the minimum value in a Series. + DataFrame.min : Return the minimum values in a DataFrame. + + Examples + -------- + >>> idx = pd.Index([3, 2, 1]) + >>> idx.min() + 1 + + >>> idx = pd.Index(['c', 'b', 'a']) + >>> idx.min() + 'a' + + For a MultiIndex, the minimum is determined lexicographically. 
+ >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)]) + >>> idx.min() + ('a', 1) + """ return nanops.nanmin(self.values) def argmin(self, axis=None): @@ -826,9 +883,10 @@ def tolist(self): -------- numpy.ndarray.tolist """ - - if is_datetimelike(self): + if is_datetimelike(self._values): return [com._maybe_box_datetimelike(x) for x in self._values] + elif is_extension_array_dtype(self._values): + return list(self._values) else: return self._values.tolist() diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index d54d980d02ffac..6dbed5f138d5dd 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -5,26 +5,16 @@ from pandas.errors import AbstractMethodError -class ExtensionDtype(object): - """A custom data type, to be paired with an ExtensionArray. - - Notes - ----- - The interface includes the following abstract methods that must - be implemented by subclasses: - - * type - * name - * construct_from_string - - This class does not inherit from 'abc.ABCMeta' for performance reasons. - Methods and properties required by the interface raise - ``pandas.errors.AbstractMethodError`` and no ``register`` method is - provided for registering virtual subclasses. - """ - - def __str__(self): - return self.name +class _DtypeOpsMixin(object): + # Not all of pandas' extension dtypes are compatibile with + # the new ExtensionArray interface. This means PandasExtensionDtype + # can't subclass ExtensionDtype yet, as is_extension_array_dtype would + # incorrectly say that these types are extension types. + # + # In the interim, we put methods that are shared between the two base + # classes ExtensionDtype and PandasExtensionDtype here. Both those base + # classes will inherit from this Mixin. Once everything is compatible, this + # class's methods can be moved to ExtensionDtype and removed. def __eq__(self, other): """Check whether 'other' is equal to self. @@ -52,6 +42,74 @@ def __eq__(self, other): def __ne__(self, other): return not self.__eq__(other) + @property + def names(self): + # type: () -> Optional[List[str]] + """Ordered list of field names, or None if there are no fields. + + This is for compatibility with NumPy arrays, and may be removed in the + future. + """ + return None + + @classmethod + def is_dtype(cls, dtype): + """Check if we match 'dtype'. + + Parameters + ---------- + dtype : object + The object to check. + + Returns + ------- + is_dtype : bool + + Notes + ----- + The default implementation is True if + + 1. ``cls.construct_from_string(dtype)`` is an instance + of ``cls``. + 2. ``dtype`` is an object and is an instance of ``cls`` + 3. ``dtype`` has a ``dtype`` attribute, and any of the above + conditions is true for ``dtype.dtype``. + """ + dtype = getattr(dtype, 'dtype', dtype) + + if isinstance(dtype, np.dtype): + return False + elif dtype is None: + return False + elif isinstance(dtype, cls): + return True + try: + return cls.construct_from_string(dtype) is not None + except TypeError: + return False + + +class ExtensionDtype(_DtypeOpsMixin): + """A custom data type, to be paired with an ExtensionArray. + + Notes + ----- + The interface includes the following abstract methods that must + be implemented by subclasses: + + * type + * name + * construct_from_string + + This class does not inherit from 'abc.ABCMeta' for performance reasons. + Methods and properties required by the interface raise + ``pandas.errors.AbstractMethodError`` and no ``register`` method is + provided for registering virtual subclasses. 
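
    Examples
    --------
    A minimal sketch of a third-party dtype, assuming only the three
    abstract members listed above need to be filled in (``IPv4Dtype`` and
    its values are hypothetical, for illustration only):

    >>> class IPv4Dtype(ExtensionDtype):
    ...     type = int
    ...     name = 'ipv4'
    ...     @classmethod
    ...     def construct_from_string(cls, string):
    ...         if string == cls.name:
    ...             return cls()
    ...         raise TypeError("Cannot construct a 'IPv4Dtype' from"
    ...                         " '{}'".format(string))

    The default ``is_dtype`` implementation then recognises the dtype by
    its string name via ``construct_from_string``:

    >>> IPv4Dtype.is_dtype('ipv4')
    True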
+ """ + + def __str__(self): + return self.name + @property def type(self): # type: () -> type @@ -87,16 +145,6 @@ def name(self): """ raise AbstractMethodError(self) - @property - def names(self): - # type: () -> Optional[List[str]] - """Ordered list of field names, or None if there are no fields. - - This is for compatibility with NumPy arrays, and may be removed in the - future. - """ - return None - @classmethod def construct_from_string(cls, string): """Attempt to construct this type from a string. @@ -128,39 +176,3 @@ def construct_from_string(cls, string): ... "'{}'".format(cls, string)) """ raise AbstractMethodError(cls) - - @classmethod - def is_dtype(cls, dtype): - """Check if we match 'dtype'. - - Parameters - ---------- - dtype : object - The object to check. - - Returns - ------- - is_dtype : bool - - Notes - ----- - The default implementation is True if - - 1. ``cls.construct_from_string(dtype)`` is an instance - of ``cls``. - 2. ``dtype`` is an object and is an instance of ``cls`` - 3. ``dtype`` has a ``dtype`` attribute, and any of the above - conditions is true for ``dtype.dtype``. - """ - dtype = getattr(dtype, 'dtype', dtype) - - if isinstance(dtype, np.dtype): - return False - elif dtype is None: - return False - elif isinstance(dtype, cls): - return True - try: - return cls.construct_from_string(dtype) is not None - except TypeError: - return False diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b1d0dc2a2442e6..d19f19b7224a73 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -11,7 +11,8 @@ from .common import (_ensure_object, is_bool, is_integer, is_float, is_complex, is_datetimetz, is_categorical_dtype, is_datetimelike, - is_extension_type, is_object_dtype, + is_extension_type, + is_object_dtype, is_datetime64tz_dtype, is_datetime64_dtype, is_datetime64_ns_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype, @@ -26,7 +27,8 @@ _ensure_int32, _ensure_int64, _NS_DTYPE, _TD_DTYPE, _INT64_DTYPE, _POSSIBLY_CAST_DTYPES) -from .dtypes import ExtensionDtype, DatetimeTZDtype, PeriodDtype +from .dtypes import (ExtensionDtype, PandasExtensionDtype, DatetimeTZDtype, + PeriodDtype) from .generic import (ABCDatetimeIndex, ABCPeriodIndex, ABCSeries) from .missing import isna, notna @@ -1114,7 +1116,8 @@ def find_common_type(types): if all(is_dtype_equal(first, t) for t in types[1:]): return first - if any(isinstance(t, ExtensionDtype) for t in types): + if any(isinstance(t, (PandasExtensionDtype, ExtensionDtype)) + for t in types): return np.object # take lowest unit diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 197b35de888962..3a90feb7ccd7d8 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -9,7 +9,7 @@ DatetimeTZDtype, DatetimeTZDtypeType, PeriodDtype, PeriodDtypeType, IntervalDtype, IntervalDtypeType, - ExtensionDtype) + ExtensionDtype, PandasExtensionDtype) from .generic import (ABCCategorical, ABCPeriodIndex, ABCDatetimeIndex, ABCSeries, ABCSparseArray, ABCSparseSeries, ABCCategoricalIndex, @@ -2006,7 +2006,7 @@ def pandas_dtype(dtype): return CategoricalDtype.construct_from_string(dtype) except TypeError: pass - elif isinstance(dtype, ExtensionDtype): + elif isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)): return dtype try: diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index d262a71933915d..708f54f5ca75ba 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -5,10 +5,10 @@ from pandas import compat 
from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex -from .base import ExtensionDtype +from .base import ExtensionDtype, _DtypeOpsMixin -class PandasExtensionDtype(ExtensionDtype): +class PandasExtensionDtype(_DtypeOpsMixin): """ A np.dtype duck-typed class, suitable for holding a custom dtype. @@ -83,7 +83,7 @@ class CategoricalDtypeType(type): pass -class CategoricalDtype(PandasExtensionDtype): +class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): """ Type for categorical data with the categories and orderedness diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 6fed25a0012f2d..a02f0c5b2a4d6b 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -28,20 +28,37 @@ def is_number(obj): """ Check if the object is a number. + Returns True when the object is a number, and False if is not. + Parameters ---------- - obj : The object to check. + obj : any type + The object to check if is a number. Returns ------- is_number : bool Whether `obj` is a number or not. + See Also + -------- + pandas.api.types.is_integer: checks a subgroup of numbers + Examples -------- - >>> is_number(1) + >>> pd.api.types.is_number(1) + True + >>> pd.api.types.is_number(7.15) True - >>> is_number("foo") + + Booleans are valid because they are int subclass. + + >>> pd.api.types.is_number(False) + True + + >>> pd.api.types.is_number("foo") + False + >>> pd.api.types.is_number("5") False """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 687705640a4670..efb002474f876a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1970,33 +1970,132 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None): """ - Concise summary of a DataFrame. + Print a concise summary of a DataFrame. + + This method prints information about a DataFrame including + the index dtype and column dtypes, non-null values and memory usage. Parameters ---------- - verbose : {None, True, False}, optional - Whether to print the full summary. - None follows the `display.max_info_columns` setting. - True or False overrides the `display.max_info_columns` setting. + verbose : bool, optional + Whether to print the full summary. By default, the setting in + ``pandas.options.display.max_info_columns`` is followed. buf : writable buffer, defaults to sys.stdout - max_cols : int, default None - Determines whether full summary or short summary is printed. - None follows the `display.max_info_columns` setting. - memory_usage : boolean/string, default None + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + max_cols : int, optional + When to switch from the verbose to the truncated output. If the + DataFrame has more than `max_cols` columns, the truncated output + is used. By default, the setting in + ``pandas.options.display.max_info_columns`` is used. + memory_usage : bool, str, optional Specifies whether total memory usage of the DataFrame - elements (including index) should be displayed. None follows - the `display.memory_usage` setting. True or False overrides - the `display.memory_usage` setting. A value of 'deep' is equivalent - of True, with deep introspection. Memory usage is shown in - human-readable units (base-2 representation). 
- null_counts : boolean, default None - Whether to show the non-null counts + elements (including the index) should be displayed. By default, + this follows the ``pandas.options.display.memory_usage`` setting. + + True always show memory usage. False never shows memory usage. + A value of 'deep' is equivalent to "True with deep introspection". + Memory usage is shown in human-readable units (base-2 + representation). Without deep introspection a memory estimation is + made based in column dtype and number of rows assuming values + consume the same memory amount for corresponding dtypes. With deep + memory introspection, a real memory usage calculation is performed + at the cost of computational resources. + null_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the frame is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. - - If None, then only show if the frame is smaller than - max_info_rows and max_info_columns. - - If True, always show counts. - - If False, never show counts. + Returns + ------- + None + This method prints a summary of a DataFrame and returns None. + See Also + -------- + DataFrame.describe: Generate descriptive statistics of DataFrame + columns. + DataFrame.memory_usage: Memory usage of DataFrame columns. + + Examples + -------- + >>> int_values = [1, 2, 3, 4, 5] + >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] + >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] + >>> df = pd.DataFrame({"int_col": int_values, "text_col": text_values, + ... "float_col": float_values}) + >>> df + int_col text_col float_col + 0 1 alpha 0.00 + 1 2 beta 0.25 + 2 3 gamma 0.50 + 3 4 delta 0.75 + 4 5 epsilon 1.00 + + Prints information of all columns: + + >>> df.info(verbose=True) + + RangeIndex: 5 entries, 0 to 4 + Data columns (total 3 columns): + int_col 5 non-null int64 + text_col 5 non-null object + float_col 5 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 200.0+ bytes + + Prints a summary of columns count and its dtypes but not per column + information: + + >>> df.info(verbose=False) + + RangeIndex: 5 entries, 0 to 4 + Columns: 3 entries, int_col to float_col + dtypes: float64(1), int64(1), object(1) + memory usage: 200.0+ bytes + + Pipe output of DataFrame.info to buffer instead of sys.stdout, get + buffer content and writes to a text file: + + >>> import io + >>> buffer = io.StringIO() + >>> df.info(buf=buffer) + >>> s = buffer.getvalue() + >>> with open("df_info.txt", "w", encoding="utf-8") as f: + ... f.write(s) + 260 + + The `memory_usage` parameter allows deep introspection mode, specially + useful for big DataFrames and fine-tune memory optimization: + + >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> df = pd.DataFrame({ + ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) + ... 
}) + >>> df.info() + + RangeIndex: 1000000 entries, 0 to 999999 + Data columns (total 3 columns): + column_1 1000000 non-null object + column_2 1000000 non-null object + column_3 1000000 non-null object + dtypes: object(3) + memory usage: 22.9+ MB + + >>> df.info(memory_usage='deep') + + RangeIndex: 1000000 entries, 0 to 999999 + Data columns (total 3 columns): + column_1 1000000 non-null object + column_2 1000000 non-null object + column_3 1000000 non-null object + dtypes: object(3) + memory usage: 188.8 MB """ if buf is None: # pragma: no cover @@ -2005,7 +2104,7 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, lines = [] lines.append(str(type(self))) - lines.append(self.index.summary()) + lines.append(self.index._summary()) if len(self.columns) == 0: lines.append('Empty %s' % type(self).__name__) @@ -2099,32 +2198,88 @@ def _sizeof_fmt(num, size_qualifier): fmt.buffer_put_lines(buf, lines) def memory_usage(self, index=True, deep=False): - """Memory usage of DataFrame columns. + """ + Return the memory usage of each column in bytes. + + The memory usage can optionally include the contribution of + the index and elements of `object` dtype. + + This value is displayed in `DataFrame.info` by default. This can be + suppressed by setting ``pandas.options.display.memory_usage`` to False. Parameters ---------- - index : bool - Specifies whether to include memory usage of DataFrame's - index in returned Series. If `index=True` (default is False) - the first index of the Series is `Index`. - deep : bool - Introspect the data deeply, interrogate - `object` dtypes for system-level memory consumption + index : bool, default True + Specifies whether to include the memory usage of the DataFrame's + index in returned Series. If ``index=True`` the memory usage of the + index the first item in the output. + deep : bool, default False + If True, introspect the data deeply by interrogating + `object` dtypes for system-level memory consumption, and include + it in the returned values. Returns ------- sizes : Series - A series with column names as index and memory usage of - columns with units of bytes. - - Notes - ----- - Memory usage does not include memory consumed by elements that - are not components of the array if deep=False + A Series whose index is the original column names and whose values + is the memory usage of each column in bytes. See Also -------- - numpy.ndarray.nbytes + numpy.ndarray.nbytes : Total bytes consumed by the elements of an + ndarray. + Series.memory_usage : Bytes consumed by a Series. + pandas.Categorical : Memory-efficient array for string values with + many repeated values. + DataFrame.info : Concise summary of a DataFrame. + + Examples + -------- + >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] + >>> data = dict([(t, np.ones(shape=5000).astype(t)) + ... 
for t in dtypes]) + >>> df = pd.DataFrame(data) + >>> df.head() + int64 float64 complex128 object bool + 0 1 1.0 (1+0j) 1 True + 1 1 1.0 (1+0j) 1 True + 2 1 1.0 (1+0j) 1 True + 3 1 1.0 (1+0j) 1 True + 4 1 1.0 (1+0j) 1 True + + >>> df.memory_usage() + Index 80 + int64 40000 + float64 40000 + complex128 80000 + object 40000 + bool 5000 + dtype: int64 + + >>> df.memory_usage(index=False) + int64 40000 + float64 40000 + complex128 80000 + object 40000 + bool 5000 + dtype: int64 + + The memory footprint of `object` dtype columns is ignored by default: + + >>> df.memory_usage(deep=True) + Index 80 + int64 40000 + float64 40000 + complex128 80000 + object 160000 + bool 5000 + dtype: int64 + + Use a Categorical for efficient storage of an object-dtype column with + many repeated values. + + >>> df['object'].astype('category').memory_usage(deep=True) + 5168 """ result = Series([c.memory_usage(index=False, deep=deep) for col, c in self.iteritems()], index=self.columns) @@ -3829,71 +3984,101 @@ def notnull(self): def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False): """ - Return object with labels on given axis omitted where alternately any - or all of the data are missing + Remove missing values. + + See the :ref:`User Guide ` for more on which values are + considered missing, and how to work with missing data. Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, or tuple/list thereof - Pass tuple or list to drop on multiple axes - how : {'any', 'all'} - * any : if any NA values are present, drop that label - * all : if all values are NA, drop that label - thresh : int, default None - int value : require that many non-NA values - subset : array-like + Determine if rows or columns which contain missing values are + removed. + + * 0, or 'index' : Drop rows which contain missing values. + * 1, or 'columns' : Drop columns which contain missing value. + + Pass tuple or list to drop on multiple axes. + how : {'any', 'all'}, default 'any' + Determine if row or column is removed from DataFrame, when we have + at least one NA or all NA. + + * 'any' : If any NA values are present, drop that row or column. + * 'all' : If all values are NA, drop that row or column. + thresh : int, optional + Require that many non-NA values. + subset : array-like, optional Labels along other axis to consider, e.g. if you are dropping rows - these would be a list of columns to include - inplace : boolean, default False + these would be a list of columns to include. + inplace : bool, default False If True, do operation inplace and return None. Returns ------- - dropped : DataFrame + DataFrame + DataFrame with NA entries dropped from it. + + See Also + -------- + DataFrame.isna: Indicate missing values. + DataFrame.notna : Indicate existing (non-missing) values. + DataFrame.fillna : Replace missing values. + Series.dropna : Drop missing values. + Index.dropna : Drop missing indices. Examples -------- - >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1], - ... [np.nan, np.nan, np.nan, 5]], - ... columns=list('ABCD')) + >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], + ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], + ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), + ... pd.NaT]}) >>> df - A B C D - 0 NaN 2.0 NaN 0 - 1 3.0 4.0 NaN 1 - 2 NaN NaN NaN 5 + name toy born + 0 Alfred NaN NaT + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT - Drop the columns where all elements are nan: + Drop the rows where at least one element is missing. 
- >>> df.dropna(axis=1, how='all') - A B D - 0 NaN 2.0 0 - 1 3.0 4.0 1 - 2 NaN NaN 5 + >>> df.dropna() + name toy born + 1 Batman Batmobile 1940-04-25 - Drop the columns where any of the elements is nan + Drop the columns where at least one element is missing. - >>> df.dropna(axis=1, how='any') - D - 0 0 - 1 1 - 2 5 + >>> df.dropna(axis='columns') + name + 0 Alfred + 1 Batman + 2 Catwoman - Drop the rows where all of the elements are nan - (there is no row to drop, so df stays the same): + Drop the rows where all elements are missing. - >>> df.dropna(axis=0, how='all') - A B C D - 0 NaN 2.0 NaN 0 - 1 3.0 4.0 NaN 1 - 2 NaN NaN NaN 5 + >>> df.dropna(how='all') + name toy born + 0 Alfred NaN NaT + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT - Keep only the rows with at least 2 non-na values: + Keep only the rows with at least 2 non-NA values. >>> df.dropna(thresh=2) - A B C D - 0 NaN 2.0 NaN 0 - 1 3.0 4.0 NaN 1 + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Define in which columns to look for missing values. + + >>> df.dropna(subset=['name', 'born']) + name toy born + 1 Batman Batmobile 1940-04-25 + Keep the DataFrame with valid entries in the same variable. + + >>> df.dropna(inplace=True) + >>> df + name toy born + 1 Batman Batmobile 1940-04-25 """ inplace = validate_bool_kwarg(inplace, 'inplace') if isinstance(axis, (tuple, list)): @@ -4163,34 +4348,93 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, inplace=inplace, sort_remaining=sort_remaining) def nlargest(self, n, columns, keep='first'): - """Get the rows of a DataFrame sorted by the `n` largest - values of `columns`. + """ + Return the first `n` rows ordered by `columns` in descending order. + + Return the first `n` rows with the largest values in `columns`, in + descending order. The columns that are not specified are returned as + well, but not used for ordering. + + This method is equivalent to + ``df.sort_values(columns, ascending=False).head(n)``, but more + performant. Parameters ---------- n : int - Number of items to retrieve - columns : list or str - Column name or names to order by + Number of rows to return. + columns : label or list of labels + Column label(s) to order by. keep : {'first', 'last'}, default 'first' Where there are duplicate values: - - ``first`` : take the first occurrence. - - ``last`` : take the last occurrence. + + - `first` : prioritize the first occurrence(s) + - `last` : prioritize the last occurrence(s) Returns ------- DataFrame + The first `n` rows ordered by the given columns in descending + order. + + See Also + -------- + DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in + ascending order. + DataFrame.sort_values : Sort DataFrame by the values + DataFrame.head : Return the first `n` rows without re-ordering. + + Notes + ----- + This function cannot be used with all column types. For example, when + specifying columns with `object` or `category` dtypes, ``TypeError`` is + raised. Examples -------- - >>> df = pd.DataFrame({'a': [1, 10, 8, 11, -1], + >>> df = pd.DataFrame({'a': [1, 10, 8, 10, -1], ... 'b': list('abdce'), ... 'c': [1.0, 2.0, np.nan, 3.0, 4.0]}) + >>> df + a b c + 0 1 a 1.0 + 1 10 b 2.0 + 2 8 d NaN + 3 10 c 3.0 + 4 -1 e 4.0 + + In the following example, we will use ``nlargest`` to select the three + rows having the largest values in column "a". 
+ >>> df.nlargest(3, 'a') - a b c - 3 11 c 3 - 1 10 b 2 - 2 8 d NaN + a b c + 1 10 b 2.0 + 3 10 c 3.0 + 2 8 d NaN + + When using ``keep='last'``, ties are resolved in reverse order: + + >>> df.nlargest(3, 'a', keep='last') + a b c + 3 10 c 3.0 + 1 10 b 2.0 + 2 8 d NaN + + To order by the largest values in column "a" and then "c", we can + specify multiple columns like in the next example. + + >>> df.nlargest(3, ['a', 'c']) + a b c + 3 10 c 3.0 + 1 10 b 2.0 + 2 8 d NaN + + Attempting to use ``nlargest`` on non-numeric dtypes will raise a + ``TypeError``: + + >>> df.nlargest(3, 'b') + Traceback (most recent call last): + TypeError: Column 'b' has dtype object, cannot use method 'nlargest' """ return algorithms.SelectNFrame(self, n=n, @@ -5477,39 +5721,52 @@ def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, def applymap(self, func): """ - Apply a function to a DataFrame that is intended to operate - elementwise, i.e. like doing map(func, series) for each series in the - DataFrame + Apply a function to a Dataframe elementwise. + + This method applies a function that accepts and returns a scalar + to every element of a DataFrame. Parameters ---------- - func : function - Python function, returns a single value from a single value - - Examples - -------- - - >>> df = pd.DataFrame(np.random.randn(3, 3)) - >>> df - 0 1 2 - 0 -0.029638 1.081563 1.280300 - 1 0.647747 0.831136 -1.549481 - 2 0.513416 -0.884417 0.195343 - >>> df = df.applymap(lambda x: '%.2f' % x) - >>> df - 0 1 2 - 0 -0.03 1.08 1.28 - 1 0.65 0.83 -1.55 - 2 0.51 -0.88 0.20 + func : callable + Python function, returns a single value from a single value. Returns ------- - applied : DataFrame + DataFrame + Transformed DataFrame. See also -------- - DataFrame.apply : For operations on rows/columns + DataFrame.apply : Apply a function along input axis of DataFrame + + Examples + -------- + >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) + >>> df + 0 1 + 0 1.000 2.120 + 1 3.356 4.567 + + >>> df.applymap(lambda x: len(str(x))) + 0 1 + 0 3 4 + 1 5 5 + + Note that a vectorized version of `func` often exists, which will + be much faster. You could square each number elementwise. + + >>> df.applymap(lambda x: x**2) + 0 1 + 0 1.000000 4.494400 + 1 11.262736 20.857489 + + But it's better to avoid applymap in that case. + >>> df ** 2 + 0 1 + 0 1.000000 4.494400 + 1 11.262736 20.857489 """ # if we have a dtype == 'M8[ns]', provide boxed values diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e4feaf9e1fb0c0..5682ad411fd2fd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -524,24 +524,37 @@ def _expand_axes(self, key): return new_axes - _shared_docs['set_axis'] = """Assign desired index to given axis + def set_axis(self, labels, axis=0, inplace=None): + """ + Assign desired index to given axis. + + Indexes for column or row labels can be changed by assigning + a list-like or Index. + + .. versionchanged:: 0.21.0 + + The signature is now `labels` and `axis`, consistent with + the rest of pandas API. Previously, the `axis` and `labels` + arguments were respectively the first and second positional + arguments. Parameters ---------- - labels: list-like or Index - The values for the new index - axis : int or string, default 0 + labels : list-like, Index + The values for the new index. + + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to update. The value 0 identifies the rows, and 1 + identifies the columns. + inplace : boolean, default None Whether to return a new %(klass)s instance. 
- WARNING: inplace=None currently falls back to to True, but - in a future version, will default to False. Use inplace=True - explicitly rather than relying on the default. + .. warning:: - .. versionadded:: 0.21.0 - The signature is make consistent to the rest of the API. - Previously, the "axis" and "labels" arguments were respectively - the first and second positional arguments. + ``inplace=None`` currently falls back to to True, but in a + future version, will default to False. Use inplace=True + explicitly rather than relying on the default. Returns ------- @@ -550,43 +563,62 @@ def _expand_axes(self, key): See Also -------- - pandas.NDFrame.rename + pandas.DataFrame.rename_axis : Alter the name of the index or columns. Examples -------- + **Series** + >>> s = pd.Series([1, 2, 3]) >>> s 0 1 1 2 2 3 dtype: int64 + >>> s.set_axis(['a', 'b', 'c'], axis=0, inplace=False) a 1 b 2 c 3 dtype: int64 + + The original object is not modified. + + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + + **DataFrame** + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - >>> df.set_axis(['a', 'b', 'c'], axis=0, inplace=False) + + Change the row labels. + + >>> df.set_axis(['a', 'b', 'c'], axis='index', inplace=False) A B a 1 4 b 2 5 c 3 6 - >>> df.set_axis(['I', 'II'], axis=1, inplace=False) + + Change the column labels. + + >>> df.set_axis(['I', 'II'], axis='columns', inplace=False) I II 0 1 4 1 2 5 2 3 6 - >>> df.set_axis(['i', 'ii'], axis=1, inplace=True) + + Now, update the labels inplace. + + >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True) >>> df i ii 0 1 4 1 2 5 2 3 6 - """ - - @Appender(_shared_docs['set_axis'] % dict(klass='NDFrame')) - def set_axis(self, labels, axis=0, inplace=None): if is_scalar(labels): warnings.warn( 'set_axis now takes "labels" as first argument, and ' @@ -1489,12 +1521,20 @@ def __contains__(self, key): @property def empty(self): - """True if NDFrame is entirely empty [no items], meaning any of the + """ + Indicator whether DataFrame is empty. + + True if DataFrame is entirely empty (no items), meaning any of the axes are of length 0. + Returns + ------- + bool + If DataFrame is empty, return True, if not return False. + Notes ----- - If NDFrame contains only NaNs, it is still not considered empty. See + If DataFrame contains only NaNs, it is still not considered empty. See the example below. Examples @@ -3951,7 +3991,9 @@ def tail(self, n=5): def sample(self, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None): """ - Returns a random sample of items from an axis of object. + Return a random sample of items from an axis of object. + + You can use `random_state` for reproducibility. 
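As an illustrative aside (a minimal sketch, not taken from the patch itself), the reproducibility point above can be shown in a few lines; the frame and the seed value below are made up for the example::

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'A': np.arange(10), 'B': np.arange(10) * 2})

    # Passing the same random_state twice selects the same rows.
    first = df.sample(n=3, random_state=42)
    second = df.sample(n=3, random_state=42)
    assert first.equals(second)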
Parameters ---------- @@ -3988,7 +4030,6 @@ def sample(self, n=None, frac=None, replace=False, weights=None, Examples -------- - Generate an example ``Series`` and ``DataFrame``: >>> s = pd.Series(np.random.randn(50)) @@ -4027,6 +4068,16 @@ def sample(self, n=None, frac=None, replace=False, weights=None, 40 0.823173 -0.078816 1.009536 1.015108 15 1.421154 -0.055301 -1.922594 -0.019696 6 -0.148339 0.832938 1.787600 -1.383767 + + You can use `random state` for reproducibility: + + >>> df.sample(random_state=1) + A B C D + 37 -2.027662 0.103611 0.237496 -0.165867 + 43 -0.259323 -0.583426 1.516140 -0.479118 + 12 -1.686325 -0.579510 0.985195 -0.460286 + 8 1.167946 0.429082 1.215742 -1.636041 + 9 1.197475 -0.864188 1.554031 -1.505264 """ if axis is None: @@ -6099,53 +6150,79 @@ def clip(self, lower=None, upper=None, axis=None, inplace=False, """ Trim values at input threshold(s). + Assigns values outside boundary to boundary values. Thresholds + can be singular values or array like, and in the latter case + the clipping is performed element-wise in the specified axis. + Parameters ---------- lower : float or array_like, default None + Minimum threshold value. All values below this + threshold will be set to it. upper : float or array_like, default None + Maximum threshold value. All values above this + threshold will be set to it. axis : int or string axis name, optional Align object with lower and upper along the given axis. inplace : boolean, default False - Whether to perform the operation in place on the data - .. versionadded:: 0.21.0 + Whether to perform the operation in place on the data. + + .. versionadded:: 0.21.0 + *args, **kwargs + Additional keywords have no effect but might be accepted + for compatibility with numpy. + + See Also + -------- + clip_lower : Clip values below specified threshold(s). + clip_upper : Clip values above specified threshold(s). Returns ------- - clipped : Series + Series or DataFrame + Same type as calling object with the values outside the + clip boundaries replaced Examples -------- + >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]} + >>> df = pd.DataFrame(data) >>> df - 0 1 - 0 0.335232 -1.256177 - 1 -1.367855 0.746646 - 2 0.027753 -1.176076 - 3 0.230930 -0.679613 - 4 1.261967 0.570967 - - >>> df.clip(-1.0, 0.5) - 0 1 - 0 0.335232 -1.000000 - 1 -1.000000 0.500000 - 2 0.027753 -1.000000 - 3 0.230930 -0.679613 - 4 0.500000 0.500000 - + col_0 col_1 + 0 9 -2 + 1 -3 -7 + 2 0 6 + 3 -1 8 + 4 5 -5 + + Clips per column using lower and upper thresholds: + + >>> df.clip(-4, 6) + col_0 col_1 + 0 6 -2 + 1 -3 -4 + 2 0 6 + 3 -1 6 + 4 5 -4 + + Clips using specific lower and upper thresholds per column element: + + >>> t = pd.Series([2, -4, -1, 6, 3]) >>> t - 0 -0.3 - 1 -0.2 - 2 -0.1 - 3 0.0 - 4 0.1 - dtype: float64 + 0 2 + 1 -4 + 2 -1 + 3 6 + 4 3 + dtype: int64 - >>> df.clip(t, t + 1, axis=0) - 0 1 - 0 0.335232 -0.300000 - 1 -0.200000 0.746646 - 2 0.027753 -0.100000 - 3 0.230930 0.000000 - 4 1.100000 0.570967 + >>> df.clip(t, t + 4, axis=0) + col_0 col_1 + 0 6 2 + 1 -3 -4 + 2 0 3 + 3 6 8 + 4 5 3 """ if isinstance(self, ABCPanel): raise NotImplementedError("clip is not supported yet for panels") @@ -6195,7 +6272,8 @@ def clip_upper(self, threshold, axis=None, inplace=False): Align object with threshold along the given axis. inplace : boolean, default False Whether to perform the operation in place on the data - .. versionadded:: 0.21.0 + + .. 
versionadded:: 0.21.0

         See Also
         --------
@@ -6210,24 +6288,104 @@ def clip_upper(self, threshold, axis=None, inplace=False):
 
     def clip_lower(self, threshold, axis=None, inplace=False):
         """
-        Return copy of the input with values below given value(s) truncated.
+        Return copy of the input with values below a threshold truncated.
 
         Parameters
         ----------
-        threshold : float or array_like
-        axis : int or string axis name, optional
-            Align object with threshold along the given axis.
+        threshold : numeric or array-like
+            Minimum value allowed. All values below threshold will be set to
+            this value.
+
+            * float : every value is compared to `threshold`.
+            * array-like : The shape of `threshold` should match the object
+              it's compared to. When `self` is a Series, `threshold` should be
+              the same length. When `self` is a DataFrame, `threshold` should
+              be 2-D and the same shape as `self` for ``axis=None``, or 1-D
+              and the same length as the axis being compared.
+
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Align `self` with `threshold` along the given axis.
+
         inplace : boolean, default False
-            Whether to perform the operation in place on the data
-            .. versionadded:: 0.21.0
+            Whether to perform the operation in place on the data.
+
+            .. versionadded:: 0.21.0
 
         See Also
         --------
-        clip
+        Series.clip : Return copy of input with values below and above
+            thresholds truncated.
+        Series.clip_upper : Return copy of input with values above
+            threshold truncated.
 
         Returns
         -------
         clipped : same type as input
+
+        Examples
+        --------
+        Series single threshold clipping:
+
+        >>> s = pd.Series([5, 6, 7, 8, 9])
+        >>> s.clip_lower(8)
+        0    8
+        1    8
+        2    8
+        3    8
+        4    9
+        dtype: int64
+
+        Series clipping element-wise using an array of thresholds. `threshold`
+        should be the same length as the Series.
+
+        >>> elemwise_thresholds = [4, 8, 7, 2, 5]
+        >>> s.clip_lower(elemwise_thresholds)
+        0    5
+        1    8
+        2    7
+        3    8
+        4    9
+        dtype: int64
+
+        DataFrames can be compared to a scalar.
+
+        >>> df = pd.DataFrame({"A": [1, 3, 5], "B": [2, 4, 6]})
+        >>> df
+           A  B
+        0  1  2
+        1  3  4
+        2  5  6
+
+        >>> df.clip_lower(3)
+           A  B
+        0  3  3
+        1  3  4
+        2  5  6
+
+        Or to an array of values. By default, `threshold` should be the same
+        shape as the DataFrame.
+
+        >>> df.clip_lower(np.array([[3, 4], [2, 2], [6, 2]]))
+           A  B
+        0  3  4
+        1  3  4
+        2  6  6
+
+        Control how `threshold` is broadcast with `axis`. In this case
+        `threshold` should be the same length as the axis specified by
+        `axis`.
+
+        >>> df.clip_lower(np.array([3, 3, 5]), axis='index')
+           A  B
+        0  3  3
+        1  3  4
+        2  5  6
+
+        >>> df.clip_lower(np.array([4, 5]), axis='columns')
+           A  B
+        0  4  5
+        1  4  5
+        2  5  6
         """
         return self._clip_with_one_bound(threshold, method=self.ge,
                                          axis=axis, inplace=inplace)
@@ -7439,29 +7597,37 @@ def tshift(self, periods=1, freq=None, axis=0):
 
     def truncate(self, before=None, after=None, axis=None, copy=True):
         """
-        Truncates a sorted DataFrame/Series before and/or after some
-        particular index value. If the axis contains only datetime values,
-        before/after parameters are converted to datetime values.
+        Truncate a Series or DataFrame before and after some index value.
+
+        This is a useful shorthand for boolean indexing based on index
+        values above or below certain thresholds.
 
         Parameters
         ----------
         before : date, string, int
-            Truncate all rows before this index value.
after : date, string, int - Truncate all rows after this index value - axis : {0 or 'index', 1 or 'columns'} - - * 0 or 'index': apply truncation to rows - * 1 or 'columns': apply truncation to columns - - Default is stat axis for given data type (0 for Series and - DataFrames, 1 for Panels) + Truncate all rows after this index value. + axis : {0 or 'index', 1 or 'columns'}, optional + Axis to truncate. Truncates the index (rows) by default. copy : boolean, default is True, - return a copy of the truncated section + Return a copy of the truncated section. Returns ------- - truncated : type of caller + type of caller + The truncated Series or DataFrame. + + See Also + -------- + DataFrame.loc : Select a subset of a DataFrame by label. + DataFrame.iloc : Select a subset of a DataFrame by position. + + Notes + ----- + If the index being truncated contains only datetime values, + `before` and `after` may be specified as strings instead of + Timestamps. Examples -------- @@ -7469,28 +7635,63 @@ def truncate(self, before=None, after=None, axis=None, copy=True): ... 'B': ['f', 'g', 'h', 'i', 'j'], ... 'C': ['k', 'l', 'm', 'n', 'o']}, ... index=[1, 2, 3, 4, 5]) + >>> df + A B C + 1 a f k + 2 b g l + 3 c h m + 4 d i n + 5 e j o + >>> df.truncate(before=2, after=4) A B C 2 b g l 3 c h m 4 d i n - >>> df = pd.DataFrame({'A': [1, 2, 3, 4, 5], - ... 'B': [6, 7, 8, 9, 10], - ... 'C': [11, 12, 13, 14, 15]}, - ... index=['a', 'b', 'c', 'd', 'e']) - >>> df.truncate(before='b', after='d') - A B C - b 2 7 12 - c 3 8 13 - d 4 9 14 - The index values in ``truncate`` can be datetimes or string - dates. Note that ``truncate`` assumes a 0 value for any unspecified - date component in a ``DatetimeIndex`` in contrast to slicing which - returns any partially matching dates. + The columns of a DataFrame can be truncated. + + >>> df.truncate(before="A", after="B", axis="columns") + A B + 1 a f + 2 b g + 3 c h + 4 d i + 5 e j + + For Series, only rows can be truncated. + >>> df['A'].truncate(before=2, after=4) + 2 b + 3 c + 4 d + Name: A, dtype: object + + The index values in ``truncate`` can be datetimes or string + dates. >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s') >>> df = pd.DataFrame(index=dates, data={'A': 1}) + >>> df.tail() + A + 2016-01-31 23:59:56 1 + 2016-01-31 23:59:57 1 + 2016-01-31 23:59:58 1 + 2016-01-31 23:59:59 1 + 2016-02-01 00:00:00 1 + + >>> df.truncate(before=pd.Timestamp('2016-01-05'), + ... after=pd.Timestamp('2016-01-10')).tail() + A + 2016-01-09 23:59:56 1 + 2016-01-09 23:59:57 1 + 2016-01-09 23:59:58 1 + 2016-01-09 23:59:59 1 + 2016-01-10 00:00:00 1 + + Because the index is a DatetimeIndex containing only dates, we can + specify `before` and `after` as strings. They will be coerced to + Timestamps before truncation. + >>> df.truncate('2016-01-05', '2016-01-10').tail() A 2016-01-09 23:59:56 1 @@ -7498,6 +7699,11 @@ def truncate(self, before=None, after=None, axis=None, copy=True): 2016-01-09 23:59:58 1 2016-01-09 23:59:59 1 2016-01-10 00:00:00 1 + + Note that ``truncate`` assumes a 0 value for any unspecified time + component (midnight). This differs from partial string slicing, which + returns any partially matching dates. 
+        >>> df.loc['2016-01-05':'2016-01-10', :].tail()
                              A
         2016-01-10 23:59:55  1
diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py
index c27754d57d82b6..225ccbf5907796 100644
--- a/pandas/core/indexes/accessors.py
+++ b/pandas/core/indexes/accessors.py
@@ -200,6 +200,39 @@ class TimedeltaProperties(Properties):
     """
 
     def to_pytimedelta(self):
+        """
+        Return an array of native `datetime.timedelta` objects.
+
+        Python's standard `datetime` library uses a different representation
+        of timedeltas. This method converts a Series of pandas Timedeltas
+        to `datetime.timedelta` format with the same length as the original
+        Series.
+
+        Returns
+        -------
+        a : numpy.ndarray
+            1D array containing data with `datetime.timedelta` type.
+
+        Examples
+        --------
+        >>> s = pd.Series(pd.to_timedelta(np.arange(5), unit='d'))
+        >>> s
+        0   0 days
+        1   1 days
+        2   2 days
+        3   3 days
+        4   4 days
+        dtype: timedelta64[ns]
+
+        >>> s.dt.to_pytimedelta()
+        array([datetime.timedelta(0), datetime.timedelta(1),
+               datetime.timedelta(2), datetime.timedelta(3),
+               datetime.timedelta(4)], dtype=object)
+
+        See Also
+        --------
+        datetime.timedelta
+        """
         return self._get_values().to_pytimedelta()
 
     @property
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index d5daece62cba86..17c20b12c32f89 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -457,7 +457,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
         Must be careful not to recurse.
         """
         if not hasattr(values, 'dtype'):
-            if values is None and dtype is not None:
+            if (values is None or not len(values)) and dtype is not None:
                 values = np.empty(0, dtype=dtype)
             else:
                 values = np.array(values, copy=False)
@@ -491,6 +491,8 @@ def _shallow_copy(self, values=None, **kwargs):
             values = self.values
         attributes = self._get_attributes_dict()
         attributes.update(kwargs)
+        if not len(values) and 'dtype' not in kwargs:
+            attributes['dtype'] = self.dtype
         return self._simple_new(values, **attributes)
 
     def _shallow_copy_with_infer(self, values=None, **kwargs):
@@ -511,6 +513,8 @@ def _shallow_copy_with_infer(self, values=None, **kwargs):
         attributes = self._get_attributes_dict()
         attributes.update(kwargs)
         attributes['copy'] = False
+        if not len(values) and 'dtype' not in kwargs:
+            attributes['dtype'] = self.dtype
         if self._infer_as_myclass:
             try:
                 return self._constructor(values, **attributes)
@@ -1190,6 +1194,11 @@ def to_frame(self, index=True):
         DataFrame
             DataFrame containing the original Index data.
 
+        See Also
+        --------
+        Index.to_series : Convert an Index to a Series.
+        Series.to_frame : Convert Series to DataFrame.
+
         Examples
         --------
         >>> idx = pd.Index(['Ant', 'Bear', 'Cow'], name='animal')
@@ -1384,7 +1393,19 @@ def _has_complex_internals(self):
         # to disable groupby tricks in MultiIndex
         return False
 
-    def summary(self, name=None):
+    def _summary(self, name=None):
+        """
+        Return a summarized representation
+
+        Parameters
+        ----------
+        name : str
+            name to use in the summary representation
+
+        Returns
+        -------
+        String with a summarized representation of the index
+        """
         if len(self) > 0:
             head = self[0]
             if (hasattr(head, 'format') and
@@ -1403,6 +1424,15 @@ def summary(self, name=None):
             name = type(self).__name__
         return '%s: %s entries%s' % (name, len(self), index_summary)
 
+    def summary(self, name=None):
+        """
+        Return a summarized representation
+        ..
deprecated:: 0.23.0 + """ + warnings.warn("'summary' is deprecated and will be removed in a " + "future version.", FutureWarning, stacklevel=2) + return self._summary(name) + def _mpl_repr(self): # how to represent ourselves to matplotlib return self.values @@ -2815,7 +2845,7 @@ def difference(self, other): self._assert_can_do_setop(other) if self.equals(other): - return Index([], name=self.name) + return self._shallow_copy([]) other, result_name = self._convert_can_do_setop(other) @@ -3659,46 +3689,69 @@ def join(self, other, how='left', level=None, return_indexers=False, def _join_multi(self, other, how, return_indexers=True): from .multi import MultiIndex - self_is_mi = isinstance(self, MultiIndex) - other_is_mi = isinstance(other, MultiIndex) + from pandas.core.reshape.merge import _complete_multilevel_join # figure out join names - self_names = com._not_none(*self.names) - other_names = com._not_none(*other.names) - overlap = list(set(self_names) & set(other_names)) + self_names = set(com._not_none(*self.names)) + other_names = set(com._not_none(*other.names)) + overlap = list(self_names & other_names) - # need at least 1 in common, but not more than 1 + # need at least 1 in common if not len(overlap): - raise ValueError("cannot join with no level specified and no " - "overlapping names") - if len(overlap) > 1: - raise NotImplementedError("merging with more than one level " - "overlap on a multi-index is not " - "implemented") - jl = overlap[0] + raise ValueError("cannot join with no overlapping index names") + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) + + if self_is_mi and other_is_mi: + + # Drop the non matching levels + ldrop_levels = list(set(self_names) - set(overlap)) + rdrop_levels = list(set(other_names) - set(overlap)) + + self_jnlevels = self.droplevel(ldrop_levels) + other_jnlevels = other.droplevel(rdrop_levels) + + if not (self_jnlevels.is_unique and other_jnlevels.is_unique): + raise ValueError("Join on level between two MultiIndex objects" + "is ambiguous") + + dropped_levels = ldrop_levels + rdrop_levels + + join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how, + return_indexers=True) + + levels, labels, names = _complete_multilevel_join(self, other, how, + dropped_levels, + join_idx, + lidx, ridx) + + multi_join_idx = MultiIndex(levels=levels, labels=labels, + names=names, verify_integrity=False) + + multi_join_idx = multi_join_idx.remove_unused_levels() + + return multi_join_idx, lidx, ridx + + jl = list(overlap)[0] + + # Case where only one index is multi # make the indices into mi's that match - if not (self_is_mi and other_is_mi): - - flip_order = False - if self_is_mi: - self, other = other, self - flip_order = True - # flip if join method is right or left - how = {'right': 'left', 'left': 'right'}.get(how, how) - - level = other.names.index(jl) - result = self._join_level(other, level, how=how, - return_indexers=return_indexers) - - if flip_order: - if isinstance(result, tuple): - return result[0], result[2], result[1] - return result + flip_order = False + if self_is_mi: + self, other = other, self + flip_order = True + # flip if join method is right or left + how = {'right': 'left', 'left': 'right'}.get(how, how) + + level = other.names.index(jl) + result = self._join_level(other, level, how=how, + return_indexers=return_indexers) - # 2 multi-indexes - raise NotImplementedError("merging with both multi-indexes is not " - "implemented") + if flip_order: + if isinstance(result, tuple): + return result[0], 
result[2], result[1] + return result def _join_non_unique(self, other, how='left', return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 1be71ff68c2fb7..e9011a3eb912c3 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -58,12 +58,12 @@ class DatelikeOps(object): """ common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex """ def strftime(self, date_format): - return np.asarray(self.format(date_format=date_format), - dtype=compat.text_type) + return Index(self.format(date_format=date_format), + dtype=compat.text_type) strftime.__doc__ = """ - Convert to string array using specified date_format. + Convert to Index using specified date_format. - Return an array of formatted strings specified by date_format, which + Return an Index of formatted strings specified by date_format, which supports the same string format as the python standard library. Details of the string format can be found in `python string format doc <{0}>`__ @@ -74,8 +74,8 @@ def strftime(self, date_format): Returns ------- - numpy.ndarray - NumPy array of formatted strings + Index + Index of formatted strings See Also -------- @@ -1049,9 +1049,18 @@ def where(self, cond, other=None): return self._shallow_copy(result, **self._get_attributes_dict()) - def summary(self, name=None): + def _summary(self, name=None): """ - return a summarized representation + Return a summarized representation + + Parameters + ---------- + name : str + name to use in the summary representation + + Returns + ------- + String with a summarized representation of the index """ formatter = self._formatter_func if len(self) > 0: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 1f387dadfb9aec..e8bc9a2519333c 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1773,7 +1773,41 @@ def freq(self, value): is_month_end = _field_accessor( 'is_month_end', 'is_month_end', - "Logical indicating if last day of month (defined by frequency)") + """ + Indicator for whether the date is the last day of the month. + + Returns + ------- + Series or array + For Series, returns a Series with boolean values. For + DatetimeIndex, returns a boolean array. + + See Also + -------- + is_month_start : Indicator for whether the date is the first day + of the month. + + Examples + -------- + This method is available on Series with datetime values under + the ``.dt`` accessor, and directly on DatetimeIndex. + + >>> dates = pd.Series(pd.date_range("2018-02-27", periods=3)) + >>> dates + 0 2018-02-27 + 1 2018-02-28 + 2 2018-03-01 + dtype: datetime64[ns] + >>> dates.dt.is_month_end + 0 False + 1 True + 2 False + dtype: bool + + >>> idx = pd.date_range("2018-02-27", periods=3) + >>> idx.is_month_end + array([False, True, False], dtype=bool) + """) is_quarter_start = _field_accessor( 'is_quarter_start', 'is_quarter_start', @@ -1945,7 +1979,43 @@ def freq(self, value): is_leap_year = _field_accessor( 'is_leap_year', 'is_leap_year', - "Logical indicating if the date belongs to a leap year") + """ + Boolean indicator if the date belongs to a leap year. + + A leap year is a year, which has 366 days (instead of 365) including + 29th of February as an intercalary day. + Leap years are years which are multiples of four with the exception + of years divisible by 100 but not by 400. 
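As an illustrative aside (a small sketch, not part of the patch itself), the divisibility rule described above can be checked directly against ``is_leap_year``; the years below are chosen arbitrarily::

    import pandas as pd

    def is_leap(year):
        # Multiple of four, except century years not divisible by 400.
        return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)

    years = [1900, 2000, 2012, 2014]
    idx = pd.DatetimeIndex(['%d-12-31' % y for y in years])
    assert list(idx.is_leap_year) == [is_leap(y) for y in years]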
+
+        Returns
+        -------
+        Series or ndarray
+             Booleans indicating if dates belong to a leap year.
+
+        Examples
+        --------
+        This method is available on Series with datetime values under
+        the ``.dt`` accessor, and directly on DatetimeIndex.
+
+        >>> idx = pd.date_range("2012-01-01", "2015-01-01", freq="Y")
+        >>> idx
+        DatetimeIndex(['2012-12-31', '2013-12-31', '2014-12-31'],
+                      dtype='datetime64[ns]', freq='A-DEC')
+        >>> idx.is_leap_year
+        array([ True, False, False], dtype=bool)
+
+        >>> dates_series = pd.Series(idx)
+        >>> dates_series
+        0   2012-12-31
+        1   2013-12-31
+        2   2014-12-31
+        dtype: datetime64[ns]
+        >>> dates_series.dt.is_leap_year
+        0     True
+        1    False
+        2    False
+        dtype: bool
+        """)
 
     @property
     def time(self):
@@ -1964,11 +2034,41 @@ def date(self):
 
     def normalize(self):
         """
-        Return DatetimeIndex with times to midnight. Length is unaltered
+        Convert times to midnight.
+
+        The time component of the date-times is converted to midnight, i.e.
+        00:00:00. This is useful in cases when the time does not matter.
+        Length is unaltered. The timezones are unaffected.
+
+        This method is available on Series with datetime values under
+        the ``.dt`` accessor, and directly on DatetimeIndex.
 
         Returns
         -------
-        normalized : DatetimeIndex
+        DatetimeIndex or Series
+            The same type as the original data. Series will have the same
+            name and index. DatetimeIndex will have the same name.
+
+        See Also
+        --------
+        floor : Floor the datetimes to the specified freq.
+        ceil : Ceil the datetimes to the specified freq.
+        round : Round the datetimes to the specified freq.
+
+        Examples
+        --------
+        >>> idx = pd.DatetimeIndex(start='2014-08-01 10:00', freq='H',
+        ...                        periods=3, tz='Asia/Calcutta')
+        >>> idx
+        DatetimeIndex(['2014-08-01 10:00:00+05:30',
+                       '2014-08-01 11:00:00+05:30',
+                       '2014-08-01 12:00:00+05:30'],
+                      dtype='datetime64[ns, Asia/Calcutta]', freq='H')
+        >>> idx.normalize()
+        DatetimeIndex(['2014-08-01 00:00:00+05:30',
+                       '2014-08-01 00:00:00+05:30',
+                       '2014-08-01 00:00:00+05:30'],
+                      dtype='datetime64[ns, Asia/Calcutta]', freq=None)
         """
         new_values = conversion.date_normalize(self.asi8, self.tz)
         return DatetimeIndex(new_values, freq='infer', name=self.name,
@@ -2477,36 +2577,34 @@ def date_range(start=None, end=None, periods=None, freq='D', tz=None,
     """
     Return a fixed frequency DatetimeIndex.
 
-    The default frequency is day (calendar).
+    Exactly two of the three parameters `start`, `end` and `periods`
+    must be specified.
 
     Parameters
     ----------
-    start : string or datetime-like, default None
+    start : str or datetime-like, optional
         Left bound for generating dates.
-    end : string or datetime-like, default None
+    end : str or datetime-like, optional
         Right bound for generating dates.
-    periods : integer, default None
+    periods : integer, optional
         Number of periods to generate.
-    freq : string or DateOffset, default 'D' (calendar daily)
-        Frequency strings can have multiples, e.g. '5H'.
-    tz : string, default None
+    freq : str or DateOffset, default 'D' (calendar daily)
+        Frequency strings can have multiples, e.g. '5H'. See
+        :ref:`here ` for a list of
+        frequency aliases.
+    tz : str or tzinfo, optional
         Time zone name for returning localized DatetimeIndex, for example
-        Asia/Hong_Kong.
+        'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is
+        timezone-naive.
     normalize : bool, default False
         Normalize start/end dates to midnight before generating date range.
-    name : string, default None
+    name : str, default None
         Name of the resulting DatetimeIndex.
- closed : string, default None + closed : {None, 'left', 'right'}, optional Make the interval closed with respect to the given frequency to - the 'left', 'right', or both sides (None). - - Notes - ----- - Of the three parameters: ``start``, ``end``, and ``periods``, exactly two - must be specified. - - To learn more about the frequency strings, please see `this link - `__. + the 'left', 'right', or both sides (None, the default). + **kwargs + For compatibility. Has no effect on the result. Returns ------- @@ -2514,19 +2612,87 @@ def date_range(start=None, end=None, periods=None, freq='D', tz=None, See Also -------- + pandas.DatetimeIndex : An immutable container for datetimes. pandas.period_range : Return a fixed frequency PeriodIndex. pandas.interval_range : Return a fixed frequency IntervalIndex. Examples -------- - >>> pd.date_range('2018-10-03', periods=2) # doctest: +NORMALIZE_WHITESPACE - DatetimeIndex(['2018-10-03', '2018-10-04'], dtype='datetime64[ns]', - freq='D') - - >>> pd.date_range(start='2018-01-01', end='20180103') - ... # doctest: +NORMALIZE_WHITESPACE - DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'], - dtype='datetime64[ns]', freq='D') + **Specifying the values** + + The next three examples generate the same `DatetimeIndex`, but vary + the combination of `start`, `end` and `periods`. + + Specify `start` and `end`, with the default daily frequency. + + >>> pd.date_range(start='1/1/2018', end='1/08/2018') + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq='D') + + Specify `start` and `periods`, the number of periods (days). + + >>> pd.date_range(start='1/1/2018', periods=8) + DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', + '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], + dtype='datetime64[ns]', freq='D') + + Specify `end` and `periods`, the number of periods (days). + + >>> pd.date_range(end='1/1/2018', periods=8) + DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28', + '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'], + dtype='datetime64[ns]', freq='D') + + **Other Parameters** + + Changed the `freq` (frequency) to ``'M'`` (month end frequency). + + >>> pd.date_range(start='1/1/2018', periods=5, freq='M') + DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', + '2018-05-31'], + dtype='datetime64[ns]', freq='M') + + Multiples are allowed + + >>> pd.date_range(start='1/1/2018', periods=5, freq='3M') + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq='3M') + + `freq` can also be specified as an Offset object. + + >>> pd.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3)) + DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', + '2019-01-31'], + dtype='datetime64[ns]', freq='3M') + + Specify `tz` to set the timezone. + + >>> pd.date_range(start='1/1/2018', periods=5, tz='Asia/Tokyo') + DatetimeIndex(['2018-01-01 00:00:00+09:00', '2018-01-02 00:00:00+09:00', + '2018-01-03 00:00:00+09:00', '2018-01-04 00:00:00+09:00', + '2018-01-05 00:00:00+09:00'], + dtype='datetime64[ns, Asia/Tokyo]', freq='D') + + `closed` controls whether to include `start` and `end` that are on the + boundary. The default includes boundary points on either end. 
+ + >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed=None) + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], + dtype='datetime64[ns]', freq='D') + + Use ``closed='left'`` to exclude `end` if it falls on the boundary. + + >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed='left') + DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], + dtype='datetime64[ns]', freq='D') + + Use ``closed='right'`` to exclude `start` if it falls on the boundary. + + >>> pd.date_range(start='2017-01-01', end='2017-01-04', closed='right') + DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], + dtype='datetime64[ns]', freq='D') """ return DatetimeIndex(start=start, end=end, periods=periods, freq=freq, tz=tz, normalize=normalize, name=name, diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 60eda70714da5b..8226c4bcac4947 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2755,7 +2755,7 @@ def intersection(self, other): other_tuples = other._ndarray_values uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) if len(uniq_tuples) == 0: - return MultiIndex(levels=[[]] * self.nlevels, + return MultiIndex(levels=self.levels, labels=[[]] * self.nlevels, names=result_names, verify_integrity=False) else: @@ -2777,7 +2777,7 @@ def difference(self, other): return self if self.equals(other): - return MultiIndex(levels=[[]] * self.nlevels, + return MultiIndex(levels=self.levels, labels=[[]] * self.nlevels, names=result_names, verify_integrity=False) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 240c9b1f3377cd..bb6702b50ad3d9 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -17,6 +17,7 @@ from pandas.core.dtypes.dtypes import ( ExtensionDtype, DatetimeTZDtype, + PandasExtensionDtype, CategoricalDtype) from pandas.core.dtypes.common import ( _TD_DTYPE, _NS_DTYPE, @@ -598,7 +599,8 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, list(errors_legal_values), errors)) raise ValueError(invalid_arg) - if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype): + if (inspect.isclass(dtype) and + issubclass(dtype, (PandasExtensionDtype, ExtensionDtype))): msg = ("Expected an instance of {}, but got the class instead. 
" "Try instantiating 'dtype'.".format(dtype.__name__)) raise TypeError(msg) @@ -1963,6 +1965,23 @@ def concat_same_type(self, to_concat, placement=None): return self.make_block_same_class(values, ndim=self.ndim, placement=placement) + def fillna(self, value, limit=None, inplace=False, downcast=None, + mgr=None): + values = self.values if inplace else self.values.copy() + values = values.fillna(value=value, limit=limit) + return [self.make_block_same_class(values=values, + placement=self.mgr_locs, + ndim=self.ndim)] + + def interpolate(self, method='pad', axis=0, inplace=False, limit=None, + fill_value=None, **kwargs): + + values = self.values if inplace else self.values.copy() + return self.make_block_same_class( + values=values.fillna(value=fill_value, method=method, + limit=limit), + placement=self.mgr_locs) + class NumericBlock(Block): __slots__ = () @@ -2522,27 +2541,6 @@ def _try_coerce_result(self, result): return result - def fillna(self, value, limit=None, inplace=False, downcast=None, - mgr=None): - # we may need to upcast our fill to match our dtype - if limit is not None: - raise NotImplementedError("specifying a limit for 'fillna' has " - "not been implemented yet") - - values = self.values if inplace else self.values.copy() - values = self._try_coerce_result(values.fillna(value=value, - limit=limit)) - return [self.make_block(values=values)] - - def interpolate(self, method='pad', axis=0, inplace=False, limit=None, - fill_value=None, **kwargs): - - values = self.values if inplace else self.values.copy() - return self.make_block_same_class( - values=values.fillna(fill_value=fill_value, method=method, - limit=limit), - placement=self.mgr_locs) - def shift(self, periods, axis=0, mgr=None): return self.make_block_same_class(values=self.values.shift(periods), placement=self.mgr_locs) @@ -5005,7 +5003,7 @@ def _interleaved_dtype(blocks): dtype = find_common_type([b.dtype for b in blocks]) # only numpy compat - if isinstance(dtype, ExtensionDtype): + if isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)): dtype = np.object return dtype diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 6c6a54993b6697..e14f82906cd065 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -345,7 +345,7 @@ def _get_op_name(op, special): _add_example_FRAME = """ >>> a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], - columns=['one']) +... columns=['one']) >>> a one a 1.0 @@ -353,8 +353,8 @@ def _get_op_name(op, special): c 1.0 d NaN >>> b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], - two=[np.nan, 2, np.nan, 2]), - index=['a', 'b', 'd', 'e']) +... two=[np.nan, 2, np.nan, 2]), +... index=['a', 'b', 'd', 'e']) >>> b one two a 1.0 NaN @@ -370,6 +370,33 @@ def _get_op_name(op, special): e NaN 2.0 """ +_sub_example_FRAME = """ +>>> a = pd.DataFrame([2, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], +... columns=['one']) +>>> a + one +a 2.0 +b 1.0 +c 1.0 +d NaN +>>> b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], +... two=[3, 2, np.nan, 2]), +... 
index=['a', 'b', 'd', 'e']) +>>> b + one two +a 1.0 3.0 +b NaN 2.0 +d 1.0 NaN +e NaN 2.0 +>>> a.sub(b, fill_value=0) + one two +a 1.0 -3.0 +b 1.0 -2.0 +c 1.0 NaN +d -1.0 NaN +e NaN -2.0 +""" + _op_descriptions = { # Arithmetic Operators 'add': {'op': '+', @@ -379,7 +406,7 @@ def _get_op_name(op, special): 'sub': {'op': '-', 'desc': 'Subtraction', 'reverse': 'rsub', - 'df_examples': None}, + 'df_examples': _sub_example_FRAME}, 'mul': {'op': '*', 'desc': 'Multiplication', 'reverse': 'rmul', @@ -516,33 +543,6 @@ def _get_op_name(op, special): Returns ------- result : DataFrame - -Examples --------- ->>> a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], - columns=['one']) ->>> a - one -a 1.0 -b 1.0 -c 1.0 -d NaN ->>> b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], - two=[np.nan, 2, np.nan, 2]), - index=['a', 'b', 'd', 'e']) ->>> b - one two -a 1.0 NaN -b NaN 2.0 -d 1.0 NaN -e NaN 2.0 ->>> a.add(b, fill_value=0) - one two -a 2.0 NaN -b 1.0 2.0 -c 1.0 NaN -d 1.0 NaN -e NaN 2.0 """ _flex_doc_FRAME = """ @@ -556,14 +556,14 @@ def _get_op_name(op, special): other : Series, DataFrame, or constant axis : {{0, 1, 'index', 'columns'}} For Series input, axis to match Series index on +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level fill_value : None or float value, default None Fill existing missing (NaN) values, and any new element needed for successful DataFrame alignment, with this value before computation. If data in both corresponding DataFrame locations is missing the result will be missing -level : int or name - Broadcast across a level, matching Index values on the - passed MultiIndex level Notes ----- diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 7b1a0875bba590..3b77e2a3f480ff 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1143,6 +1143,82 @@ def _get_join_indexers(left_keys, right_keys, sort=False, how='inner', return join_func(lkey, rkey, count, **kwargs) +def _complete_multilevel_join(left, right, how, dropped_levels, + join_idx, lidx, ridx): + """ + *this is an internal non-public method* + + Returns the levels, labels and names of a multilevel to multilevel join + Depending on the type of join, this method restores the appropriate + dropped levels of the joined multi-index. The method relies on lidx, ridx + which hold the index positions of left and right, where a join was feasible + + Parameters + ---------- + left : Index + left index + right : Index + right index + join_idx : Index + the index of the join between the common levels of left and right + how : {'left', 'right', 'outer', 'inner'} + lidx : intp array + left indexer + right : intp array + right indexer + dropped_levels : str array + list of non-common levels + + Returns + ------- + levels : intp array + levels of combined multiindexes + labels : str array + labels of combined multiindexes + names : str array + names of combined multiindexes + + """ + + join_levels = join_idx.levels + join_labels = join_idx.labels + join_names = join_idx.names + + # lidx and ridx hold the indexes where the join occured + # for left and right respectively. 
If left (right) is None it means that + # the join occured on all indices of left (right) + if lidx is None: + lidx = range(0, len(left)) + + if ridx is None: + ridx = range(0, len(right)) + + # Iterate through the levels that must be restored + for dl in dropped_levels: + if dl in left.names: + idx = left + indexer = lidx + else: + idx = right + indexer = ridx + + # The index of the level name to be restored + name_idx = idx.names.index(dl) + + restore_levels = idx.levels[name_idx].values + restore_labels = idx.labels[name_idx] + + join_levels = join_levels.__add__([restore_levels]) + join_names = join_names.__add__([dl]) + + # Inject -1 in the labels list where a join was not possible + # IOW indexer[i]=-1 + labels = [restore_labels[i] if i != -1 else -1 for i in indexer] + join_labels = join_labels.__add__([labels]) + + return join_levels, join_labels, join_names + + class _OrderedMerge(_MergeOperation): _merge_type = 'ordered_merge' diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 30132ddc05c406..be28f7091712fc 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -26,69 +26,133 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False): """ - Return indices of half-open bins to which each value of `x` belongs. + Bin values into discrete intervals. + + Use `cut` when you need to segment and sort data values into bins. This + function is also useful for going from a continuous variable to a + categorical variable. For example, `cut` could convert ages to groups of + age ranges. Supports binning into an equal number of bins, or a + pre-specified array of bins. Parameters ---------- x : array-like - Input array to be binned. It has to be 1-dimensional. - bins : int, sequence of scalars, or IntervalIndex - If `bins` is an int, it defines the number of equal-width bins in the - range of `x`. However, in this case, the range of `x` is extended - by .1% on each side to include the min or max values of `x`. If - `bins` is a sequence it defines the bin edges allowing for - non-uniform bin width. No extension of the range of `x` is done in - this case. - right : bool, optional - Indicates whether the bins include the rightmost edge or not. If - right == True (the default), then the bins [1,2,3,4] indicate - (1,2], (2,3], (3,4]. - labels : array or boolean, default None - Used as labels for the resulting bins. Must be of the same length as - the resulting bins. If False, return only integer indicators of the - bins. - retbins : bool, optional - Whether to return the bins or not. Can be useful if bins is given + The input array to be binned. Must be 1-dimensional. + bins : int, sequence of scalars, or pandas.IntervalIndex + The criteria to bin by. + + * int : Defines the number of equal-width bins in the range of `x`. The + range of `x` is extended by .1% on each side to include the minimum + and maximum values of `x`. + * sequence of scalars : Defines the bin edges allowing for non-uniform + width. No extension of the range of `x` is done. + * IntervalIndex : Defines the exact bins to be used. + + right : bool, default True + Indicates whether `bins` includes the rightmost edge or not. If + ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` + indicate (1,2], (2,3], (3,4]. This argument is ignored when + `bins` is an IntervalIndex. + labels : array or bool, optional + Specifies the labels for the returned bins. Must be the same length as + the resulting bins. 
If False, returns only integer indicators of the + bins. This affects the type of the output container (see below). + This argument is ignored when `bins` is an IntervalIndex. + retbins : bool, default False + Whether to return the bins or not. Useful when bins is provided as a scalar. - precision : int, optional - The precision at which to store and display the bins labels - include_lowest : bool, optional + precision : int, default 3 + The precision at which to store and display the bins labels. + include_lowest : bool, default False Whether the first interval should be left-inclusive or not. Returns ------- - out : Categorical or Series or array of integers if labels is False - The return type (Categorical or Series) depends on the input: a Series - of type category if input is a Series else Categorical. Bins are - represented as categories when categorical data is returned. - bins : ndarray of floats - Returned only if `retbins` is True. + out : pandas.Categorical, Series, or ndarray + An array-like object representing the respective bin for each value + of `x`. The type depends on the value of `labels`. - Notes - ----- - The `cut` function can be useful for going from a continuous variable to - a categorical variable. For example, `cut` could convert ages to groups - of age ranges. + * True (default) : returns a Series for Series `x` or a + pandas.Categorical for all other inputs. The values stored within + are Interval dtype. - Any NA values will be NA in the result. Out of bounds values will be NA in - the resulting Categorical object + * sequence of scalars : returns a Series for Series `x` or a + pandas.Categorical for all other inputs. The values stored within + are whatever the type in the sequence is. + * False : returns an ndarray of integers. + + bins : numpy.ndarray or IntervalIndex. + The computed or specified bins. Only returned when `retbins=True`. + For scalar or sequence `bins`, this is an ndarray with the computed + bins. For an IntervalIndex `bins`, this is equal to `bins`. + + See Also + -------- + qcut : Discretize variable into equal-sized buckets based on rank + or based on sample quantiles. + pandas.Categorical : Array type for storing data that come from a + fixed set of values. + Series : One-dimensional array with axis labels (including time series). + pandas.IntervalIndex : Immutable Index implementing an ordered, + sliceable set. + + Notes + ----- + Any NA values will be NA in the result. Out of bounds values will be NA in + the resulting Series or pandas.Categorical object. Examples -------- - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) + Discretize into three equal-sized bins. + + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3) ... # doctest: +ELLIPSIS - ([(0.19, 3.367], (0.19, 3.367], (0.19, 3.367], (3.367, 6.533], ... - Categories (3, interval[float64]): [(0.19, 3.367] < (3.367, 6.533] ... + [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... + Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... - >>> pd.cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), - ... 3, labels=["good", "medium", "bad"]) - ... # doctest: +SKIP - [good, good, good, medium, bad, good] - Categories (3, object): [good < medium < bad] + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) + ... # doctest: +ELLIPSIS + ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... + Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... + array([0.994, 3. , 5. , 7. 
])) + + Discovers the same bins, but assign them specific labels. Notice that + the returned Categorical's categories are `labels` and is ordered. + + >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), + ... 3, labels=["bad", "medium", "good"]) + [bad, good, medium, medium, good, bad] + Categories (3, object): [bad < medium < good] - >>> pd.cut(np.ones(5), 4, labels=False) - array([1, 1, 1, 1, 1]) + ``labels=False`` implies you just want the bins back. + + >>> pd.cut([0, 1, 1, 2], bins=4, labels=False) + array([0, 1, 1, 3]) + + Passing a Series as an input returns a Series with categorical dtype: + + >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), + ... index=['a', 'b', 'c', 'd', 'e']) + >>> pd.cut(s, 3) + ... # doctest: +ELLIPSIS + a (1.992, 4.667] + b (1.992, 4.667] + c (4.667, 7.333] + d (7.333, 10.0] + e (7.333, 10.0] + dtype: category + Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ... + + Passing an IntervalIndex for `bins` results in those categories exactly. + Notice that values not covered by the IntervalIndex are set to NaN. 0 + is to the left of the first bin (which is closed on the right), and 1.5 + falls between two bins. + + >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) + >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) + [NaN, (0, 1], NaN, (2, 3], (4, 5]] + Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 diff --git a/pandas/core/series.py b/pandas/core/series.py index 07cfc671cbd288..e4801242073a20 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1071,55 +1071,112 @@ def _set_value(self, label, value, takeable=False): def reset_index(self, level=None, drop=False, name=None, inplace=False): """ - Analogous to the :meth:`pandas.DataFrame.reset_index` function, see - docstring there. + Generate a new DataFrame or Series with the index reset. + + This is useful when the index needs to be treated as a column, or + when the index is meaningless and needs to be reset to the default + before another operation. Parameters ---------- - level : int, str, tuple, or list, default None - Only remove the given levels from the index. Removes all levels by - default - drop : boolean, default False - Do not try to insert index into dataframe columns - name : object, default None - The name of the column corresponding to the Series values - inplace : boolean, default False - Modify the Series in place (do not create a new object) + level : int, str, tuple, or list, default optional + For a Series with a MultiIndex, only remove the specified levels + from the index. Removes all levels by default. + drop : bool, default False + Just reset the index, without inserting it as a column in + the new DataFrame. + name : object, optional + The name to use for the column containing the original Series + values. Uses ``self.name`` by default. This argument is ignored + when `drop` is True. + inplace : bool, default False + Modify the Series in place (do not create a new object). Returns - ---------- - resetted : DataFrame, or Series if drop == True + ------- + Series or DataFrame + When `drop` is False (the default), a DataFrame is returned. + The newly created columns will come first in the DataFrame, + followed by the original Series values. + When `drop` is True, a `Series` is returned. + In either case, if ``inplace=True``, no value is returned. + + See Also + -------- + DataFrame.reset_index: Analogous function for DataFrame. 
Examples -------- - >>> s = pd.Series([1, 2, 3, 4], index=pd.Index(['a', 'b', 'c', 'd'], - ... name = 'idx')) + + >>> s = pd.Series([1, 2, 3, 4], name='foo', + ... index=pd.Index(['a', 'b', 'c', 'd'], name='idx')) + + Generate a DataFrame with default index. + >>> s.reset_index() - idx 0 - 0 a 1 - 1 b 2 - 2 c 3 - 3 d 4 - - >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', - ... 'foo', 'qux', 'qux']), - ... np.array(['one', 'two', 'one', 'two', 'one', 'two', - ... 'one', 'two'])] + idx foo + 0 a 1 + 1 b 2 + 2 c 3 + 3 d 4 + + To specify the name of the new column use `name`. + + >>> s.reset_index(name='values') + idx values + 0 a 1 + 1 b 2 + 2 c 3 + 3 d 4 + + To generate a new Series with the default set `drop` to True. + + >>> s.reset_index(drop=True) + 0 1 + 1 2 + 2 3 + 3 4 + Name: foo, dtype: int64 + + To update the Series in place, without generating a new one + set `inplace` to True. Note that it also requires ``drop=True``. + + >>> s.reset_index(inplace=True, drop=True) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + Name: foo, dtype: int64 + + The `level` parameter is interesting for Series with a multi-level + index. + + >>> arrays = [np.array(['bar', 'bar', 'baz', 'baz']), + ... np.array(['one', 'two', 'one', 'two'])] >>> s2 = pd.Series( - ... range(8), + ... range(4), name='foo', ... index=pd.MultiIndex.from_arrays(arrays, ... names=['a', 'b'])) + + To remove a specific level from the Index, use `level`. + >>> s2.reset_index(level='a') - a 0 + a foo b - one bar 0 - two bar 1 - one baz 2 - two baz 3 - one foo 4 - two foo 5 - one qux 6 - two qux 7 + one bar 0 + two bar 1 + one baz 2 + two baz 3 + + If `level` is not set, all levels are removed from the Index. + + >>> s2.reset_index() + a b foo + 0 bar one 0 + 1 bar two 1 + 2 baz one 2 + 3 baz two 3 """ inplace = validate_bool_kwarg(inplace, 'inplace') if drop: @@ -3626,13 +3683,74 @@ def notnull(self): def dropna(self, axis=0, inplace=False, **kwargs): """ - Return Series without null values + Return a new Series with missing values removed. + + See the :ref:`User Guide ` for more on which values are + considered missing, and how to work with missing data. + + Parameters + ---------- + axis : {0 or 'index'}, default 0 + There is only one axis to drop values from. + inplace : bool, default False + If True, do operation inplace and return None. + **kwargs + Not in use. Returns ------- - valid : Series - inplace : boolean, default False - Do operation in place. + Series + Series with NA entries dropped from it. + + See Also + -------- + Series.isna: Indicate missing values. + Series.notna : Indicate existing (non-missing) values. + Series.fillna : Replace missing values. + DataFrame.dropna : Drop rows or columns which contain NA values. + Index.dropna : Drop missing indices. + + Examples + -------- + >>> ser = pd.Series([1., 2., np.nan]) + >>> ser + 0 1.0 + 1 2.0 + 2 NaN + dtype: float64 + + Drop NA values from a Series. + + >>> ser.dropna() + 0 1.0 + 1 2.0 + dtype: float64 + + Keep the Series with valid entries in the same variable. + + >>> ser.dropna(inplace=True) + >>> ser + 0 1.0 + 1 2.0 + dtype: float64 + + Empty strings are not considered NA values. ``None`` is considered an + NA value. 
+ + >>> ser = pd.Series([np.NaN, 2, pd.NaT, '', None, 'I stay']) + >>> ser + 0 NaN + 1 2 + 2 NaT + 3 + 4 None + 5 I stay + dtype: object + >>> ser.dropna() + 1 2 + 3 + 5 I stay + dtype: object """ inplace = validate_bool_kwarg(inplace, 'inplace') kwargs.pop('how', None) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 08a1cc29b83672..2eb2d284d85182 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1322,19 +1322,75 @@ def str_slice(arr, start=None, stop=None, step=None): def str_slice_replace(arr, start=None, stop=None, repl=None): """ - Replace a slice of each string in the Series/Index with another - string. + Replace a positional slice of a string with another value. Parameters ---------- - start : int or None - stop : int or None - repl : str or None - String for replacement + start : int, optional + Left index position to use for the slice. If not specified (None), + the slice is unbounded on the left, i.e. slice from the start + of the string. + stop : int, optional + Right index position to use for the slice. If not specified (None), + the slice is unbounded on the right, i.e. slice until the + end of the string. + repl : str, optional + String for replacement. If not specified (None), the sliced region + is replaced with an empty string. Returns ------- - replaced : Series/Index of objects + replaced : Series or Index + Same type as the original object. + + See Also + -------- + Series.str.slice : Just slicing without replacement. + + Examples + -------- + >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) + >>> s + 0 a + 1 ab + 2 abc + 3 abdc + 4 abcde + dtype: object + + Specify just `start`, meaning replace `start` until the end of the + string with `repl`. + + >>> s.str.slice_replace(1, repl='X') + 0 aX + 1 aX + 2 aX + 3 aX + 4 aX + dtype: object + + Specify just `stop`, meaning the start of the string to `stop` is replaced + with `repl`, and the rest of the string is included. + + >>> s.str.slice_replace(stop=2, repl='X') + 0 X + 1 X + 2 Xc + 3 Xdc + 4 Xcde + dtype: object + + Specify `start` and `stop`, meaning the slice from `start` to `stop` is + replaced with `repl`. Everything before or after `start` and `stop` is + included as is. + + >>> s.str.slice_replace(start=1, stop=3, repl='X') + 0 aX + 1 aX + 2 aX + 3 aXc + 4 aXde + dtype: object """ if repl is None: repl = '' diff --git a/pandas/core/window.py b/pandas/core/window.py index 358ef98e1c072b..b6217ae344ca5f 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -395,7 +395,48 @@ def aggregate(self, arg, *args, **kwargs): """) _shared_docs['mean'] = dedent(""" - %(name)s mean""") + Calculate the %(name)s mean of the values. + + Parameters + ---------- + *args + Under Review. + **kwargs + Under Review. + + Returns + ------- + Series or DataFrame + Returned object type is determined by the caller of the %(name)s + calculation. + + See Also + -------- + Series.%(name)s : Calling object with Series data + DataFrame.%(name)s : Calling object with DataFrames + Series.mean : Equivalent method for Series + DataFrame.mean : Equivalent method for DataFrame + + Examples + -------- + The below examples will show rolling mean calculations with window sizes of + two and three, respectively. 
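Both examples below leave leading ``NaN`` values because fewer observations than the window size are available at the start of the Series; the existing ``min_periods`` argument relaxes that requirement (an illustrative sketch, not part of the doctests below):

>>> pd.Series([1, 2, 3, 4]).rolling(3, min_periods=1).mean()
0    1.0
1    1.5
2    2.0
3    3.0
dtype: float64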
+ + >>> s = pd.Series([1, 2, 3, 4]) + >>> s.rolling(2).mean() + 0 NaN + 1 1.5 + 2 2.5 + 3 3.5 + dtype: float64 + + >>> s.rolling(3).mean() + 0 NaN + 1 NaN + 2 2.0 + 3 3.0 + dtype: float64 + """) class Window(_Window): @@ -718,7 +759,6 @@ def sum(self, *args, **kwargs): return self._apply_window(mean=False, **kwargs) @Substitution(name='window') - @Appender(_doc_template) @Appender(_shared_docs['mean']) def mean(self, *args, **kwargs): nv.validate_window_func('mean', args, kwargs) @@ -910,7 +950,38 @@ def max(self, *args, **kwargs): return self._apply('roll_max', 'max', **kwargs) _shared_docs['min'] = dedent(""" - %(name)s minimum + Calculate the %(name)s minimum. + + Parameters + ---------- + **kwargs + Under Review. + + Returns + ------- + Series or DataFrame + Returned object type is determined by the caller of the %(name)s + calculation. + + See Also + -------- + Series.%(name)s : Calling object with a Series + DataFrame.%(name)s : Calling object with a DataFrame + Series.min : Similar method for Series + DataFrame.min : Similar method for DataFrame + + Examples + -------- + Performing a rolling minimum with a window size of 3. + + >>> s = pd.Series([4, 3, 5, 2, 6]) + >>> s.rolling(3).min() + 0 NaN + 1 NaN + 2 3.0 + 3 2.0 + 4 2.0 + dtype: float64 """) def min(self, *args, **kwargs): @@ -1410,14 +1481,12 @@ def max(self, *args, **kwargs): return super(Rolling, self).max(*args, **kwargs) @Substitution(name='rolling') - @Appender(_doc_template) @Appender(_shared_docs['min']) def min(self, *args, **kwargs): nv.validate_rolling_func('min', args, kwargs) return super(Rolling, self).min(*args, **kwargs) @Substitution(name='rolling') - @Appender(_doc_template) @Appender(_shared_docs['mean']) def mean(self, *args, **kwargs): nv.validate_rolling_func('mean', args, kwargs) @@ -1671,14 +1740,12 @@ def max(self, *args, **kwargs): return super(Expanding, self).max(*args, **kwargs) @Substitution(name='expanding') - @Appender(_doc_template) @Appender(_shared_docs['min']) def min(self, *args, **kwargs): nv.validate_expanding_func('min', args, kwargs) return super(Expanding, self).min(*args, **kwargs) @Substitution(name='expanding') - @Appender(_doc_template) @Appender(_shared_docs['mean']) def mean(self, *args, **kwargs): nv.validate_expanding_func('mean', args, kwargs) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index ff34df64c88d2f..f2794574944e7e 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -103,10 +103,42 @@ class EmptyDataError(ValueError): class ParserWarning(Warning): """ - Warning that is raised in `pd.read_csv` whenever it is necessary - to change parsers (generally from 'c' to 'python') contrary to the - one specified by the user due to lack of support or functionality for - parsing particular attributes of a CSV file with the requested engine. + Warning raised when reading a file that doesn't use the default 'c' parser. + + Raised by `pd.read_csv` and `pd.read_table` when it is necessary to change + parsers, generally from the default 'c' parser to 'python'. + + It happens due to a lack of support or functionality for parsing a + particular attribute of a CSV file with the requested engine. + + Currently, 'c' unsupported options include the following parameters: + + 1. `sep` other than a single character (e.g. regex separators) + 2. `skipfooter` higher than 0 + 3. `sep=None` with `delim_whitespace=False` + + The warning can be avoided by adding `engine='python'` as a parameter in + `pd.read_csv` and `pd.read_table` methods. 
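If the fallback to the 'python' engine is expected and acceptable, the warning can also be silenced with the standard :mod:`warnings` machinery instead of changing the call (a minimal sketch; filtering globally is only one option):

>>> import warnings
>>> warnings.simplefilter('ignore', pd.errors.ParserWarning)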
+ + See Also + -------- + pd.read_csv : Read CSV (comma-separated) file into DataFrame. + pd.read_table : Read general delimited file into DataFrame. + + Examples + -------- + Using a `sep` in `pd.read_csv` other than a single character: + + >>> import io + >>> csv = u'''a;b;c + ... 1;1,8 + ... 1;2,1''' + >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]') + ... # ParserWarning: Falling back to the 'python' engine... + + Adding `engine='python'` to `pd.read_csv` removes the Warning: + + >>> df = pd.read_csv(io.StringIO(csv), sep='[;,]', engine='python') """ diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index cf3ae3c0368d39..da7c58428fb54f 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -3133,19 +3133,51 @@ def area(self, x=None, y=None, **kwds): def pie(self, y=None, **kwds): """ - Pie chart + Generate a pie plot. + + A pie plot is a proportional representation of the numerical data in a + column. This function wraps :meth:`matplotlib.pyplot.pie` for the + specified column. If no column reference is passed and + ``subplots=True`` a pie plot is drawn for each numerical column + independently. Parameters ---------- - y : label or position, optional - Column to plot. - `**kwds` : optional - Additional keyword arguments are documented in - :meth:`pandas.DataFrame.plot`. + y : int or label, optional + Label or position of the column to plot. + If not provided, ``subplots=True`` argument must be passed. + **kwds + Keyword arguments to pass on to :meth:`pandas.DataFrame.plot`. Returns ------- - axes : :class:`matplotlib.axes.Axes` or numpy.ndarray of them + axes : matplotlib.axes.Axes or np.ndarray of them. + A NumPy array is returned when `subplots` is True. + + See Also + -------- + Series.plot.pie : Generate a pie plot for a Series. + DataFrame.plot : Make plots of a DataFrame. + + Examples + -------- + In the example below we have a DataFrame with the information about + planet's mass and radius. We pass the the 'mass' column to the + pie function to get a pie plot. + + .. plot:: + :context: close-figs + + >>> df = pd.DataFrame({'mass': [0.330, 4.87 , 5.97], + ... 'radius': [2439.7, 6051.8, 6378.1]}, + ... index=['Mercury', 'Venus', 'Earth']) + >>> plot = df.plot.pie(y='mass', figsize=(5, 5)) + + .. 
plot:: + :context: close-figs + + >>> plot = df.plot.pie(subplots=True, figsize=(6, 3)) + """ return self(kind='pie', y=y, **kwds) diff --git a/pandas/tests/categorical/test_algos.py b/pandas/tests/categorical/test_algos.py new file mode 100644 index 00000000000000..61764ec0ff6323 --- /dev/null +++ b/pandas/tests/categorical/test_algos.py @@ -0,0 +1,49 @@ +import pytest +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + + +@pytest.mark.parametrize('ordered', [True, False]) +@pytest.mark.parametrize('categories', [ + ['b', 'a', 'c'], + ['a', 'b', 'c', 'd'], +]) +def test_factorize(categories, ordered): + cat = pd.Categorical(['b', 'b', 'a', 'c', None], + categories=categories, + ordered=ordered) + labels, uniques = pd.factorize(cat) + expected_labels = np.array([0, 0, 1, 2, -1], dtype='int64') + expected_uniques = pd.Categorical(['b', 'a', 'c'], + categories=categories, + ordered=ordered) + + tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_categorical_equal(uniques, expected_uniques) + + +def test_factorized_sort(): + cat = pd.Categorical(['b', 'b', None, 'a']) + labels, uniques = pd.factorize(cat, sort=True) + expected_labels = np.array([1, 1, -1, 0], dtype='int64') + expected_uniques = pd.Categorical(['a', 'b']) + + tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_categorical_equal(uniques, expected_uniques) + + +def test_factorized_sort_ordered(): + cat = pd.Categorical(['b', 'b', None, 'a'], + categories=['c', 'b', 'a'], + ordered=True) + + labels, uniques = pd.factorize(cat, sort=True) + expected_labels = np.array([0, 0, -1, 1], dtype='int64') + expected_uniques = pd.Categorical(['b', 'a'], + categories=['c', 'b', 'a'], + ordered=True) + + tm.assert_numpy_array_equal(labels, expected_labels) + tm.assert_categorical_equal(uniques, expected_uniques) diff --git a/pandas/tests/categorical/test_dtypes.py b/pandas/tests/categorical/test_dtypes.py index 8973d1196f6a92..00e99db628c2ac 100644 --- a/pandas/tests/categorical/test_dtypes.py +++ b/pandas/tests/categorical/test_dtypes.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- - import pytest import numpy as np import pandas.util.testing as tm from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas import Categorical, Index, CategoricalIndex, Series +from pandas.compat import long +from pandas import Categorical, Index, CategoricalIndex, Series, Timestamp class TestCategoricalDtypes(object): @@ -161,3 +161,16 @@ def test_astype_category(self, dtype_ordered, cat_ordered): result = cat.astype('category') expected = cat tm.assert_categorical_equal(result, expected) + + def test_iter_python_types(self): + # GH-19909 + # TODO(Py2): Remove long + cat = Categorical([1, 2]) + assert isinstance(list(cat)[0], (int, long)) + assert isinstance(cat.tolist()[0], (int, long)) + + def test_iter_python_types_datetime(self): + cat = Categorical([Timestamp('2017-01-01'), + Timestamp('2017-01-02')]) + assert isinstance(list(cat)[0], Timestamp) + assert isinstance(cat.tolist()[0], Timestamp) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index bfec229d32b22d..2960a12b133d26 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -16,44 +16,46 @@ class TestPandasDtype(object): # Passing invalid dtype, both as a string or object, must raise TypeError # Per issue GH15520 - def test_invalid_dtype_error(self): - msg = 'not understood' - invalid_list = [pd.Timestamp, 'pd.Timestamp', list] - for dtype in invalid_list: - with 
tm.assert_raises_regex(TypeError, msg): - com.pandas_dtype(dtype) - - valid_list = [object, 'float64', np.object_, np.dtype('object'), 'O', - np.float64, float, np.dtype('float64')] - for dtype in valid_list: - com.pandas_dtype(dtype) - - def test_numpy_dtype(self): - for dtype in ['M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']: - assert com.pandas_dtype(dtype) == np.dtype(dtype) + @pytest.mark.parametrize('box', [pd.Timestamp, 'pd.Timestamp', list]) + def test_invalid_dtype_error(self, box): + with tm.assert_raises_regex(TypeError, 'not understood'): + com.pandas_dtype(box) + + @pytest.mark.parametrize('dtype', [ + object, 'float64', np.object_, np.dtype('object'), 'O', + np.float64, float, np.dtype('float64')]) + def test_pandas_dtype_valid(self, dtype): + assert com.pandas_dtype(dtype) == dtype + + @pytest.mark.parametrize('dtype', [ + 'M8[ns]', 'm8[ns]', 'object', 'float64', 'int64']) + def test_numpy_dtype(self, dtype): + assert com.pandas_dtype(dtype) == np.dtype(dtype) def test_numpy_string_dtype(self): # do not parse freq-like string as period dtype assert com.pandas_dtype('U') == np.dtype('U') assert com.pandas_dtype('S') == np.dtype('S') - def test_datetimetz_dtype(self): - for dtype in ['datetime64[ns, US/Eastern]', - 'datetime64[ns, Asia/Tokyo]', - 'datetime64[ns, UTC]']: - assert com.pandas_dtype(dtype) is DatetimeTZDtype(dtype) - assert com.pandas_dtype(dtype) == DatetimeTZDtype(dtype) - assert com.pandas_dtype(dtype) == dtype + @pytest.mark.parametrize('dtype', [ + 'datetime64[ns, US/Eastern]', + 'datetime64[ns, Asia/Tokyo]', + 'datetime64[ns, UTC]']) + def test_datetimetz_dtype(self, dtype): + assert com.pandas_dtype(dtype) is DatetimeTZDtype(dtype) + assert com.pandas_dtype(dtype) == DatetimeTZDtype(dtype) + assert com.pandas_dtype(dtype) == dtype def test_categorical_dtype(self): assert com.pandas_dtype('category') == CategoricalDtype() - def test_period_dtype(self): - for dtype in ['period[D]', 'period[3M]', 'period[U]', - 'Period[D]', 'Period[3M]', 'Period[U]']: - assert com.pandas_dtype(dtype) is PeriodDtype(dtype) - assert com.pandas_dtype(dtype) == PeriodDtype(dtype) - assert com.pandas_dtype(dtype) == dtype + @pytest.mark.parametrize('dtype', [ + 'period[D]', 'period[3M]', 'period[U]', + 'Period[D]', 'Period[3M]', 'Period[U]']) + def test_period_dtype(self, dtype): + assert com.pandas_dtype(dtype) is PeriodDtype(dtype) + assert com.pandas_dtype(dtype) == PeriodDtype(dtype) + assert com.pandas_dtype(dtype) == dtype dtypes = dict(datetime_tz=com.pandas_dtype('datetime64[ns, US/Eastern]'), diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index ca579e2dc93908..b6c5c119ffb6fb 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -1,77 +1,53 @@ # -*- coding: utf-8 -*- -import pandas as pd +import pytest import pandas.core.dtypes.concat as _concat - - -class TestConcatCompat(object): - - def check_concat(self, to_concat, exp): - for klass in [pd.Index, pd.Series]: - to_concat_klass = [klass(c) for c in to_concat] - res = _concat.get_dtype_kinds(to_concat_klass) - assert res == set(exp) - - def test_get_dtype_kinds(self): - to_concat = [['a'], [1, 2]] - self.check_concat(to_concat, ['i', 'object']) - - to_concat = [[3, 4], [1, 2]] - self.check_concat(to_concat, ['i']) - - to_concat = [[3, 4], [1, 2.1]] - self.check_concat(to_concat, ['i', 'f']) - - def test_get_dtype_kinds_datetimelike(self): - to_concat = [pd.DatetimeIndex(['2011-01-01']), - pd.DatetimeIndex(['2011-01-02'])] - 
self.check_concat(to_concat, ['datetime']) - - to_concat = [pd.TimedeltaIndex(['1 days']), - pd.TimedeltaIndex(['2 days'])] - self.check_concat(to_concat, ['timedelta']) - - def test_get_dtype_kinds_datetimelike_object(self): - to_concat = [pd.DatetimeIndex(['2011-01-01']), - pd.DatetimeIndex(['2011-01-02'], tz='US/Eastern')] - self.check_concat(to_concat, - ['datetime', 'datetime64[ns, US/Eastern]']) - - to_concat = [pd.DatetimeIndex(['2011-01-01'], tz='Asia/Tokyo'), - pd.DatetimeIndex(['2011-01-02'], tz='US/Eastern')] - self.check_concat(to_concat, - ['datetime64[ns, Asia/Tokyo]', - 'datetime64[ns, US/Eastern]']) - - # timedelta has single type - to_concat = [pd.TimedeltaIndex(['1 days']), - pd.TimedeltaIndex(['2 hours'])] - self.check_concat(to_concat, ['timedelta']) - - to_concat = [pd.DatetimeIndex(['2011-01-01'], tz='Asia/Tokyo'), - pd.TimedeltaIndex(['1 days'])] - self.check_concat(to_concat, - ['datetime64[ns, Asia/Tokyo]', 'timedelta']) - - def test_get_dtype_kinds_period(self): - # because we don't have Period dtype (yet), - # Series results in object dtype - to_concat = [pd.PeriodIndex(['2011-01'], freq='M'), - pd.PeriodIndex(['2011-01'], freq='M')] - res = _concat.get_dtype_kinds(to_concat) - assert res == set(['period[M]']) - - to_concat = [pd.Series([pd.Period('2011-01', freq='M')]), - pd.Series([pd.Period('2011-02', freq='M')])] - res = _concat.get_dtype_kinds(to_concat) - assert res == set(['object']) - - to_concat = [pd.PeriodIndex(['2011-01'], freq='M'), - pd.PeriodIndex(['2011-01'], freq='D')] - res = _concat.get_dtype_kinds(to_concat) - assert res == set(['period[M]', 'period[D]']) - - to_concat = [pd.Series([pd.Period('2011-01', freq='M')]), - pd.Series([pd.Period('2011-02', freq='D')])] - res = _concat.get_dtype_kinds(to_concat) - assert res == set(['object']) +from pandas import ( + Index, DatetimeIndex, PeriodIndex, TimedeltaIndex, Series, Period) + + +@pytest.mark.parametrize('to_concat, expected', [ + # int/float/str + ([['a'], [1, 2]], ['i', 'object']), + ([[3, 4], [1, 2]], ['i']), + ([[3, 4], [1, 2.1]], ['i', 'f']), + + # datetimelike + ([DatetimeIndex(['2011-01-01']), DatetimeIndex(['2011-01-02'])], + ['datetime']), + ([TimedeltaIndex(['1 days']), TimedeltaIndex(['2 days'])], + ['timedelta']), + + # datetimelike object + ([DatetimeIndex(['2011-01-01']), + DatetimeIndex(['2011-01-02'], tz='US/Eastern')], + ['datetime', 'datetime64[ns, US/Eastern]']), + ([DatetimeIndex(['2011-01-01'], tz='Asia/Tokyo'), + DatetimeIndex(['2011-01-02'], tz='US/Eastern')], + ['datetime64[ns, Asia/Tokyo]', 'datetime64[ns, US/Eastern]']), + ([TimedeltaIndex(['1 days']), TimedeltaIndex(['2 hours'])], + ['timedelta']), + ([DatetimeIndex(['2011-01-01'], tz='Asia/Tokyo'), + TimedeltaIndex(['1 days'])], + ['datetime64[ns, Asia/Tokyo]', 'timedelta'])]) +@pytest.mark.parametrize('klass', [Index, Series]) +def test_get_dtype_kinds(klass, to_concat, expected): + to_concat_klass = [klass(c) for c in to_concat] + result = _concat.get_dtype_kinds(to_concat_klass) + assert result == set(expected) + + +@pytest.mark.parametrize('to_concat, expected', [ + # because we don't have Period dtype (yet), + # Series results in object dtype + ([PeriodIndex(['2011-01'], freq='M'), + PeriodIndex(['2011-01'], freq='M')], ['period[M]']), + ([Series([Period('2011-01', freq='M')]), + Series([Period('2011-02', freq='M')])], ['object']), + ([PeriodIndex(['2011-01'], freq='M'), + PeriodIndex(['2011-01'], freq='D')], ['period[M]', 'period[D]']), + ([Series([Period('2011-01', freq='M')]), + Series([Period('2011-02', 
freq='D')])], ['object'])]) +def test_get_dtype_kinds_period(to_concat, expected): + result = _concat.get_dtype_kinds(to_concat) + assert result == set(expected) diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index adc690939b36c1..74fe8f196a089b 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -11,3 +11,8 @@ def test_astype_object_series(self, all_data): ser = pd.Series({"A": all_data}) result = ser.astype(object) assert isinstance(result._data.blocks[0], ObjectBlock) + + def test_tolist(self, data): + result = pd.Series(data).tolist() + expected = list(data) + assert result == expected diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index e1596f0675f32f..2162552e9650dc 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -32,9 +32,6 @@ def test_array_interface(self, data): result = np.array(data) assert result[0] == data[0] - def test_as_ndarray_with_dtype_kind(self, data): - np.array(data, dtype=data.dtype.kind) - def test_repr(self, data): ser = pd.Series(data) assert data.dtype.name in repr(ser) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 3ae82fa1ca4324..bf404ac01bf2bd 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas as pd import pandas.util.testing as tm @@ -45,3 +46,71 @@ def test_dropna_frame(self, data_missing): result = df.dropna() expected = df.iloc[:0] self.assert_frame_equal(result, expected) + + def test_fillna_limit_pad(self, data_missing): + arr = data_missing.take([1, 0, 0, 0, 1]) + result = pd.Series(arr).fillna(method='ffill', limit=2) + expected = pd.Series(data_missing.take([1, 1, 1, 0, 1])) + self.assert_series_equal(result, expected) + + def test_fillna_limit_backfill(self, data_missing): + arr = data_missing.take([1, 0, 0, 0, 1]) + result = pd.Series(arr).fillna(method='backfill', limit=2) + expected = pd.Series(data_missing.take([1, 0, 1, 1, 1])) + self.assert_series_equal(result, expected) + + def test_fillna_series(self, data_missing): + fill_value = data_missing[1] + ser = pd.Series(data_missing) + + result = ser.fillna(fill_value) + expected = pd.Series(type(data_missing)([fill_value, fill_value])) + self.assert_series_equal(result, expected) + + # Fill with a series + result = ser.fillna(expected) + self.assert_series_equal(result, expected) + + # Fill with a series not affecting the missing values + result = ser.fillna(ser) + self.assert_series_equal(result, ser) + + @pytest.mark.parametrize('method', ['ffill', 'bfill']) + def test_fillna_series_method(self, data_missing, method): + fill_value = data_missing[1] + + if method == 'ffill': + data_missing = type(data_missing)(data_missing[::-1]) + + result = pd.Series(data_missing).fillna(method=method) + expected = pd.Series(type(data_missing)([fill_value, fill_value])) + + self.assert_series_equal(result, expected) + + def test_fillna_frame(self, data_missing): + fill_value = data_missing[1] + + result = pd.DataFrame({ + "A": data_missing, + "B": [1, 2] + }).fillna(fill_value) + + expected = pd.DataFrame({ + "A": type(data_missing)([fill_value, fill_value]), + "B": [1, 2], + }) + + self.assert_frame_equal(result, expected) + + def test_fillna_fill_other(self, data): + result = pd.DataFrame({ + "A": data, + "B": [np.nan] * len(data) + }).fillna({"B": 0.0}) 
+ + expected = pd.DataFrame({ + "A": data, + "B": [0.0] * len(result), + }) + + self.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 8f413b4a197308..b6dd181c1d8f30 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -69,7 +69,14 @@ def test_getitem_scalar(self): class TestMissing(base.BaseMissingTests): - pass + + @pytest.mark.skip(reason="Not implemented") + def test_fillna_limit_pad(self): + pass + + @pytest.mark.skip(reason="Not implemented") + def test_fillna_limit_backfill(self): + pass class TestMethods(base.BaseMethodsTests): diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 736556e4be20df..f1852542088ff4 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -71,6 +71,7 @@ def isna(self): return np.array([x.is_nan() for x in self.values]) def take(self, indexer, allow_fill=True, fill_value=None): + indexer = np.asarray(indexer) mask = indexer == -1 indexer = _ensure_platform_int(indexer) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 7b4d079ecad873..01ae092bc15213 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -35,68 +35,59 @@ def na_value(): return decimal.Decimal("NaN") -class TestDtype(base.BaseDtypeTests): - pass +class BaseDecimal(object): + @staticmethod + def assert_series_equal(left, right, *args, **kwargs): + # tm.assert_series_equal doesn't handle Decimal('NaN'). + # We will ensure that the NA values match, and then + # drop those values before moving on. + left_na = left.isna() + right_na = right.isna() -class TestInterface(base.BaseInterfaceTests): - pass + tm.assert_series_equal(left_na, right_na) + tm.assert_series_equal(left[~left_na], right[~right_na], + *args, **kwargs) + @staticmethod + def assert_frame_equal(left, right, *args, **kwargs): + # TODO(EA): select_dtypes + decimals = (left.dtypes == 'decimal').index -class TestConstructors(base.BaseConstructorsTests): - pass + for col in decimals: + BaseDecimal.assert_series_equal(left[col], right[col], + *args, **kwargs) + left = left.drop(columns=decimals) + right = right.drop(columns=decimals) + tm.assert_frame_equal(left, right, *args, **kwargs) -class TestReshaping(base.BaseReshapingTests): - def test_align(self, data, na_value): - # Have to override since assert_series_equal doesn't - # compare Decimal(NaN) properly. 
- a = data[:3] - b = data[2:5] - r1, r2 = pd.Series(a).align(pd.Series(b, index=[1, 2, 3])) +class TestDtype(BaseDecimal, base.BaseDtypeTests): + pass - # NaN handling - e1 = pd.Series(type(data)(list(a) + [na_value])) - e2 = pd.Series(type(data)([na_value] + list(b))) - tm.assert_series_equal(r1.iloc[:3], e1.iloc[:3]) - assert r1[3].is_nan() - assert e1[3].is_nan() - tm.assert_series_equal(r2.iloc[1:], e2.iloc[1:]) - assert r2[0].is_nan() - assert e2[0].is_nan() +class TestInterface(BaseDecimal, base.BaseInterfaceTests): + pass - def test_align_frame(self, data, na_value): - # Override for Decimal(NaN) comparison - a = data[:3] - b = data[2:5] - r1, r2 = pd.DataFrame({'A': a}).align( - pd.DataFrame({'A': b}, index=[1, 2, 3]) - ) - # Assumes that the ctor can take a list of scalars of the type - e1 = pd.DataFrame({'A': type(data)(list(a) + [na_value])}) - e2 = pd.DataFrame({'A': type(data)([na_value] + list(b))}) +class TestConstructors(BaseDecimal, base.BaseConstructorsTests): + pass - tm.assert_frame_equal(r1.iloc[:3], e1.iloc[:3]) - assert r1.loc[3, 'A'].is_nan() - assert e1.loc[3, 'A'].is_nan() - tm.assert_frame_equal(r2.iloc[1:], e2.iloc[1:]) - assert r2.loc[0, 'A'].is_nan() - assert e2.loc[0, 'A'].is_nan() +class TestReshaping(BaseDecimal, base.BaseReshapingTests): + pass -class TestGetitem(base.BaseGetitemTests): +class TestGetitem(BaseDecimal, base.BaseGetitemTests): pass -class TestMissing(base.BaseMissingTests): +class TestMissing(BaseDecimal, base.BaseMissingTests): pass -class TestMethods(base.BaseMethodsTests): +class TestMethods(BaseDecimal, base.BaseMethodsTests): @pytest.mark.parametrize('dropna', [True, False]) @pytest.mark.xfail(reason="value_counts not implemented yet.") def test_value_counts(self, all_data, dropna): @@ -112,7 +103,7 @@ def test_value_counts(self, all_data, dropna): tm.assert_series_equal(result, expected) -class TestCasting(base.BaseCastingTests): +class TestCasting(BaseDecimal, base.BaseCastingTests): pass diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index e0721bb1d8d1af..16d5e4415a79f6 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -60,7 +60,13 @@ class TestGetitem(base.BaseGetitemTests): class TestMissing(base.BaseMissingTests): - pass + @pytest.mark.xfail(reason="Setting a dict as a scalar") + def test_fillna_series(self): + """We treat dictionaries as a mapping in fillna, not a scalar.""" + + @pytest.mark.xfail(reason="Setting a dict as a scalar") + def test_fillna_frame(self): + """We treat dictionaries as a mapping in fillna, not a scalar.""" class TestMethods(base.BaseMethodsTests): diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 1f4582f6874156..589134632c7e9d 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -5,10 +5,10 @@ import pandas.util.testing as tm from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.common import is_extension_array_dtype -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes import dtypes -class DummyDtype(ExtensionDtype): +class DummyDtype(dtypes.ExtensionDtype): pass @@ -65,3 +65,21 @@ def test_astype_no_copy(): result = arr.astype(arr.dtype) assert arr.data is not result + + +@pytest.mark.parametrize('dtype', [ + dtypes.DatetimeTZDtype('ns', 'US/Central'), + dtypes.PeriodDtype("D"), + dtypes.IntervalDtype(), +]) +def test_is_not_extension_array_dtype(dtype): + assert not 
isinstance(dtype, dtypes.ExtensionDtype) + assert not is_extension_array_dtype(dtype) + + +@pytest.mark.parametrize('dtype', [ + dtypes.CategoricalDtype(), +]) +def test_is_extension_array_dtype(dtype): + assert isinstance(dtype, dtypes.ExtensionDtype) + assert is_extension_array_dtype(dtype) diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 0d1a9e65ce6c61..63d5338d88d768 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -182,7 +182,7 @@ def test_dti_summary(self): for idx, expected in zip([idx1, idx2, idx3, idx4, idx5, idx6], [exp1, exp2, exp3, exp4, exp5, exp6]): - result = idx.summary() + result = idx._summary() assert result == expected def test_dti_business_repr(self): @@ -191,15 +191,15 @@ def test_dti_business_repr(self): def test_dti_business_summary(self): rng = pd.bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1)) - rng.summary() - rng[2:2].summary() + rng._summary() + rng[2:2]._summary() def test_dti_business_summary_pytz(self): - pd.bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() + pd.bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc)._summary() def test_dti_business_summary_dateutil(self): pd.bdate_range('1/1/2005', '1/1/2009', - tz=dateutil.tz.tzutc()).summary() + tz=dateutil.tz.tzutc())._summary() def test_dti_custom_business_repr(self): # only really care that it works @@ -209,12 +209,13 @@ def test_dti_custom_business_repr(self): def test_dti_custom_business_summary(self): rng = pd.bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1), freq='C') - rng.summary() - rng[2:2].summary() + rng._summary() + rng[2:2]._summary() def test_dti_custom_business_summary_pytz(self): - pd.bdate_range('1/1/2005', '1/1/2009', freq='C', tz=pytz.utc).summary() + pd.bdate_range('1/1/2005', '1/1/2009', freq='C', + tz=pytz.utc)._summary() def test_dti_custom_business_summary_dateutil(self): pd.bdate_range('1/1/2005', '1/1/2009', freq='C', - tz=dateutil.tz.tzutc()).summary() + tz=dateutil.tz.tzutc())._summary() diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index b1a1060bf86c43..c3926cc5f16334 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -205,5 +205,5 @@ def test_summary(self): idx6, idx7, idx8, idx9], [exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8, exp9]): - result = idx.summary() + result = idx._summary() assert result == expected diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index e8f05cb928cade..22ef2fe7aa19e1 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -20,7 +20,7 @@ from pandas import (period_range, date_range, Series, DataFrame, Float64Index, Int64Index, UInt64Index, CategoricalIndex, DatetimeIndex, TimedeltaIndex, - PeriodIndex, isna) + PeriodIndex, RangeIndex, isna) from pandas.core.index import _get_combined_index, _ensure_index_from_sequences from pandas.util.testing import assert_almost_equal from pandas.compat.numpy import np_datetime64_compat @@ -44,7 +44,7 @@ def setup_method(self, method): tdIndex=tm.makeTimedeltaIndex(100), intIndex=tm.makeIntIndex(100), uintIndex=tm.makeUIntIndex(100), - rangeIndex=tm.makeIntIndex(100), + rangeIndex=tm.makeRangeIndex(100), floatIndex=tm.makeFloatIndex(100), boolIndex=Index([True, False]), catIndex=tm.makeCategoricalIndex(100), @@ -57,6 +57,15 @@ def setup_method(self, method): def create_index(self): 
return Index(list('abcde')) + def generate_index_types(self, skip_index_keys=[]): + """ + Return a generator of the various index types, leaving + out the ones with a key in skip_index_keys + """ + for key, idx in self.indices.items(): + if key not in skip_index_keys: + yield key, idx + def test_new_axis(self): new_index = self.dateIndex[None, :] assert new_index.ndim == 2 @@ -406,6 +415,27 @@ def test_constructor_dtypes_timedelta(self): pd.TimedeltaIndex(list(values), dtype=dtype)]: tm.assert_index_equal(res, idx) + def test_constructor_empty(self): + skip_index_keys = ["repeats", "periodIndex", "rangeIndex", + "tuples"] + for key, idx in self.generate_index_types(skip_index_keys): + empty = idx.__class__([]) + assert isinstance(empty, idx.__class__) + assert not len(empty) + + empty = PeriodIndex([], freq='B') + assert isinstance(empty, PeriodIndex) + assert not len(empty) + + empty = RangeIndex(step=1) + assert isinstance(empty, pd.RangeIndex) + assert not len(empty) + + empty = MultiIndex(levels=[[1, 2], ['blue', 'red']], + labels=[[], []]) + assert isinstance(empty, MultiIndex) + assert not len(empty) + def test_view_with_args(self): restricted = ['unicodeIndex', 'strIndex', 'catIndex', 'boolIndex', @@ -1034,6 +1064,27 @@ def test_symmetric_difference(self): assert tm.equalContents(result, expected) assert result.name == 'new_name' + def test_difference_type(self): + # GH 20040 + # If taking difference of a set and itself, it + # needs to preserve the type of the index + skip_index_keys = ['repeats'] + for key, idx in self.generate_index_types(skip_index_keys): + result = idx.difference(idx) + expected = idx.drop(idx) + tm.assert_index_equal(result, expected) + + def test_intersection_difference(self): + # GH 20040 + # Test that the intersection of an index with an + # empty index produces the same index as the difference + # of an index with itself. Test for all types + skip_index_keys = ['repeats'] + for key, idx in self.generate_index_types(skip_index_keys): + inter = idx.intersection(idx.drop(idx)) + diff = idx.difference(idx) + tm.assert_index_equal(inter, diff) + def test_is_numeric(self): assert not self.dateIndex.is_numeric() assert not self.strIndex.is_numeric() @@ -1055,14 +1106,21 @@ def test_is_all_dates(self): assert not self.intIndex.is_all_dates def test_summary(self): - self._check_method_works(Index.summary) + self._check_method_works(Index._summary) # GH3869 ind = Index(['{other}%s', "~:{range}:0"], name='A') - result = ind.summary() + result = ind._summary() # shouldn't be formatted accidentally. 
assert '~:{range}:0' in result assert '{other}%s' in result + # GH18217 + def test_summary_deprecated(self): + ind = Index(['{other}%s', "~:{range}:0"], name='A') + + with tm.assert_produces_warning(FutureWarning): + ind.summary() + def test_format(self): self._check_method_works(Index.format) diff --git a/pandas/tests/indexes/timedeltas/test_formats.py b/pandas/tests/indexes/timedeltas/test_formats.py index a8375459d74e46..09921fac80d22f 100644 --- a/pandas/tests/indexes/timedeltas/test_formats.py +++ b/pandas/tests/indexes/timedeltas/test_formats.py @@ -92,5 +92,5 @@ def test_summary(self): for idx, expected in zip([idx1, idx2, idx3, idx4, idx5], [exp1, exp2, exp3, exp4, exp5]): - result = idx.summary() + result = idx._summary() assert result == expected diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py index 4291d59123e8bb..01c6620e50d372 100644 --- a/pandas/tests/io/parser/compression.py +++ b/pandas/tests/io/parser/compression.py @@ -12,6 +12,13 @@ import pandas.util.testing as tm import pandas.util._test_decorators as td +import gzip +import bz2 +try: + lzma = compat.import_lzma() +except ImportError: + lzma = None + class CompressionTests(object): @@ -64,83 +71,36 @@ def test_zip(self): pytest.raises(zipfile.BadZipfile, self.read_csv, f, compression='zip') - def test_gzip(self): - import gzip - - with open(self.csv1, 'rb') as data_file: - data = data_file.read() - expected = self.read_csv(self.csv1) - - with tm.ensure_clean() as path: - tmp = gzip.GzipFile(path, mode='wb') - tmp.write(data) - tmp.close() - - result = self.read_csv(path, compression='gzip') - tm.assert_frame_equal(result, expected) - - with open(path, 'rb') as f: - result = self.read_csv(f, compression='gzip') - tm.assert_frame_equal(result, expected) - - with tm.ensure_clean('test.gz') as path: - tmp = gzip.GzipFile(path, mode='wb') - tmp.write(data) - tmp.close() - result = self.read_csv(path, compression='infer') - tm.assert_frame_equal(result, expected) - - def test_bz2(self): - import bz2 + @pytest.mark.parametrize('compress_type, compress_method, ext', [ + ('gzip', gzip.GzipFile, 'gz'), + ('bz2', bz2.BZ2File, 'bz2'), + pytest.param('xz', getattr(lzma, 'LZMAFile', None), 'xz', + marks=td.skip_if_no_lzma) + ]) + def test_other_compression(self, compress_type, compress_method, ext): with open(self.csv1, 'rb') as data_file: data = data_file.read() expected = self.read_csv(self.csv1) with tm.ensure_clean() as path: - tmp = bz2.BZ2File(path, mode='wb') + tmp = compress_method(path, mode='wb') tmp.write(data) tmp.close() - result = self.read_csv(path, compression='bz2') + result = self.read_csv(path, compression=compress_type) tm.assert_frame_equal(result, expected) - pytest.raises(ValueError, self.read_csv, - path, compression='bz3') + if compress_type == 'bz2': + pytest.raises(ValueError, self.read_csv, + path, compression='bz3') with open(path, 'rb') as fin: - result = self.read_csv(fin, compression='bz2') - tm.assert_frame_equal(result, expected) - - with tm.ensure_clean('test.bz2') as path: - tmp = bz2.BZ2File(path, mode='wb') - tmp.write(data) - tmp.close() - result = self.read_csv(path, compression='infer') - tm.assert_frame_equal(result, expected) - - @td.skip_if_no_lzma - def test_xz(self): - lzma = compat.import_lzma() - - with open(self.csv1, 'rb') as data_file: - data = data_file.read() - expected = self.read_csv(self.csv1) - - with tm.ensure_clean() as path: - tmp = lzma.LZMAFile(path, mode='wb') - tmp.write(data) - tmp.close() - - result = self.read_csv(path, 
compression='xz') - tm.assert_frame_equal(result, expected) - - with open(path, 'rb') as f: - result = self.read_csv(f, compression='xz') + result = self.read_csv(fin, compression=compress_type) tm.assert_frame_equal(result, expected) - with tm.ensure_clean('test.xz') as path: - tmp = lzma.LZMAFile(path, mode='wb') + with tm.ensure_clean('test.{}'.format(ext)) as path: + tmp = compress_method(path, mode='wb') tmp.write(data) tmp.close() result = self.read_csv(path, compression='infer') diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index dbf7c7f100b0e1..3c5e7779a9e597 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1342,11 +1342,6 @@ def test_join_multi_levels2(self): .set_index(["household_id", "asset_id", "t"]) .reindex(columns=['share', 'log_return'])) - def f(): - household.join(log_return, how='inner') - - pytest.raises(NotImplementedError, f) - # this is the equivalency result = (merge(household.reset_index(), log_return.reset_index(), on=['asset_id'], how='inner') @@ -1356,7 +1351,7 @@ def f(): expected = ( DataFrame(dict( household_id=[1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4], - asset_id=["nl0000301109", "nl0000289783", "gb00b03mlx29", + asset_id=["nl0000301109", "nl0000301109", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "gb00b03mlx29", "lu0197800237", "lu0197800237", @@ -1369,12 +1364,115 @@ def f(): .09604978, -.06524096, .03532373, .03025441, .036997, None, None] )) - .set_index(["household_id", "asset_id", "t"])) + .set_index(["household_id", "asset_id", "t"]) + .reindex(columns=['share', 'log_return'])) - def f(): - household.join(log_return, how='outer') + result = (merge(household.reset_index(), log_return.reset_index(), + on=['asset_id'], how='outer') + .set_index(['household_id', 'asset_id', 't'])) + + assert_frame_equal(result, expected) + + +@pytest.fixture +def left_multi(): + return ( + DataFrame( + dict(Origin=['A', 'A', 'B', 'B', 'C'], + Destination=['A', 'B', 'A', 'C', 'A'], + Period=['AM', 'PM', 'IP', 'AM', 'OP'], + TripPurp=['hbw', 'nhb', 'hbo', 'nhb', 'hbw'], + Trips=[1987, 3647, 2470, 4296, 4444]), + columns=['Origin', 'Destination', 'Period', + 'TripPurp', 'Trips']) + .set_index(['Origin', 'Destination', 'Period', 'TripPurp'])) + + +@pytest.fixture +def right_multi(): + return ( + DataFrame( + dict(Origin=['A', 'A', 'B', 'B', 'C', 'C', 'E'], + Destination=['A', 'B', 'A', 'B', 'A', 'B', 'F'], + Period=['AM', 'PM', 'IP', 'AM', 'OP', 'IP', 'AM'], + LinkType=['a', 'a', 'c', 'b', 'a', 'b', 'a'], + Distance=[100, 80, 90, 80, 75, 35, 55]), + columns=['Origin', 'Destination', 'Period', + 'LinkType', 'Distance']) + .set_index(['Origin', 'Destination', 'Period', 'LinkType'])) + + +@pytest.fixture +def on_cols(): + return ['Origin', 'Destination', 'Period'] - pytest.raises(NotImplementedError, f) + +@pytest.fixture +def idx_cols(): + return ['Origin', 'Destination', 'Period', 'TripPurp', 'LinkType'] + + +class TestJoinMultiMulti(object): + + def test_join_multi_multi(self, left_multi, right_multi, join_type, + on_cols, idx_cols): + # Multi-index join tests + expected = (pd.merge(left_multi.reset_index(), + right_multi.reset_index(), + how=join_type, on=on_cols).set_index(idx_cols) + .sort_index()) + + result = left_multi.join(right_multi, how=join_type).sort_index() + tm.assert_frame_equal(result, expected) + + """ + def test_join_multi_multi_emptylevel(self, left_multi, right_multi, + join_type, on_cols, idx_cols): + # Join with empty 
level + num_lvls = len(right_multi.index.get_level_values('Period')) + # Set one level to None + right_multi.index.set_levels([np.nan] * num_lvls, level='Period', + inplace=True) + + expected = (pd.merge(left_multi.reset_index(), + right_multi.reset_index(), + how=how, on=on_cols).set_index(idx_cols) + .sort_index()) + + result = left_multi.join(right_multi, how=join_type).sort_index() + tm.assert_frame_equal(result, expected) + """ + + def test_join_multi_empty_frames(self, left_multi, right_multi, join_type, + on_cols, idx_cols): + + left_multi = left_multi.drop(columns=left_multi.columns) + right_multi = right_multi.drop(columns=right_multi.columns) + + expected = (pd.merge(left_multi.reset_index(), + right_multi.reset_index(), + how=join_type, on=on_cols).set_index(idx_cols) + .sort_index()) + + result = left_multi.join(right_multi, how=join_type).sort_index() + tm.assert_frame_equal(result, expected) + + def test_join_multi_multi_nonunique(self, left_multi): + # Non-unique resulting index + right_multi = ( + DataFrame( + dict(Origin=[1, 1, 2], + Destination=[1, 1, 1], + Period=['AM', 'AM', 'PM'], + LinkType=['a', 'b', 'a'], + Distance=[100, 110, 120]), + columns=['Origin', 'Destination', 'Period', + 'LinkType', 'Distance']) + .set_index(['Origin', 'Destination', 'Period', 'LinkType'])) + + def f(): + left_multi.join(right_multi, how='left') + pytest.raises(ValueError, f) @pytest.mark.parametrize("klass", [None, np.asarray, Series, Index]) def test_merge_datetime_index(self, klass): diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 3abc0f724db256..47798d0ddd7f5e 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -355,16 +355,16 @@ def test_strftime(self): datetime_index = date_range('20150301', periods=5) result = datetime_index.strftime("%Y/%m/%d") - expected = np.array(['2015/03/01', '2015/03/02', '2015/03/03', - '2015/03/04', '2015/03/05'], dtype=np.object_) + expected = Index(['2015/03/01', '2015/03/02', '2015/03/03', + '2015/03/04', '2015/03/05'], dtype=np.object_) # dtype may be S10 or U10 depending on python version - tm.assert_numpy_array_equal(result, expected, check_dtype=False) + tm.assert_index_equal(result, expected) period_index = period_range('20150301', periods=5) result = period_index.strftime("%Y/%m/%d") - expected = np.array(['2015/03/01', '2015/03/02', '2015/03/03', - '2015/03/04', '2015/03/05'], dtype='=U10') - tm.assert_numpy_array_equal(result, expected) + expected = Index(['2015/03/01', '2015/03/02', '2015/03/03', + '2015/03/04', '2015/03/05'], dtype='=U10') + tm.assert_index_equal(result, expected) s = Series([datetime(2013, 1, 1, 2, 32, 59), datetime(2013, 1, 2, 14, 32, 1)])