diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 32ffb3330564c3..b79f0f71dac23d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,17 +1,17 @@ repos: - - repo: https://github.com/python/black - rev: stable - hooks: - - id: black - language_version: python3.7 - - repo: https://gitlab.com/pycqa/flake8 - rev: 3.7.7 - hooks: - - id: flake8 - language: python_venv - additional_dependencies: [flake8-comprehensions] - - repo: https://github.com/pre-commit/mirrors-isort - rev: v4.3.20 - hooks: - - id: isort - language: python_venv +- repo: https://github.com/python/black + rev: stable + hooks: + - id: black + language_version: python3.7 +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.7.7 + hooks: + - id: flake8 + language: python_venv + additional_dependencies: [flake8-comprehensions] +- repo: https://github.com/pre-commit/mirrors-isort + rev: v4.3.20 + hooks: + - id: isort + language: python_venv diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 571ede1a211340..c04bbf53a86a6f 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -50,12 +50,13 @@ "xlsxwriter": [], "xlrd": [], "xlwt": [], + "odfpy": [], "pytest": [], // If using Windows with python 2.7 and want to build using the // mingw toolchain (rather than MSVC), uncomment the following line. // "libpython": [], }, - + "conda_channels": ["defaults", "conda-forge"], // Combinations of libraries/python versions can be excluded/included // from the set to test. Each entry is a dictionary containing additional // key-value pairs to include/exclude. diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index c43e5dfd729aad..501e27b9078ec6 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame try: @@ -32,4 +33,4 @@ def time_cache_readonly(self): self.obj.prop -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index fd3324b78f1c3d..58e0db67d60254 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame, Series, date_range from pandas.core.algorithms import checked_add_with_arr @@ -155,4 +156,4 @@ def time_add_overflow_both_arg_nan(self): ) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 8097118a79d20d..559aa7050a6407 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,7 +1,9 @@ +import warnings + import numpy as np + import pandas as pd import pandas.util.testing as tm -import warnings try: from pandas.api.types import union_categoricals @@ -280,4 +282,4 @@ def time_sort_values(self): self.index.sort_values(ascending=False) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index 654075292cdf62..ec3dd7a48a89f4 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,6 +1,7 @@ import numpy as np + +from pandas import DatetimeIndex, Index, MultiIndex, Series, Timestamp import pandas.util.testing as tm 
-from pandas import Series, Index, DatetimeIndex, Timestamp, MultiIndex def no_change(arr): @@ -113,4 +114,4 @@ def time_multiindex_from_iterables(self): MultiIndex.from_product(self.iterables) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 60800b1f9cae71..24cc1c6f9fa701 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -1,14 +1,14 @@ +import numpy as np + from pandas.api.types import pandas_dtype -import numpy as np from .pandas_vb_common import ( - numeric_dtypes, datetime_dtypes, - string_dtypes, extension_dtypes, + numeric_dtypes, + string_dtypes, ) - _numpy_dtypes = [ np.dtype(dtype) for dtype in (numeric_dtypes + datetime_dtypes + string_dtypes) ] @@ -40,4 +40,4 @@ def time_pandas_dtype_invalid(self, dtype): pass -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index 84e94315cc28b0..06a181875aaa85 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -1,4 +1,5 @@ import numpy as np + import pandas as pd try: @@ -62,4 +63,4 @@ def time_query_with_boolean_selection(self): self.df.query("(a >= @self.min_val) & (a <= @self.max_val)") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index acfb26bcf5d7ca..3944e0bc523d84 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,6 +1,7 @@ import numpy as np + +from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range import pandas.util.testing as tm -from pandas import DataFrame, Series, MultiIndex, Timestamp, date_range try: from pandas.tseries.offsets import Nano, Hour @@ -104,4 +105,4 @@ def time_frame_from_lists(self): self.df = DataFrame(self.data) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index e2f6764c76eef8..05f98c66faa2b8 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,5 +1,5 @@ -import warnings import string +import warnings import numpy as np @@ -609,4 +609,4 @@ def time_dataframe_describe(self): self.df.describe() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 0d0b75561d057a..d57492dd372680 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,7 +1,8 @@ import numpy as np -import pandas.util.testing as tm -from pandas import DataFrame, Series, read_csv, factorize, date_range + +from pandas import DataFrame, Series, date_range, factorize, read_csv from pandas.core.algorithms import take_1d +import pandas.util.testing as tm try: from pandas import ( @@ -36,7 +37,7 @@ def wrapper(fname): return wrapper -from .pandas_vb_common import BaseIO +from .pandas_vb_common import BaseIO # noqa: E402 isort:skip class ParallelGroupbyMethods: @@ -301,4 +302,4 @@ def time_loop(self, threads): self.loop() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git 
a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 39b07d4734399e..d51c53e2264f1a 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -15,7 +15,6 @@ ) import pandas.util.testing as tm - method_blacklist = { "object": { "median", @@ -626,4 +625,4 @@ def time_first(self): self.df_nans.groupby("key").transform("first") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 49834ae94cc387..a94960d4947077 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -1,15 +1,17 @@ import gc + import numpy as np -import pandas.util.testing as tm + from pandas import ( - Series, - date_range, DatetimeIndex, - Index, - RangeIndex, Float64Index, + Index, IntervalIndex, + RangeIndex, + Series, + date_range, ) +import pandas.util.testing as tm class SetOperations: @@ -243,4 +245,4 @@ def peakmem_gc_instances(self, N): gc.enable() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 84604b8196536b..ac35139c1954ab 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -1,22 +1,23 @@ import warnings import numpy as np -import pandas.util.testing as tm + from pandas import ( - Series, + CategoricalIndex, DataFrame, - MultiIndex, - Int64Index, - UInt64Index, Float64Index, - IntervalIndex, - CategoricalIndex, IndexSlice, + Int64Index, + IntervalIndex, + MultiIndex, + Series, + UInt64Index, concat, date_range, option_context, period_range, ) +import pandas.util.testing as tm class NumericSeriesIndexing: @@ -371,4 +372,4 @@ def time_chained_indexing(self, mode): df2["C"] = 1.0 -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 66ef4f2aec380c..e85b3bd2c76879 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,8 +1,9 @@ import numpy as np -import pandas.util.testing as tm + from pandas import DataFrame, Series, to_numeric +import pandas.util.testing as tm -from .pandas_vb_common import numeric_dtypes, lib +from .pandas_vb_common import lib, numeric_dtypes class NumericInferOps: @@ -120,4 +121,4 @@ def time_convert(self, data): lib.maybe_convert_numeric(data, set(), coerce_numeric=False) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 4525e504fc4dd5..9b8599b0a1b64a 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -1,10 +1,11 @@ +from io import StringIO import random import string import numpy as np + +from pandas import Categorical, DataFrame, date_range, read_csv, to_datetime import pandas.util.testing as tm -from pandas import DataFrame, Categorical, date_range, read_csv, to_datetime -from io import StringIO from ..pandas_vb_common import BaseIO @@ -406,4 +407,4 @@ def time_to_datetime_format_DD_MM_YYYY(self, cache_dates): to_datetime(df["date"], cache=cache_dates, format="%d-%m-%Y") -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git 
a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 12e70f84e52038..c97cf768e27d97 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -1,38 +1,72 @@ from io import BytesIO + import numpy as np -from pandas import DataFrame, date_range, ExcelWriter, read_excel +from odf.opendocument import OpenDocumentSpreadsheet +from odf.table import Table, TableCell, TableRow +from odf.text import P + +from pandas import DataFrame, ExcelWriter, date_range, read_excel import pandas.util.testing as tm -class Excel: +def _generate_dataframe(): + N = 2000 + C = 5 + df = DataFrame( + np.random.randn(N, C), + columns=["float{}".format(i) for i in range(C)], + index=date_range("20000101", periods=N, freq="H"), + ) + df["object"] = tm.makeStringIndex(N) + return df + + +class WriteExcel: params = ["openpyxl", "xlsxwriter", "xlwt"] param_names = ["engine"] def setup(self, engine): - N = 2000 - C = 5 - self.df = DataFrame( - np.random.randn(N, C), - columns=["float{}".format(i) for i in range(C)], - index=date_range("20000101", periods=N, freq="H"), - ) - self.df["object"] = tm.makeStringIndex(N) - self.bio_read = BytesIO() - self.writer_read = ExcelWriter(self.bio_read, engine=engine) - self.df.to_excel(self.writer_read, sheet_name="Sheet1") - self.writer_read.save() - self.bio_read.seek(0) - - def time_read_excel(self, engine): - read_excel(self.bio_read) + self.df = _generate_dataframe() def time_write_excel(self, engine): - bio_write = BytesIO() - bio_write.seek(0) - writer_write = ExcelWriter(bio_write, engine=engine) - self.df.to_excel(writer_write, sheet_name="Sheet1") - writer_write.save() + bio = BytesIO() + bio.seek(0) + writer = ExcelWriter(bio, engine=engine) + self.df.to_excel(writer, sheet_name="Sheet1") + writer.save() + + +class ReadExcel: + + params = ["xlrd", "openpyxl", "odf"] + param_names = ["engine"] + fname_excel = "spreadsheet.xlsx" + fname_odf = "spreadsheet.ods" + + def _create_odf(self): + doc = OpenDocumentSpreadsheet() + table = Table(name="Table1") + for row in self.df.values: + tr = TableRow() + for val in row: + tc = TableCell(valuetype="string") + tc.addElement(P(text=val)) + tr.addElement(tc) + table.addElement(tr) + + doc.spreadsheet.addElement(table) + doc.save(self.fname_odf) + + def setup_cache(self): + self.df = _generate_dataframe() + + self.df.to_excel(self.fname_excel, sheet_name="Sheet1") + self._create_odf() + + def time_read_excel(self, engine): + fname = self.fname_odf if engine == "odf" else self.fname_excel + read_excel(fname, engine=engine) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index 2874a7889156bf..8ec04a2087f1b7 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -1,5 +1,6 @@ import numpy as np -from pandas import DataFrame, date_range, HDFStore, read_hdf + +from pandas import DataFrame, HDFStore, date_range, read_hdf import pandas.util.testing as tm from ..pandas_vb_common import BaseIO @@ -127,4 +128,4 @@ def time_write_hdf(self, format): self.df.to_hdf(self.fname, "df", format=format) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index fc07f2a4841025..5c1d39776b91c9 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -1,6 +1,7 @@ import numpy 
as np + +from pandas import DataFrame, concat, date_range, read_json, timedelta_range import pandas.util.testing as tm -from pandas import DataFrame, date_range, timedelta_range, concat, read_json from ..pandas_vb_common import BaseIO @@ -117,7 +118,7 @@ def setup(self, orient, frame): def time_to_json(self, orient, frame): getattr(self, frame).to_json(self.fname, orient=orient) - def mem_to_json(self, orient, frame): + def peakmem_to_json(self, orient, frame): getattr(self, frame).to_json(self.fname, orient=orient) def time_to_json_wide(self, orient, frame): @@ -125,7 +126,7 @@ def time_to_json_wide(self, orient, frame): df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1) df.to_json(self.fname, orient=orient) - def mem_to_json_wide(self, orient, frame): + def peakmem_to_json_wide(self, orient, frame): base_df = getattr(self, frame).copy() df = concat([base_df.iloc[:100]] * 1000, ignore_index=True, axis=1) df.to_json(self.fname, orient=orient) @@ -214,4 +215,4 @@ def peakmem_float(self, frames): df.to_json() -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py index d97b4ae13f0bd5..f5038602539ab6 100644 --- a/asv_bench/benchmarks/io/msgpack.py +++ b/asv_bench/benchmarks/io/msgpack.py @@ -1,5 +1,7 @@ import warnings + import numpy as np + from pandas import DataFrame, date_range, read_msgpack import pandas.util.testing as tm @@ -27,4 +29,4 @@ def time_write_msgpack(self): self.df.to_msgpack(self.fname) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 286ac767c02e7e..647e9d27dec9d3 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame, date_range, read_pickle import pandas.util.testing as tm @@ -25,4 +26,4 @@ def time_write_pickle(self): self.df.to_pickle(self.fname) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index b80872b17a9e4a..fe84c869717e38 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -1,10 +1,11 @@ import sqlite3 import numpy as np -import pandas.util.testing as tm -from pandas import DataFrame, date_range, read_sql_query, read_sql_table from sqlalchemy import create_engine +from pandas import DataFrame, date_range, read_sql_query, read_sql_table +import pandas.util.testing as tm + class SQL: @@ -141,4 +142,4 @@ def time_read_sql_table_column(self, dtype): read_sql_table(self.table_name, self.con, columns=[dtype]) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index b3ed71af47dc8b..28829785d72e92 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -1,4 +1,5 @@ import numpy as np + from pandas import DataFrame, date_range, read_stata import pandas.util.testing as tm @@ -50,4 +51,4 @@ def setup(self, convert_dates): self.df.to_stata(self.fname, self.convert_dates) -from ..pandas_vb_common import setup # noqa: F401 +from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git 
a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 7c899e3dc6ac8a..6aa82a43a4d6a0 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -1,8 +1,9 @@ import string import numpy as np + +from pandas import DataFrame, MultiIndex, Series, concat, date_range, merge, merge_asof import pandas.util.testing as tm -from pandas import DataFrame, Series, MultiIndex, date_range, concat, merge, merge_asof try: from pandas import merge_ordered @@ -348,4 +349,4 @@ def time_series_align_left_monotonic(self): self.ts1.align(self.ts2, join="left") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index eda059a68e8a58..3f4fd7ad911c1e 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -1,8 +1,9 @@ import string import numpy as np + +from pandas import DataFrame, MultiIndex, date_range import pandas.util.testing as tm -from pandas import date_range, MultiIndex, DataFrame class GetLoc: @@ -146,4 +147,4 @@ def time_categorical_level(self): self.df.set_index(["a", "b"]) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py index 31c3b6fb6cb60a..d822646e712ae5 100644 --- a/asv_bench/benchmarks/offset.py +++ b/asv_bench/benchmarks/offset.py @@ -1,7 +1,8 @@ -import warnings from datetime import datetime +import warnings import numpy as np + import pandas as pd try: diff --git a/asv_bench/benchmarks/package.py b/asv_bench/benchmarks/package.py new file mode 100644 index 00000000000000..8ca33db361fa07 --- /dev/null +++ b/asv_bench/benchmarks/package.py @@ -0,0 +1,25 @@ +""" +Benchmarks for pandas at the package-level. 
+""" +import subprocess +import sys + +from pandas.compat import PY37 + + +class TimeImport: + def time_import(self): + if PY37: + # on py37+ the "-X importtime" usage gives us a more precise + # measurement of the import time we actually care about, + # without the subprocess or interpreter overhead + cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"] + p = subprocess.run(cmd, stderr=subprocess.PIPE) + + line = p.stderr.splitlines()[-1] + field = line.split(b"|")[-2].strip() + total = int(field) # microseconds + return total + + cmd = [sys.executable, "-c", "import pandas as pd"] + subprocess.run(cmd, stderr=subprocess.PIPE) diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index fdc8207021c0f3..1faf13329110d6 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -1,7 +1,8 @@ -import os from importlib import import_module +import os import numpy as np + import pandas as pd # Compatibility import for lib diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 2f8ae0650ab751..7303240a25f292 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,4 +1,5 @@ from pandas import DataFrame, Period, PeriodIndex, Series, date_range, period_range + from pandas.tseries.frequencies import to_offset diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index 4fb0876f05a0a0..5c718516360ed2 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -1,11 +1,12 @@ +import matplotlib import numpy as np -from pandas import DataFrame, Series, DatetimeIndex, date_range + +from pandas import DataFrame, DatetimeIndex, Series, date_range try: from pandas.plotting import andrews_curves except ImportError: from pandas.tools.plotting import andrews_curves -import matplotlib matplotlib.use("Agg") @@ -93,4 +94,4 @@ def time_plot_andrews_curves(self): andrews_curves(self.df, "Name") -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 8d4c9ebaf3e891..cd450f801c8052 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,6 +1,8 @@ import numpy as np + +from pandas import DataFrame, Index, MultiIndex, Series, date_range, period_range import pandas.util.testing as tm -from pandas import DataFrame, Series, MultiIndex, Index, date_range, period_range + from .pandas_vb_common import lib @@ -159,4 +161,4 @@ def time_lib_fast_zip(self): lib.fast_zip(self.col_array_list) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 6137e944e6b9e3..2a115fb0b4fe33 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -1,4 +1,5 @@ import numpy as np + import pandas as pd @@ -36,6 +37,23 @@ def time_replace_series(self, inplace): self.s.replace(self.to_rep, inplace=inplace) +class ReplaceList: + # GH#28099 + + params = [(True, False)] + param_names = ["inplace"] + + def setup(self, inplace): + self.df = pd.DataFrame({"A": 0, "B": 0}, index=range(4 * 10 ** 7)) + + def time_replace_list(self, inplace): + self.df.replace([np.inf, -np.inf], np.nan, inplace=inplace) + + def time_replace_list_one_match(self, inplace): + # the 1 can be held in self._df.blocks[0], while
the inf and -inf can't + self.df.replace([np.inf, -np.inf, 1], np.nan, inplace=inplace) + + class Convert: params = (["DataFrame", "Series"], ["Timestamp", "Timedelta"]) @@ -56,4 +74,4 @@ def time_replace(self, constructor, replace_data): self.data.replace(self.to_replace) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index cc373f413fb885..441f4b380656ec 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -1,9 +1,10 @@ -import string from itertools import product +import string import numpy as np -from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long + import pandas as pd +from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long class Melt: @@ -262,4 +263,4 @@ def time_explode(self, n_rows, max_list_length): self.series.explode() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index a70977fcf539f7..b42fa553b495ce 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -1,6 +1,7 @@ -import pandas as pd import numpy as np +import pandas as pd + class Methods: @@ -20,6 +21,9 @@ def setup(self, constructor, window, dtype, method): def time_rolling(self, constructor, window, dtype, method): getattr(self.roll, method)() + def peakmem_rolling(self, constructor, window, dtype, method): + getattr(self.roll, method)() + class ExpandingMethods: @@ -121,4 +125,4 @@ def peakmem_fixed(self): self.roll.max() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 6038a2ab4bd9f9..a3f1d92545c3f2 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -1,8 +1,9 @@ from datetime import datetime import numpy as np + +from pandas import NaT, Series, date_range import pandas.util.testing as tm -from pandas import Series, date_range, NaT class SeriesConstructor: @@ -275,4 +276,4 @@ def time_func(self, func, N, dtype): self.func() -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 19d08c086a508a..ac78ca53679fd6 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -136,4 +136,4 @@ def time_division(self, fill_value): self.arr1 / self.arr2 -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 620a6de0f5f341..ed5ebfa61594ec 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -1,6 +1,6 @@ import numpy as np -import pandas as pd +import pandas as pd ops = ["mean", "sum", "median", "std", "skew", "kurt", "mad", "prod", "sem", "var"] @@ -113,12 +113,23 @@ def setup(self, method, use_bottleneck): nanops._USE_BOTTLENECK = use_bottleneck self.df = pd.DataFrame(np.random.randn(1000, 30)) self.df2 = pd.DataFrame(np.random.randn(1000, 30)) + self.df_wide = pd.DataFrame(np.random.randn(1000, 200)) + self.df_wide_nans = self.df_wide.where(np.random.random((1000, 200)) < 0.9) self.s =
pd.Series(np.random.randn(1000)) self.s2 = pd.Series(np.random.randn(1000)) def time_corr(self, method, use_bottleneck): self.df.corr(method=method) + def time_corr_wide(self, method, use_bottleneck): + self.df_wide.corr(method=method) + + def time_corr_wide_nans(self, method, use_bottleneck): + self.df_wide_nans.corr(method=method) + + def peakmem_corr_wide(self, method, use_bottleneck): + self.df_wide.corr(method=method) + def time_corr_series(self, method, use_bottleneck): self.s.corr(self.s2, method=method) @@ -148,4 +159,4 @@ def time_cov_series(self, use_bottleneck): self.s.cov(self.s2) -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 6be2fa92d9eac3..f30b2482615bd2 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,7 +1,8 @@ import warnings import numpy as np -from pandas import Series, DataFrame + +from pandas import DataFrame, Series import pandas.util.testing as tm diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 1020b773f8acbb..498774034d6422 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -2,7 +2,9 @@ import dateutil import numpy as np -from pandas import to_datetime, date_range, Series, DataFrame, period_range + +from pandas import DataFrame, Series, date_range, period_range, to_datetime + from pandas.tseries.frequencies import infer_freq try: @@ -426,4 +428,4 @@ def time_dt_accessor_year(self, tz): self.series.dt.year -from .pandas_vb_common import setup # noqa: F401 +from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 333136ddfddd95..f839d86318e2ec 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -203,10 +203,14 @@ if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then import sys import pandas -blacklist = {'bs4', 'gcsfs', 'html5lib', 'ipython', 'jinja2' 'hypothesis', +blacklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2', 'hypothesis', 'lxml', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy', - 'tables', 'xlrd', 'xlsxwriter', 'xlwt'} -mods = blacklist & set(m.split('.')[0] for m in sys.modules) + 'tables', 'urllib.request', 'xlrd', 'xlsxwriter', 'xlwt'} + +# GH#28227 for some of these check for top-level modules, while others are +# more specific (e.g. 
urllib.request) +import_mods = set(m.split('.')[0] for m in sys.modules) | set(sys.modules) +mods = blacklist & import_mods if mods: sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods))) sys.exit(len(mods)) diff --git a/ci/print_skipped.py b/ci/print_skipped.py index a44281044e11d0..6bc1dcfcd320dd 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -1,8 +1,8 @@ #!/usr/bin/env python +import math import os import sys -import math import xml.etree.ElementTree as et diff --git a/doc/logo/pandas_logo.py b/doc/logo/pandas_logo.py index 5a07b094e6ad35..89410e3847bef9 100644 --- a/doc/logo/pandas_logo.py +++ b/doc/logo/pandas_logo.py @@ -1,7 +1,6 @@ # script to generate the pandas logo -from matplotlib import pyplot as plt -from matplotlib import rcParams +from matplotlib import pyplot as plt, rcParams import numpy as np rcParams["mathtext.fontset"] = "cm" diff --git a/doc/make.py b/doc/make.py index 48febef20fbe66..cbb1fa6a5324aa 100755 --- a/doc/make.py +++ b/doc/make.py @@ -11,18 +11,18 @@ $ python make.py html $ python make.py latex """ +import argparse +import csv import importlib -import sys import os import shutil -import csv import subprocess -import argparse +import sys import webbrowser + import docutils import docutils.parsers.rst - DOC_PATH = os.path.dirname(os.path.abspath(__file__)) SOURCE_PATH = os.path.join(DOC_PATH, "source") BUILD_PATH = os.path.join(DOC_PATH, "build") diff --git a/doc/source/conf.py b/doc/source/conf.py index 3ebc5d8b6333b2..1da1948e452688 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -10,15 +10,15 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys -import os -import inspect import importlib +import inspect import logging +import os +import sys + import jinja2 -from sphinx.ext.autosummary import _import_by_name from numpydoc.docscrape import NumpyDocString - +from sphinx.ext.autosummary import _import_by_name logger = logging.getLogger(__name__) @@ -141,7 +141,7 @@ # built documents. # # The short X.Y version. -import pandas +import pandas # noqa: E402 isort:skip # version = '%s r%s' % (pandas.__version__, svn_version()) version = str(pandas.__version__) @@ -315,7 +315,6 @@ import numpy as np import pandas as pd - randn = np.random.randn np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) pd.options.display.max_rows = 15 @@ -433,10 +432,14 @@ # Add custom Documenter to handle attributes/methods of an AccessorProperty # eg pandas.Series.str and pandas.Series.dt (see GH9322) -import sphinx -from sphinx.util import rpartition -from sphinx.ext.autodoc import Documenter, MethodDocumenter, AttributeDocumenter -from sphinx.ext.autosummary import Autosummary +import sphinx # noqa: E402 isort:skip +from sphinx.util import rpartition # noqa: E402 isort:skip +from sphinx.ext.autodoc import ( # noqa: E402 isort:skip + AttributeDocumenter, + Documenter, + MethodDocumenter, +) +from sphinx.ext.autosummary import Autosummary # noqa: E402 isort:skip class AccessorDocumenter(MethodDocumenter): diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index b38f7767ae0733..be6555b2ab9368 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -699,6 +699,136 @@ You'll also need to See :ref:`contributing.warnings` for more. +.. _contributing.type_hints: + +Type Hints +---------- + +*pandas* strongly encourages the use of :pep:`484` style type hints. 
New development should contain type hints, and pull requests to annotate existing code are accepted as well! + +Style Guidelines +~~~~~~~~~~~~~~~~ + +Type imports should follow the ``from typing import ...`` convention. So rather than + +.. code-block:: python + + import typing + + primes = [] # type: typing.List[int] + +You should write + +.. code-block:: python + + from typing import List, Optional, Union + + primes = [] # type: List[int] + +``Optional`` should be used where applicable, so instead of + +.. code-block:: python + + maybe_primes = [] # type: List[Union[int, None]] + +You should write + +.. code-block:: python + + maybe_primes = [] # type: List[Optional[int]] + +In some cases in the code base, classes may define class variables that shadow builtins. This causes an issue as described in `Mypy 1775 `_. The defensive solution here is to create an unambiguous alias of the builtin and use that within your annotation. For example, if you come across a definition like + +.. code-block:: python + + class SomeClass1: + str = None + +The appropriate way to annotate this would be as follows + +.. code-block:: python + + str_type = str + + class SomeClass2: + str = None # type: str_type + +In some cases you may be tempted to use ``cast`` from the typing module when you know better than the analyzer. This occurs particularly when using custom inference functions. For example + +.. code-block:: python + + from typing import cast + + from pandas.core.dtypes.common import is_number + + def cannot_infer_bad(obj: Union[str, int, float]): + + if is_number(obj): + ... + else: # Reasonably only str objects would reach this but... + obj = cast(str, obj) # Mypy complains without this! + return obj.upper() + +The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types, mypy cannot make that same inference just yet (see `mypy #5206 `_). While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable, a refactor of the code to appease static analysis is preferable + +.. code-block:: python + + def cannot_infer_good(obj: Union[str, int, float]): + + if isinstance(obj, str): + return obj.upper() + else: + ... + +With custom types and inference this is not always possible, so exceptions are made, but every effort should be exhausted to avoid ``cast`` before going down such paths. + +Syntax Requirements +~~~~~~~~~~~~~~~~~~~ + +Because *pandas* still supports Python 3.5, :pep:`526` does not apply and variables **must** be annotated with type comments. Specifically, this is a valid annotation within pandas: + +.. code-block:: python + + primes = [] # type: List[int] + +Whereas this is **NOT** allowed: + +.. code-block:: python + + primes: List[int] = [] # not supported in Python 3.5! + +Note that function signatures can always be annotated per :pep:`3107`: + +.. code-block:: python + + def sum_of_primes(primes: List[int] = []) -> int: + ... + + +Pandas-specific Types +~~~~~~~~~~~~~~~~~~~~~ + +Commonly used types specific to *pandas* will appear in `pandas._typing `_ and you should use these where applicable. This module is private for now but ultimately this should be exposed to third party libraries who want to implement type checking against pandas. + +For example, quite a few functions in *pandas* accept a ``dtype`` argument. This can be expressed as a string like ``"object"``, a ``numpy.dtype`` like ``np.int64`` or even a pandas ``ExtensionDtype`` like ``pd.CategoricalDtype``.
Rather than burden the user with having to constantly annotate all of those options, this can simply be imported and reused from the pandas._typing module + +.. code-block:: python + + from pandas._typing import Dtype + + def as_type(dtype: Dtype) -> ...: + ... + +This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc... and can also hold aliases for commonly appearing parameters like `axis`. Development of this module is active so be sure to refer to the source for the most up to date list of available types. + +Validating Type Hints +~~~~~~~~~~~~~~~~~~~~~ + +*pandas* uses `mypy `_ to statically analyze the code base and type hints. After making any change you can ensure your type hints are correct by running + +.. code-block:: shell + + mypy pandas .. _contributing.ci: diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst index 9045e5b32c29fe..41520795bde62e 100644 --- a/doc/source/getting_started/10min.rst +++ b/doc/source/getting_started/10min.rst @@ -278,7 +278,7 @@ Using a single column's values to select data. .. ipython:: python - df[df.A > 0] + df[df['A'] > 0] Selecting values from a DataFrame where a boolean condition is met. diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index 3f6f56376861fd..802ffadf2a81ef 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -926,7 +926,7 @@ Single aggregations on a ``Series`` this will return a scalar value: .. ipython:: python - tsdf.A.agg('sum') + tsdf['A'].agg('sum') Aggregating with multiple functions @@ -950,13 +950,13 @@ On a ``Series``, multiple functions return a ``Series``, indexed by the function .. ipython:: python - tsdf.A.agg(['sum', 'mean']) + tsdf['A'].agg(['sum', 'mean']) Passing a ``lambda`` function will yield a ```` named row: .. ipython:: python - tsdf.A.agg(['sum', lambda x: x.mean()]) + tsdf['A'].agg(['sum', lambda x: x.mean()]) Passing a named function will yield that name for the row: @@ -965,7 +965,7 @@ Passing a named function will yield that name for the row: def mymean(x): return x.mean() - tsdf.A.agg(['sum', mymean]) + tsdf['A'].agg(['sum', mymean]) Aggregating with a dict +++++++++++++++++++++++ @@ -1065,7 +1065,7 @@ Passing a single function to ``.transform()`` with a ``Series`` will yield a sin .. ipython:: python - tsdf.A.transform(np.abs) + tsdf['A'].transform(np.abs) Transform with multiple functions @@ -1084,7 +1084,7 @@ resulting column names will be the transforming functions. .. 
ipython:: python - tsdf.A.transform([np.abs, lambda x: x + 1]) + tsdf['A'].transform([np.abs, lambda x: x + 1]) Transforming with a dict diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index 444e886bc951d2..f67f46fc2b29ba 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -81,7 +81,7 @@ R pandas =========================================== =========================================== ``select(df, col_one = col1)`` ``df.rename(columns={'col1': 'col_one'})['col_one']`` ``rename(df, col_one = col1)`` ``df.rename(columns={'col1': 'col_one'})`` -``mutate(df, c=a-b)`` ``df.assign(c=df.a-df.b)`` +``mutate(df, c=a-b)`` ``df.assign(c=df['a']-df['b'])`` =========================================== =========================================== @@ -258,8 +258,8 @@ index/slice as well as standard boolean indexing: df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) df.query('a <= b') - df[df.a <= df.b] - df.loc[df.a <= df.b] + df[df['a'] <= df['b']] + df.loc[df['a'] <= df['b']] For more details and examples see :ref:`the query documentation `. @@ -284,7 +284,7 @@ In ``pandas`` the equivalent expression, using the df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) df.eval('a + b') - df.a + df.b # same as the previous expression + df['a'] + df['b'] # same as the previous expression In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than evaluation in pure Python. For more details and examples see :ref:`the eval diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index 366fdd546f58b5..6a03c06de3699a 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -49,6 +49,20 @@ With pandas, column selection is done by passing a list of column names to your Calling the DataFrame without the list of column names would display all columns (akin to SQL's ``*``). +In SQL, you can add a calculated column: + +.. code-block:: sql + + SELECT *, tip/total_bill as tip_rate + FROM tips + LIMIT 5; + +With pandas, you can use the :meth:`DataFrame.assign` method of a DataFrame to append a new column: + +.. ipython:: python + + tips.assign(tip_rate=tips['tip'] / tips['total_bill']).head(5) + WHERE ----- Filtering in SQL is done via a WHERE clause. diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index b57ce83cfc33c9..f5669626aa2b31 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -39,7 +39,7 @@ See the :ref:`overview` for more detail about what's in the library. :hidden: {% endif %} {% if not single_doc %} - What's New in 0.25.0 + What's New in 1.0.0 install getting_started/index user_guide/index @@ -53,7 +53,7 @@ See the :ref:`overview` for more detail about what's in the library. whatsnew/index {% endif %} -* :doc:`whatsnew/v0.25.0` +* :doc:`whatsnew/v1.0.0` * :doc:`install` * :doc:`getting_started/index` diff --git a/doc/source/reference/plotting.rst b/doc/source/reference/plotting.rst index 7615e1d20f5e27..95657dfa5fde5b 100644 --- a/doc/source/reference/plotting.rst +++ b/doc/source/reference/plotting.rst @@ -13,10 +13,14 @@ The following functions are contained in the `pandas.plotting` module. 
:toctree: api/ andrews_curves + autocorrelation_plot bootstrap_plot + boxplot deregister_matplotlib_converters lag_plot parallel_coordinates + plot_params radviz register_matplotlib_converters scatter_matrix + table diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 22a9791ffde30e..62a9b6396404a7 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -738,7 +738,7 @@ and allows efficient indexing and storage of an index with a large number of dup df['B'] = df['B'].astype(CategoricalDtype(list('cab'))) df df.dtypes - df.B.cat.categories + df['B'].cat.categories Setting the index will create a ``CategoricalIndex``. diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 15af5208a4f1f3..c9d3bc3a28c704 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -592,8 +592,8 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python df = pd.DataFrame([0, 1, 0, 1, 1, 1, 0, 1, 1], columns=['A']) - df.A.groupby((df.A != df.A.shift()).cumsum()).groups - df.A.groupby((df.A != df.A.shift()).cumsum()).cumsum() + df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).groups + df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).cumsum() Expanding data ************** @@ -719,7 +719,7 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc df def gm(df, const): - v = ((((df.A + df.B) + 1).cumprod()) - 1) * const + v = ((((df['A'] + df['B']) + 1).cumprod()) - 1) * const return v.iloc[-1] s = pd.Series({df.index[i]: gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index a4eefadd54d8c4..2df5b9d82dcc37 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -393,15 +393,15 @@ Consider the following toy example of doubling each observation: .. code-block:: ipython # Custom function without numba - In [5]: %timeit df['col1_doubled'] = df.a.apply(double_every_value_nonumba) # noqa E501 + In [5]: %timeit df['col1_doubled'] = df['a'].apply(double_every_value_nonumba) # noqa E501 1000 loops, best of 3: 797 us per loop # Standard implementation (faster than a custom function) - In [6]: %timeit df['col1_doubled'] = df.a * 2 + In [6]: %timeit df['col1_doubled'] = df['a'] * 2 1000 loops, best of 3: 233 us per loop # Custom function with numba - In [7]: %timeit (df['col1_doubled'] = double_every_value_withnumba(df.a.to_numpy()) + In [7]: %timeit (df['col1_doubled'] = double_every_value_withnumba(df['a'].to_numpy()) 1000 loops, best of 3: 145 us per loop Caveats @@ -643,8 +643,8 @@ The equivalent in standard Python would be .. ipython:: python df = pd.DataFrame(dict(a=range(5), b=range(5, 10))) - df['c'] = df.a + df.b - df['d'] = df.a + df.b + df.c + df['c'] = df['a'] + df['b'] + df['d'] = df['a'] + df['b'] + df['c'] df['a'] = 1 df @@ -688,7 +688,7 @@ name in an expression. a = np.random.randn() df.query('@a < a') - df.loc[a < df.a] # same as the previous expression + df.loc[a < df['a']] # same as the previous expression With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it isn't defined in that context. 
``pandas`` will let you know this if you try to diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index e3b75afcf945e2..cf55ce0c9a6d4e 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -210,7 +210,7 @@ as an attribute: See `here for an explanation of valid identifiers `__. - - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed. + - The attribute will not be available if it conflicts with an existing method name, e.g. ``s.min`` is not allowed, but ``s['min']`` is possible. - Similarly, the attribute will not be available if it conflicts with any of the following list: ``index``, ``major_axis``, ``minor_axis``, ``items``. @@ -540,7 +540,7 @@ The ``callable`` must be a function with one argument (the calling Series or Dat columns=list('ABCD')) df1 - df1.loc[lambda df: df.A > 0, :] + df1.loc[lambda df: df['A'] > 0, :] df1.loc[:, lambda df: ['A', 'B']] df1.iloc[:, lambda df: [0, 1]] @@ -552,7 +552,7 @@ You can use callable indexing in ``Series``. .. ipython:: python - df1.A.loc[lambda s: s > 0] + df1['A'].loc[lambda s: s > 0] Using these methods / indexers, you can chain data selection operations without using a temporary variable. @@ -561,7 +561,7 @@ without using a temporary variable. bb = pd.read_csv('data/baseball.csv', index_col='id') (bb.groupby(['year', 'team']).sum() - .loc[lambda df: df.r > 100]) + .loc[lambda df: df['r'] > 100]) .. _indexing.deprecate_ix: @@ -871,9 +871,9 @@ Boolean indexing Another common operation is the use of boolean vectors to filter the data. The operators are: ``|`` for ``or``, ``&`` for ``and``, and ``~`` for ``not``. These **must** be grouped by using parentheses, since by default Python will -evaluate an expression such as ``df.A > 2 & df.B < 3`` as -``df.A > (2 & df.B) < 3``, while the desired evaluation order is -``(df.A > 2) & (df.B < 3)``. +evaluate an expression such as ``df['A'] > 2 & df['B'] < 3`` as +``df['A'] > (2 & df['B']) < 3``, while the desired evaluation order is +``(df['A'] > 2) & (df['B'] < 3)``. Using a boolean vector to index a Series works exactly as in a NumPy ndarray: @@ -1134,7 +1134,7 @@ between the values of columns ``a`` and ``c``. For example: df # pure python - df[(df.a < df.b) & (df.b < df.c)] + df[(df['a'] < df['b']) & (df['b'] < df['c'])] # query df.query('(a < b) & (b < c)') @@ -1241,7 +1241,7 @@ Full numpy-like syntax: df = pd.DataFrame(np.random.randint(n, size=(n, 3)), columns=list('abc')) df df.query('(a < b) & (b < c)') - df[(df.a < df.b) & (df.b < df.c)] + df[(df['a'] < df['b']) & (df['b'] < df['c'])] Slightly nicer by removing the parentheses (by binding making comparison operators bind tighter than ``&`` and ``|``). @@ -1279,12 +1279,12 @@ The ``in`` and ``not in`` operators df.query('a in b') # How you'd do it in pure Python - df[df.a.isin(df.b)] + df[df['a'].isin(df['b'])] df.query('a not in b') # pure Python - df[~df.a.isin(df.b)] + df[~df['a'].isin(df['b'])] You can combine this with other expressions for very succinct queries: @@ -1297,7 +1297,7 @@ You can combine this with other expressions for very succinct queries: df.query('a in b and c < d') # pure Python - df[df.b.isin(df.a) & (df.c < df.d)] + df[df['b'].isin(df['a']) & (df['c'] < df['d'])] .. note:: @@ -1326,7 +1326,7 @@ to ``in``/``not in``. df.query('b == ["a", "b", "c"]') # pure Python - df[df.b.isin(["a", "b", "c"])] + df[df['b'].isin(["a", "b", "c"])] df.query('c == [1, 2]') @@ -1338,7 +1338,7 @@
df.query('[1, 2] not in c') # pure Python - df[df.c.isin([1, 2])] + df[df['c'].isin([1, 2])] Boolean operators @@ -1352,7 +1352,7 @@ You can negate boolean expressions with the word ``not`` or the ``~`` operator. df['bools'] = np.random.rand(len(df)) > 0.5 df.query('~bools') df.query('not bools') - df.query('not bools') == df[~df.bools] + df.query('not bools') == df[~df['bools']] Of course, expressions can be arbitrarily complex too: @@ -1362,7 +1362,10 @@ Of course, expressions can be arbitrarily complex too: shorter = df.query('a < b < c and (not bools) or bools > 2') # equivalent in pure Python - longer = df[(df.a < df.b) & (df.b < df.c) & (~df.bools) | (df.bools > 2)] + longer = df[(df['a'] < df['b']) + & (df['b'] < df['c']) + & (~df['bools']) + | (df['bools'] > 2)] shorter longer @@ -1835,14 +1838,14 @@ chained indexing expression, you can set the :ref:`option ` # This will show the SettingWithCopyWarning # but the frame values will be set - dfb['c'][dfb.a.str.startswith('o')] = 42 + dfb['c'][dfb['a'].str.startswith('o')] = 42 This however is operating on a copy and will not work. :: >>> pd.set_option('mode.chained_assignment','warn') - >>> dfb[dfb.a.str.startswith('o')]['c'] = 42 + >>> dfb[dfb['a'].str.startswith('o')]['c'] = 42 Traceback (most recent call last) ... SettingWithCopyWarning: diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 1d49dbdee9c03a..f6b0c55d39f65d 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3206,7 +3206,7 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: writer = pd.ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') # Or via pandas configuration. - from pandas import options # noqa: E402 + from pandas import options # noqa: E402 options.io.excel.xlsx.writer = 'xlsxwriter' df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') @@ -5047,6 +5047,17 @@ Example of a callable using PostgreSQL `COPY clause from io import StringIO def psql_insert_copy(table, conn, keys, data_iter): + """ + Execute SQL statement inserting data + + Parameters + ---------- + table : pandas.io.sql.SQLTable + conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection + keys : list of str + Column names + data_iter : Iterable that iterates the values to be inserted + """ # gets a DBAPI connection that can provide a cursor dbapi_conn = conn.connection with dbapi_conn.cursor() as cur: @@ -5080,6 +5091,18 @@ table name and optionally a subset of columns to read. pd.read_sql_table('data', engine) +.. note:: + + Note that pandas infers column dtypes from query outputs, and not by looking + up data types in the physical database schema. For example, assume ``userid`` + is an integer column in a table. Then, intuitively, ``select userid ...`` will + return integer-valued series, while ``select cast(userid as text) ...`` will + return object-valued (str) series. Accordingly, if the query output is empty, + then all resulting columns will be returned as object-valued (since they are + most general). If you foresee that your query will sometimes generate an empty + result, you may want to explicitly typecast afterwards to ensure dtype + integrity. + You can also specify the name of the column as the ``DataFrame`` index, and specify a subset of columns to be read. 
diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index f32a8adfd4d335..1f1dff417e68f3 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -163,7 +163,7 @@ determines how many rows are shown in the truncated repr. .. ipython:: python pd.set_option('max_rows', 8) - pd.set_option('max_rows', 4) + pd.set_option('min_rows', 4) # below max_rows -> all rows shown df = pd.DataFrame(np.random.randn(7, 2)) df diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index f118fe84d523a6..dd6d3062a8f0ae 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -469,7 +469,7 @@ If ``crosstab`` receives only two Series, it will provide a frequency table. 'C': [1, 1, np.nan, 1, 1]}) df - pd.crosstab(df.A, df.B) + pd.crosstab(df['A'], df['B']) Any input passed containing ``Categorical`` data will have **all** of its categories included in the cross-tabulation, even if the actual data does @@ -489,13 +489,13 @@ using the ``normalize`` argument: .. ipython:: python - pd.crosstab(df.A, df.B, normalize=True) + pd.crosstab(df['A'], df['B'], normalize=True) ``normalize`` can also normalize values within each row or within each column: .. ipython:: python - pd.crosstab(df.A, df.B, normalize='columns') + pd.crosstab(df['A'], df['B'], normalize='columns') ``crosstab`` can also be passed a third ``Series`` and an aggregation function (``aggfunc``) that will be applied to the values of the third ``Series`` within @@ -503,7 +503,7 @@ each group defined by the first two ``Series``: .. ipython:: python - pd.crosstab(df.A, df.B, values=df.C, aggfunc=np.sum) + pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum) Adding margins ~~~~~~~~~~~~~~ @@ -512,7 +512,7 @@ Finally, one can also add margins or normalize this output. .. ipython:: python - pd.crosstab(df.A, df.B, values=df.C, aggfunc=np.sum, normalize=True, + pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum, normalize=True, margins=True) .. _reshaping.tile: diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index fdceaa5868cecd..fa16b2f2166105 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1148,10 +1148,10 @@ To plot data on a secondary y-axis, use the ``secondary_y`` keyword: .. ipython:: python - df.A.plot() + df['A'].plot() @savefig series_plot_secondary_y.png - df.B.plot(secondary_y=True, style='g') + df['B'].plot(secondary_y=True, style='g') .. ipython:: python :suppress: @@ -1205,7 +1205,7 @@ Here is the default behavior, notice how the x-axis tick labeling is performed: plt.figure() @savefig ser_plot_suppress.png - df.A.plot() + df['A'].plot() .. ipython:: python :suppress: @@ -1219,7 +1219,7 @@ Using the ``x_compat`` parameter, you can suppress this behavior: plt.figure() @savefig ser_plot_suppress_parm.png - df.A.plot(x_compat=True) + df['A'].plot(x_compat=True) .. ipython:: python :suppress: @@ -1235,9 +1235,9 @@ in ``pandas.plotting.plot_params`` can be used in a `with statement`: @savefig ser_plot_suppress_context.png with pd.plotting.plot_params.use('x_compat', True): - df.A.plot(color='r') - df.B.plot(color='g') - df.C.plot(color='b') + df['A'].plot(color='r') + df['B'].plot(color='g') + df['C'].plot(color='b') .. 
ipython:: python :suppress: diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index aeab2cf5809e79..fe80cc8bb959a5 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 0.25 .. toctree:: :maxdepth: 2 + v0.25.2 v0.25.1 v0.25.0 diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 59ea6b97762327..2e0442364b2f32 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ b/doc/source/whatsnew/v0.10.0.rst @@ -498,7 +498,7 @@ Here is a taste of what to expect. .. code-block:: ipython - In [58]: p4d = Panel4D(randn(2, 2, 5, 4), + In [58]: p4d = Panel4D(np.random.randn(2, 2, 5, 4), ....: labels=['Label1','Label2'], ....: items=['Item1', 'Item2'], ....: major_axis=date_range('1/1/2000', periods=5), diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index ef6108ae3ec909..62604dd3edd2dd 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -495,7 +495,7 @@ Other enhancements - :func:`pandas.util.hash_pandas_object` has gained the ability to hash a ``MultiIndex`` (:issue:`15224`) - ``Series/DataFrame.squeeze()`` have gained the ``axis`` parameter. (:issue:`15339`) - ``DataFrame.to_excel()`` has a new ``freeze_panes`` parameter to turn on Freeze Panes when exporting to Excel (:issue:`15160`) -- ``pd.read_html()`` will parse multiple header rows, creating a MutliIndex header. (:issue:`13434`). +- ``pd.read_html()`` will parse multiple header rows, creating a MultiIndex header. (:issue:`13434`). - HTML table output skips ``colspan`` or ``rowspan`` attribute if equal to 1. (:issue:`15403`) - :class:`pandas.io.formats.style.Styler` template now has blocks for easier extension, see the :ref:`example notebook ` (:issue:`15649`) - :meth:`Styler.render() ` now accepts ``**kwargs`` to allow user-defined variables in the template (:issue:`15649`) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst new file mode 100644 index 00000000000000..69f324211e5b28 --- /dev/null +++ b/doc/source/whatsnew/v0.25.2.rst @@ -0,0 +1,111 @@ +.. _whatsnew_0252: + +What's new in 0.25.2 (October XX, 2019) +--------------------------------------- + +These are the changes in pandas 0.25.2. See :ref:`release` for a full changelog +including other versions of pandas. + +.. _whatsnew_0252.bug_fixes: + +Bug fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ + +- + +Datetimelike +^^^^^^^^^^^^ + +- +- +- + +Timezones +^^^^^^^^^ + +- + +Numeric +^^^^^^^ + +- +- +- +- + +Conversion +^^^^^^^^^^ + +- + +Interval +^^^^^^^^ + +- + +Indexing +^^^^^^^^ + +- +- +- +- + +Missing +^^^^^^^ + +- + +I/O +^^^ + +- Fix regression in notebook display where tags not used for :attr:`DataFrame.index` (:issue:`28204`). +- Regression in :meth:`~DataFrame.to_csv` where writing a :class:`Series` or :class:`DataFrame` indexed by an :class:`IntervalIndex` would incorrectly raise a ``TypeError`` (:issue:`28210`) +- +- + +Plotting +^^^^^^^^ + +- +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- Bug incorrectly raising an ``IndexError`` when passing a list of quantiles to :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` (:issue:`28113`). 
+- Bug in :meth:`pandas.core.groupby.GroupBy.shift`, :meth:`pandas.core.groupby.GroupBy.bfill` and :meth:`pandas.core.groupby.GroupBy.ffill` where timezone information would be dropped (:issue:`19995`, :issue:`27992`) +- +- +- + +Reshaping +^^^^^^^^^ + +- +- +- +- +- + +Sparse +^^^^^^ + +- + +Other +^^^^^ + +- Compatibility with Python 3.8 in :meth:`DataFrame.query` (:issue:`27261`) +- + +.. _whatsnew_0.252.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v0.25.1..HEAD diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 4a1e874f0c8d7d..557c202d889b36 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -20,8 +20,7 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ - -- +- :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) - .. _whatsnew_1000.enhancements.other: @@ -38,7 +37,25 @@ Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`). -- +- :class:`pandas.core.arrays.IntervalArray` adopts a new ``__repr__`` in accordance with other array classes (:issue:`25022`) + +*pandas 0.25.x* + +.. code-block:: ipython + + In [1]: pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) + Out[2]: + IntervalArray([(0, 1], (2, 3]], + closed='right', + dtype='interval[int64]') + + +*pandas 1.0.0* + +.. ipython:: python + + pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) + .. _whatsnew_1000.api.other: @@ -76,7 +93,8 @@ Performance improvements - Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`) - Performance improvement in `MultiIndex.is_monotonic` (:issue:`27495`) - Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`) - +- Performance improvement in :meth:`DataFrame.corr` when ``method`` is ``"spearman"`` (:issue:`28139`) +- Performance improvement in :meth:`DataFrame.replace` when provided a list of values to replace (:issue:`28099`) .. _whatsnew_1000.bug_fixes: @@ -96,6 +114,10 @@ Datetimelike ^^^^^^^^^^^^ - Bug in :meth:`Series.__setitem__` incorrectly casting ``np.timedelta64("NaT")`` to ``np.datetime64("NaT")`` when inserting into a :class:`Series` with datetime64 dtype (:issue:`27311`) - Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`) +- Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`) +- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`) +- Bug in :class:`Timestamp` subtraction when subtracting a :class:`Timestamp` from a ``np.datetime64`` object incorrectly raising ``TypeError`` (:issue:`28286`) +- Addition and subtraction of integer or integer-dtype arrays with :class:`Timestamp` will now raise ``NullFrequencyError`` instead of ``ValueError`` (:issue:`28268`) - @@ -140,7 +162,7 @@ Interval Indexing ^^^^^^^^ -- +- Bug in assignment using a reverse slicer (:issue:`26939`) - Missing @@ -159,7 +181,8 @@ I/O ^^^ - :meth:`read_csv` now accepts binary mode file buffers when using the Python csv engine (:issue:`23779`) -- +- Bug in :meth:`DataFrame.to_json` where using a Tuple as a column or index value and using ``orient="columns"`` or ``orient="index"`` would produce invalid JSON (:issue:`20500`) +- Improve infinity parsing. 
:meth:`read_csv` now interprets ``Infinity``, ``+Infinity``, ``-Infinity`` as floating point values (:issue:`10065`) Plotting ^^^^^^^^ @@ -169,13 +192,16 @@ Plotting - Bug in :meth:`DataFrame.plot` producing incorrect legend markers when plotting multiple series on the same axis (:issue:`18222`) - Bug in :meth:`DataFrame.plot` when ``kind='box'`` and data contains datetime or timedelta data. These types are now automatically dropped (:issue:`22799`) - Bug in :meth:`DataFrame.plot.line` and :meth:`DataFrame.plot.area` produce wrong xlim in x-axis (:issue:`27686`, :issue:`25160`, :issue:`24784`) +- :func:`set_option` now validates that the plot backend provided to ``'plotting.backend'`` implements the backend when the option is set, rather than when a plot is created (:issue:`28163`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - - ``IndexError`` would not raise if all index values in some index level is missing data (:issue:`20519`) +- Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue: `28192`) - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) +- Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) Reshaping ^^^^^^^^^ @@ -185,7 +211,7 @@ Reshaping Sparse ^^^^^^ - +- Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`) - - @@ -205,6 +231,8 @@ ExtensionArray Other ^^^^^ - Trying to set the ``display.precision``, ``display.max_rows`` or ``display.max_columns`` using :meth:`set_option` to anything but a ``None`` or a positive int will raise a ``ValueError`` (:issue:`23348`) +- Using :meth:`DataFrame.replace` with overlapping keys in a nested dictionary will no longer raise, now matching the behavior of a flat dictionary (:issue:`27660`) +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support dicts as ``compression`` argument with key ``'method'`` being the compression method and others as additional compression options when the compression method is ``'zip'``. (:issue:`26023`) .. _whatsnew_1000.contributors: diff --git a/doc/sphinxext/contributors.py b/doc/sphinxext/contributors.py index 4256e4659715d2..1a064f71792e96 100644 --- a/doc/sphinxext/contributors.py +++ b/doc/sphinxext/contributors.py @@ -8,12 +8,11 @@ code contributors and commits, and then list each contributor individually. 
""" +from announce import build_components from docutils import nodes from docutils.parsers.rst import Directive import git -from announce import build_components - class ContributorsDirective(Directive): required_arguments = 1 diff --git a/environment.yml b/environment.yml index 6d2cd701c38540..d72972ffc4da48 100644 --- a/environment.yml +++ b/environment.yml @@ -80,4 +80,5 @@ dependencies: - xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile + - odfpy # pandas.read_excel - pyreadstat # pandas.read_spss diff --git a/pandas/_config/display.py b/pandas/_config/display.py index 6e5fabe2706e5e..067b7c503baabf 100644 --- a/pandas/_config/display.py +++ b/pandas/_config/display.py @@ -28,7 +28,10 @@ def detect_console_encoding(): if not encoding or "ascii" in encoding.lower(): try: encoding = locale.getpreferredencoding() - except Exception: + except locale.Error: + # can be raised by locale.setlocale(), which is + # called by getpreferredencoding + # (on some systems, see stdlib locale docs) pass # when all else fails. this will usually be "ascii" diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index 46802c64609594..9f750d8447c6ab 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -98,13 +98,7 @@ def _valid_locales(locales, normalize): def _default_locale_getter(): - try: - raw_locales = subprocess.check_output(["locale -a"], shell=True) - except subprocess.CalledProcessError as e: - raise type(e)( - "{exception}, the 'locale -a' command cannot be found " - "on your system".format(exception=e) - ) + raw_locales = subprocess.check_output(["locale -a"], shell=True) return raw_locales @@ -139,7 +133,9 @@ def get_locales(prefix=None, normalize=True, locale_getter=_default_locale_gette """ try: raw_locales = locale_getter() - except Exception: + except subprocess.CalledProcessError: + # Raised on (some? all?) 
Windows platforms because Note: "locale -a" + # is not defined return None try: diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 038447ad252fe2..0f91f612994c7b 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -296,6 +296,7 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): cdef: Py_ssize_t i, j, xi, yi, N, K ndarray[float64_t, ndim=2] result + ndarray[float64_t, ndim=2] ranked_mat ndarray[float64_t, ndim=1] maskedx ndarray[float64_t, ndim=1] maskedy ndarray[uint8_t, ndim=2] mask @@ -307,10 +308,18 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): result = np.empty((K, K), dtype=np.float64) mask = np.isfinite(mat).view(np.uint8) + ranked_mat = np.empty((N, K), dtype=np.float64) + + for i in range(K): + ranked_mat[:, i] = rank_1d_float64(mat[:, i]) + for xi in range(K): for yi in range(xi + 1): nobs = 0 + # Keep track of whether we need to recompute ranks + all_ranks = True for i in range(N): + all_ranks &= not (mask[i, xi] ^ mask[i, yi]) if mask[i, xi] and mask[i, yi]: nobs += 1 @@ -320,13 +329,16 @@ def nancorr_spearman(const float64_t[:, :] mat, Py_ssize_t minp=1): maskedx = np.empty(nobs, dtype=np.float64) maskedy = np.empty(nobs, dtype=np.float64) j = 0 + for i in range(N): if mask[i, xi] and mask[i, yi]: - maskedx[j] = mat[i, xi] - maskedy[j] = mat[i, yi] + maskedx[j] = ranked_mat[i, xi] + maskedy[j] = ranked_mat[i, yi] j += 1 - maskedx = rank_1d_float64(maskedx) - maskedy = rank_1d_float64(maskedy) + + if not all_ranks: + maskedx = rank_1d_float64(maskedx) + maskedy = rank_1d_float64(maskedy) mean = (nobs + 1) / 2. diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 7424c4ddc3d924..979dad6db0838f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1,4 +1,5 @@ from datetime import datetime, timedelta, date +import warnings import cython diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 3c9a096e7ecc0c..4db048eeb03831 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -60,7 +60,16 @@ cdef class {{name}}Engine(IndexEngine): # A view is needed for some subclasses, such as PeriodEngine: values = self._get_index_values().view('{{dtype}}') - indexer = values == val + try: + with warnings.catch_warnings(): + # e.g. if values is float64 and `val` is a str, suppress warning + warnings.filterwarnings("ignore", category=FutureWarning) + indexer = values == val + except TypeError: + # if the equality above returns a bool, cython will raise TypeError + # when trying to cast it to ndarray + raise KeyError(val) + found = np.where(indexer)[0] count = len(found) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 47d1e98f214a11..4ef17b116a1d94 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -235,7 +235,7 @@ def fast_unique_multiple(list arrays, sort: bool=True): if sort is None: try: uniques.sort() - except Exception: + except TypeError: # TODO: RuntimeWarning? 
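The narrowed ``except TypeError`` clauses in the ``lib.pyx`` hunks here rely on the fact that, in Python 3, sorting unorderable mixed types fails with ``TypeError`` specifically; a small sketch of the one failure mode the ``uniques.sort()`` calls need to swallow:

.. code-block:: python

   # Mixed, unorderable values are what the sort() guards are for; in
   # Python 3 the comparison raises TypeError, not some broader exception.
   try:
       sorted([1, "a", None])
   except TypeError as err:
       print(err)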
pass @@ -264,7 +264,7 @@ def fast_unique_multiple_list(lists: list, sort: bool=True) -> list: if sort: try: uniques.sort() - except Exception: + except TypeError: pass return uniques @@ -304,7 +304,7 @@ def fast_unique_multiple_list_gen(object gen, bint sort=True): if sort: try: uniques.sort() - except Exception: + except TypeError: pass return uniques @@ -1410,7 +1410,7 @@ def infer_datetimelike_array(arr: object) -> object: try: array_to_datetime(objs, errors='raise') return 'datetime' - except: + except (ValueError, TypeError): pass # we are *not* going to infer from strings diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6cc9dd22ce7c92..62a3568932def4 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1693,6 +1693,10 @@ cdef: char* cposinf = b'+inf' char* cneginf = b'-inf' + char* cinfty = b'Infinity' + char* cposinfty = b'+Infinity' + char* cneginfty = b'-Infinity' + cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, @@ -1772,9 +1776,12 @@ cdef inline int _try_double_nogil(parser_t *parser, if error != 0 or p_end == word or p_end[0]: error = 0 if (strcasecmp(word, cinf) == 0 or - strcasecmp(word, cposinf) == 0): + strcasecmp(word, cposinf) == 0 or + strcasecmp(word, cinfty) == 0 or + strcasecmp(word, cposinfty) == 0): data[0] = INF - elif strcasecmp(word, cneginf) == 0: + elif (strcasecmp(word, cneginf) == 0 or + strcasecmp(word, cneginfty) == 0 ): data[0] = NEGINF else: return 1 @@ -1793,9 +1800,12 @@ cdef inline int _try_double_nogil(parser_t *parser, if error != 0 or p_end == word or p_end[0]: error = 0 if (strcasecmp(word, cinf) == 0 or - strcasecmp(word, cposinf) == 0): + strcasecmp(word, cposinf) == 0 or + strcasecmp(word, cinfty) == 0 or + strcasecmp(word, cposinfty) == 0): data[0] = INF - elif strcasecmp(word, cneginf) == 0: + elif (strcasecmp(word, cneginf) == 0 or + strcasecmp(word, cneginfty) == 0): data[0] = NEGINF else: return 1 diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index f95685c3379696..bf940eb03e06f4 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -296,8 +296,6 @@ cdef class SeriesBinGrouper: islider.advance(group_size) vslider.advance(group_size) - except: - raise finally: # so we don't free the wrong memory islider.reset() @@ -425,8 +423,6 @@ cdef class SeriesGrouper: group_size = 0 - except: - raise finally: # so we don't free the wrong memory islider.reset() @@ -532,7 +528,8 @@ def apply_frame_axis0(object frame, object f, object names, try: piece = f(chunk) - except: + except Exception: + # We can't be more specific without knowing something about `f` raise InvalidApply('Let this error raise above us') # Need to infer if low level index slider will cause segfaults @@ -543,6 +540,7 @@ def apply_frame_axis0(object frame, object f, object names, else: mutated = True except AttributeError: + # `piece` might not have an index, could be e.g. 
an int pass results.append(piece) diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 1db1878a8a773f..0a767dd27b6580 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -25,11 +25,6 @@ int to_double(char *item, double *p_value, char sci, char decimal, return (error == 0) && (!*p_end); } -#if PY_VERSION_HEX < 0x02060000 -#define PyBytes_Check PyString_Check -#define PyBytes_AS_STRING PyString_AS_STRING -#endif // PY_VERSION_HEX - int floatify(PyObject *str, double *result, int *maybe_int) { int status; char *data; @@ -50,7 +45,7 @@ int floatify(PyObject *str, double *result, int *maybe_int) { status = to_double(data, result, sci, dec, maybe_int); if (!status) { - /* handle inf/-inf */ + /* handle inf/-inf infinity/-infinity */ if (strlen(data) == 3) { if (0 == strcasecmp(data, "inf")) { *result = HUGE_VAL; @@ -68,6 +63,23 @@ int floatify(PyObject *str, double *result, int *maybe_int) { } else { goto parsingerror; } + } else if (strlen(data) == 8) { + if (0 == strcasecmp(data, "infinity")) { + *result = HUGE_VAL; + *maybe_int = 0; + } else { + goto parsingerror; + } + } else if (strlen(data) == 9) { + if (0 == strcasecmp(data, "-infinity")) { + *result = -HUGE_VAL; + *maybe_int = 0; + } else if (0 == strcasecmp(data, "+infinity")) { + *result = HUGE_VAL; + *maybe_int = 0; + } else { + goto parsingerror; + } } else { goto parsingerror; } diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 0470fef450dde3..ee6e7081bf00e2 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -307,11 +307,4 @@ EXPORTFUNCTION JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuffer); EXPORTFUNCTION void encode(JSOBJ, JSONObjectEncoder *, const char *, size_t); -#define Buffer_Reserve(__enc, __len) \ - if ((size_t)((__enc)->end - (__enc)->offset) < (size_t)(__len)) { \ - Buffer_Realloc((__enc), (__len)); \ - } - -void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded); - #endif // PANDAS__LIBS_SRC_UJSON_LIB_ULTRAJSON_H_ diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 2d6c823a45515e..d5b379bee585b4 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -714,6 +714,12 @@ int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, } } +#define Buffer_Reserve(__enc, __len) \ + if ( (size_t) ((__enc)->end - (__enc)->offset) < (size_t) (__len)) \ + { \ + Buffer_Realloc((__enc), (__len));\ + } \ + #define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr; FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 926440218b5d93..dc9b906c8d76c4 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -16,18 +16,19 @@ derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +DISCLAIMED. IN NO EVENT SHALL ESN SOCIAL SOFTWARE AB OR JONAS TARNSTROM BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Portions of code from MODP_ASCII - Ascii transformations (upper/lower, etc) https://github.com/client9/stringencoders -Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights reserved. +Copyright (c) 2007 Nick Galbreath -- nickg [at] modp [dot] com. All rights +reserved. Numeric decoder derived from from TCL library http://www.opensource.apple.com/source/tcl/tcl-14/tcl/license.terms @@ -48,13 +49,13 @@ Numeric decoder derived from from TCL library #include <../../../tslibs/src/datetime/np_datetime_strings.h> #include "datetime.h" -#define NPY_JSON_BUFSIZE 32768 - static PyTypeObject *type_decimal; static PyTypeObject *cls_dataframe; static PyTypeObject *cls_series; static PyTypeObject *cls_index; static PyTypeObject *cls_nat; +PyObject *cls_timestamp; +PyObject *cls_timedelta; npy_int64 get_nat(void) { return NPY_MIN_INT64; } @@ -64,9 +65,9 @@ typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti, typedef struct __NpyArrContext { PyObject *array; char *dataptr; - int curdim; // current dimension in array's order - int stridedim; // dimension we are striding over - int inc; // stride dimension increment (+/- 1) + int curdim; // current dimension in array's order + int stridedim; // dimension we are striding over + int inc; // stride dimension increment (+/- 1) npy_intp dim; npy_intp stride; npy_intp ndim; @@ -83,8 +84,8 @@ typedef struct __PdBlockContext { int ncols; int transpose; - int *cindices; // frame column -> block column map - NpyArrContext **npyCtxts; // NpyArrContext for each column + int *cindices; // frame column -> block column map + NpyArrContext **npyCtxts; // NpyArrContext for each column } PdBlockContext; typedef struct __TypeContext { @@ -148,13 +149,12 @@ enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; int PdBlock_iterNext(JSOBJ, JSONTypeContext *); -void *initObjToJSON(void) -{ +void *initObjToJSON(void) { PyObject *mod_pandas; PyObject *mod_nattype; PyObject *mod_decimal = PyImport_ImportModule("decimal"); type_decimal = - (PyTypeObject *)PyObject_GetAttrString(mod_decimal, "Decimal"); + (PyTypeObject *)PyObject_GetAttrString(mod_decimal, "Decimal"); Py_DECREF(mod_decimal); PyDateTime_IMPORT; @@ -166,13 +166,15 @@ void *initObjToJSON(void) cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); + cls_timestamp = PyObject_GetAttrString(mod_pandas, "Timestamp"); + cls_timedelta = 
PyObject_GetAttrString(mod_pandas, "Timedelta"); Py_DECREF(mod_pandas); } mod_nattype = PyImport_ImportModule("pandas._libs.tslibs.nattype"); if (mod_nattype) { - cls_nat = (PyTypeObject *)PyObject_GetAttrString(mod_nattype, - "NaTType"); + cls_nat = + (PyTypeObject *)PyObject_GetAttrString(mod_nattype, "NaTType"); Py_DECREF(mod_nattype); } @@ -210,7 +212,6 @@ static TypeContext *createTypeContext(void) { return pc; } - static int is_sparse_array(PyObject *obj) { // TODO can be removed again once SparseArray.values is removed (GH26421) if (PyObject_HasAttrString(obj, "_subtyp")) { @@ -225,7 +226,6 @@ static int is_sparse_array(PyObject *obj) { return 0; } - static PyObject *get_values(PyObject *obj) { PyObject *values = NULL; @@ -240,7 +240,8 @@ static PyObject *get_values(PyObject *obj) { values = PyObject_CallMethod(values, "to_numpy", NULL); } - if (!is_sparse_array(values) && PyObject_HasAttrString(values, "values")) { + if (!is_sparse_array(values) && + PyObject_HasAttrString(values, "values")) { PyObject *subvals = get_values(values); PyErr_Clear(); PRINTMARK(); @@ -355,20 +356,20 @@ static Py_ssize_t get_attr_length(PyObject *obj, char *attr) { } static npy_int64 get_long_attr(PyObject *o, const char *attr) { - npy_int64 long_val; - PyObject *value = PyObject_GetAttrString(o, attr); - long_val = (PyLong_Check(value) ? - PyLong_AsLongLong(value) : PyLong_AsLong(value)); - Py_DECREF(value); - return long_val; + npy_int64 long_val; + PyObject *value = PyObject_GetAttrString(o, attr); + long_val = + (PyLong_Check(value) ? PyLong_AsLongLong(value) : PyLong_AsLong(value)); + Py_DECREF(value); + return long_val; } static npy_float64 total_seconds(PyObject *td) { - npy_float64 double_val; - PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); - double_val = PyFloat_AS_DOUBLE(value); - Py_DECREF(value); - return double_val; + npy_float64 double_val; + PyObject *value = PyObject_CallMethod(td, "total_seconds", NULL); + double_val = PyFloat_AS_DOUBLE(value); + Py_DECREF(value); + return double_val; } static PyObject *get_item(PyObject *obj, Py_ssize_t i) { @@ -434,7 +435,7 @@ static void *PyFloatToDOUBLE(JSOBJ _obj, JSONTypeContext *tc, void *outValue, return NULL; } -static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, +static void *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen) { PyObject *obj = (PyObject *)_obj; *_outLen = PyBytes_GET_SIZE(obj); @@ -448,7 +449,7 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, if (PyUnicode_IS_COMPACT_ASCII(obj)) { Py_ssize_t len; - char *data = (char*)PyUnicode_AsUTF8AndSize(obj, &len); + char *data = (char *)PyUnicode_AsUTF8AndSize(obj, &len); *_outLen = len; return data; } @@ -503,7 +504,7 @@ static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, // TODO(anyone): Does not appear to be reached in tests. 
pandas_datetime_to_datetimestruct(obj->obval, - (NPY_DATETIMEUNIT)obj->obmeta.base, &dts); + (NPY_DATETIMEUNIT)obj->obmeta.base, &dts); return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); } @@ -662,9 +663,9 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->npyarr = npyarr; if (!npyarr) { - PyErr_NoMemory(); - GET_TC(tc)->iterNext = NpyArr_iterNextNone; - return; + PyErr_NoMemory(); + GET_TC(tc)->iterNext = NpyArr_iterNextNone; + return; } npyarr->array = (PyObject *)obj; @@ -675,17 +676,17 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npyarr->type_num = PyArray_DESCR(obj)->type_num; if (GET_TC(tc)->transpose) { - npyarr->dim = PyArray_DIM(obj, npyarr->ndim); - npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); - npyarr->stridedim = npyarr->ndim; - npyarr->index[npyarr->ndim] = 0; - npyarr->inc = -1; + npyarr->dim = PyArray_DIM(obj, npyarr->ndim); + npyarr->stride = PyArray_STRIDE(obj, npyarr->ndim); + npyarr->stridedim = npyarr->ndim; + npyarr->index[npyarr->ndim] = 0; + npyarr->inc = -1; } else { - npyarr->dim = PyArray_DIM(obj, 0); - npyarr->stride = PyArray_STRIDE(obj, 0); - npyarr->stridedim = 0; - npyarr->index[0] = 0; - npyarr->inc = 1; + npyarr->dim = PyArray_DIM(obj, 0); + npyarr->stride = PyArray_STRIDE(obj, 0); + npyarr->stridedim = 0; + npyarr->index[0] = 0; + npyarr->inc = 1; } npyarr->columnLabels = GET_TC(tc)->columnLabels; @@ -733,8 +734,7 @@ int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { NpyArr_freeItemValue(obj, tc); - if (PyArray_ISDATETIME(npyarr->array)) - { + if (PyArray_ISDATETIME(npyarr->array)) { PRINTMARK(); GET_TC(tc)->itemValue = obj; Py_INCREF(obj); @@ -787,30 +787,23 @@ JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -static void NpyArr_getLabel(JSOBJ obj, JSONTypeContext *tc, size_t *outLen, - npy_intp idx, char **labels) { - JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder; - PRINTMARK(); - *outLen = strlen(labels[idx]); - Buffer_Reserve(enc, *outLen); - memcpy(enc->offset, labels[idx], sizeof(char) * (*outLen)); - enc->offset += *outLen; - *outLen = 0; -} - char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; npy_intp idx; PRINTMARK(); + char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = npyarr->index[npyarr->stridedim - npyarr->inc] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + + return cStr; } //============================================================================= @@ -852,19 +845,22 @@ char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; npy_intp idx; + char *cStr; PRINTMARK(); if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { idx = blkCtxt->colIdx - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = GET_TC(tc)->iterNext != PdBlock_iterNext ? 
npyarr->index[npyarr->stridedim - npyarr->inc] - 1 : npyarr->index[npyarr->stridedim]; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + return cStr; } char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, @@ -872,16 +868,19 @@ char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; npy_intp idx; + char *cStr; PRINTMARK(); if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->columnLabels); + cStr = npyarr->columnLabels[idx]; } else { idx = blkCtxt->colIdx; - NpyArr_getLabel(obj, tc, outLen, idx, npyarr->rowLabels); + cStr = npyarr->rowLabels[idx]; } - return NULL; + + *outLen = strlen(cStr); + return cStr; } int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { @@ -942,9 +941,9 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { dtype = PyArray_DescrFromType(NPY_INT64); obj = (PyObject *)_obj; - GET_TC(tc) - ->iterGetName = GET_TC(tc)->transpose ? PdBlock_iterGetName_Transpose - : PdBlock_iterGetName; + GET_TC(tc)->iterGetName = GET_TC(tc)->transpose + ? PdBlock_iterGetName_Transpose + : PdBlock_iterGetName; blkCtxt = PyObject_Malloc(sizeof(PdBlockContext)); if (!blkCtxt) { @@ -1395,7 +1394,7 @@ void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series + enc->outputFormat = VALUES; // for contained series if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } @@ -1454,7 +1453,7 @@ void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); - enc->outputFormat = VALUES; // for contained series & index + enc->outputFormat = VALUES; // for contained series & index if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } @@ -1578,16 +1577,30 @@ void NpyArr_freeLabels(char **labels, npy_intp len) { } } -char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, +/* + * Function: NpyArr_encodeLabels + * ----------------------------- + * + * Builds an array of "encoded" labels. + * + * labels: PyArrayObject pointer for labels to be "encoded" + * num : number of labels + * + * "encode" is quoted above because we aren't really doing encoding + * For historical reasons this function would actually encode the entire + * array into a separate buffer with a separate call to JSON_Encode + * and would leave it to complex pointer manipulation from there to + * unpack values as needed. To make things simpler and more idiomatic + * this has instead just stringified any input save for datetime values, + * which may need to be represented in various formats. + */ +char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, npy_intp num) { // NOTE this function steals a reference to labels. 
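As a rough, user-level sketch of what the label-encoding rewrite described in the comment above enables (the frame below is made up for illustration): tuple column labels, which previously produced invalid JSON with ``orient="columns"``, are now stringified like any other non-datetime label.

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame([[1, 2]], columns=[("a", "b"), ("c", "d")])

   # GH 20500: tuple labels are stringified rather than emitted as broken JSON
   df.to_json(orient="columns")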
- PyObjectEncoder *pyenc = (PyObjectEncoder *)enc; PyObject *item = NULL; - npy_intp i, stride, len, need_quotes; + npy_intp i, stride, len; char **ret; - char *dataptr, *cLabel, *origend, *origst, *origoffset; - char labelBuffer[NPY_JSON_BUFSIZE]; - PyArray_GetItemFunc *getitem; + char *dataptr, *cLabel; int type_num; PRINTMARK(); @@ -1614,68 +1627,137 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, JSONObjectEncoder *enc, ret[i] = NULL; } - origst = enc->start; - origend = enc->end; - origoffset = enc->offset; - stride = PyArray_STRIDE(labels, 0); dataptr = PyArray_DATA(labels); - getitem = (PyArray_GetItemFunc *)PyArray_DESCR(labels)->f->getitem; type_num = PyArray_TYPE(labels); for (i = 0; i < num; i++) { - if (PyTypeNum_ISDATETIME(type_num) || PyTypeNum_ISNUMBER(type_num)) - { - item = (PyObject *)labels; - pyenc->npyType = type_num; - pyenc->npyValue = dataptr; - } else { - item = getitem(dataptr, labels); - if (!item) { + item = PyArray_GETITEM(labels, dataptr); + if (!item) { + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + // TODO: for any matches on type_num (date and timedeltas) should use a + // vectorized solution to convert to epoch or iso formats + if (enc->datetimeIso && + (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { + PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); + if (td == NULL) { + Py_DECREF(item); NpyArr_freeLabels(ret, num); ret = 0; break; } - } - cLabel = JSON_EncodeObject(item, enc, labelBuffer, NPY_JSON_BUFSIZE); + PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL); + Py_DECREF(td); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } - if (item != (PyObject *)labels) { - Py_DECREF(item); + cLabel = (char *)PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } else if (PyTypeNum_ISDATETIME(type_num) || PyDateTime_Check(item) || + PyDate_Check(item)) { + PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item); + if (ts == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + if (enc->datetimeIso) { + PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); + Py_DECREF(ts); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(iso); + Py_DECREF(iso); + len = strlen(cLabel); + } else { + npy_int64 value; + // TODO: refactor to not duplicate what goes on in + // beginTypeContext + if (PyObject_HasAttrString(ts, "value")) { + PRINTMARK(); + value = get_long_attr(ts, "value"); + } else { + PRINTMARK(); + value = total_seconds(ts) * + 1000000000LL; // nanoseconds per second + } + Py_DECREF(ts); + + switch (enc->datetimeUnit) { + case NPY_FR_ns: + break; + case NPY_FR_us: + value /= 1000LL; + break; + case NPY_FR_ms: + value /= 1000000LL; + break; + case NPY_FR_s: + value /= 1000000000LL; + break; + default: + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + char buf[21] = {0}; // 21 chars for 2**63 as string + cLabel = buf; + sprintf(buf, "%" NPY_INT64_FMT, value); + len = strlen(cLabel); + } + } else { // Fallack to string representation + PyObject *str = PyObject_Str(item); + if (str == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + cLabel = (char *)PyUnicode_AsUTF8(str); + Py_DECREF(str); + len = strlen(cLabel); } - if (PyErr_Occurred() || enc->errorMsg) { + Py_DECREF(item); + // Add 1 to include NULL terminator + ret[i] = PyObject_Malloc(len + 1); + memcpy(ret[i], cLabel, 
len + 1); + + if (PyErr_Occurred()) { NpyArr_freeLabels(ret, num); ret = 0; break; } - need_quotes = ((*cLabel) != '"'); - len = enc->offset - cLabel + 1 + 2 * need_quotes; - ret[i] = PyObject_Malloc(sizeof(char) * len); - if (!ret[i]) { PyErr_NoMemory(); ret = 0; break; } - if (need_quotes) { - ret[i][0] = '"'; - memcpy(ret[i] + 1, cLabel, sizeof(char) * (len - 4)); - ret[i][len - 3] = '"'; - } else { - memcpy(ret[i], cLabel, sizeof(char) * (len - 2)); - } - ret[i][len - 2] = ':'; - ret[i][len - 1] = '\0'; dataptr += stride; } - enc->start = origst; - enc->end = origend; - enc->offset = origoffset; - Py_DECREF(labels); return ret; } @@ -1787,7 +1869,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } else if (PyBytes_Check(obj)) { PRINTMARK(); - pc->PyTypeToJSON = PyStringToUTF8; + pc->PyTypeToJSON = PyBytesToUTF8; tc->type = JT_UTF8; return; } else if (PyUnicode_Check(obj)) { @@ -1840,23 +1922,22 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { value = get_long_attr(obj, "value"); } else { PRINTMARK(); - value = - total_seconds(obj) * 1000000000LL; // nanoseconds per second + value = total_seconds(obj) * 1000000000LL; // nanoseconds per second } base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; switch (base) { - case NPY_FR_ns: - break; - case NPY_FR_us: - value /= 1000LL; - break; - case NPY_FR_ms: - value /= 1000000LL; - break; - case NPY_FR_s: - value /= 1000000000LL; - break; + case NPY_FR_ns: + break; + case NPY_FR_us: + value /= 1000LL; + break; + case NPY_FR_ms: + value /= 1000000LL; + break; + case NPY_FR_s: + value /= 1000000000LL; + break; } exc = PyErr_Occurred(); @@ -1971,8 +2052,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); if (!pc->columnLabels) { goto INVALID; @@ -2074,8 +2154,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2096,9 +2175,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->rowLabelsLen = PyObject_Size(tmpObj); - pc->rowLabels = - NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, pc->rowLabelsLen); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, + pc->rowLabelsLen); Py_DECREF(tmpObj); tmpObj = (enc->outputFormat == INDEX ? 
PyObject_GetAttrString(obj, "columns") @@ -2116,8 +2194,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, - (JSONObjectEncoder *)enc, + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject *)values, enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2242,7 +2319,8 @@ void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) { PyObject_Free(GET_TC(tc)->cStr); GET_TC(tc)->cStr = NULL; - if (tc->prv != &(((PyObjectEncoder *)tc->encoder)->basicTypeContext)) { // NOLINT + if (tc->prv != + &(((PyObjectEncoder *)tc->encoder)->basicTypeContext)) { // NOLINT PyObject_Free(tc->prv); } tc->prv = NULL; @@ -2305,7 +2383,7 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { PyObject *newobj; PyObject *oinput = NULL; PyObject *oensureAscii = NULL; - int idoublePrecision = 10; // default double precision setting + int idoublePrecision = 10; // default double precision setting PyObject *oencodeHTMLChars = NULL; char *sOrient = NULL; char *sdateFormat = NULL; @@ -2328,10 +2406,10 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { PyObject_Malloc, PyObject_Realloc, PyObject_Free, - -1, // recursionMax + -1, // recursionMax idoublePrecision, - 1, // forceAscii - 0, // encodeHTMLChars + 1, // forceAscii + 0, // encodeHTMLChars }}; JSONObjectEncoder *encoder = (JSONObjectEncoder *)&pyEncoder; @@ -2429,7 +2507,6 @@ PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { PRINTMARK(); ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer)); PRINTMARK(); - if (PyErr_Occurred()) { PRINTMARK(); return NULL; diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 01e500a80dcc41..dc06a30004d19d 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -344,14 +344,13 @@ def array_with_unit_to_datetime(ndarray values, object unit, # try a quick conversion to i8 # if we have nulls that are not type-compat # then need to iterate - try: + if values.dtype.kind == "i": + # Note: this condition makes the casting="same_kind" redundant iresult = values.astype('i8', casting='same_kind', copy=False) mask = iresult == NPY_NAT iresult[mask] = 0 fvalues = iresult.astype('f8') * m need_to_iterate = False - except: - pass # check the bounds if not need_to_iterate: @@ -406,7 +405,7 @@ def array_with_unit_to_datetime(ndarray values, object unit, elif is_ignore: raise AssertionError iresult[i] = NPY_NAT - except: + except OverflowError: if is_raise: raise OutOfBoundsDatetime( "cannot convert input {val} with the unit " @@ -447,7 +446,7 @@ def array_with_unit_to_datetime(ndarray values, object unit, else: try: oresult[i] = Timestamp(cast_from_unit(val, unit)) - except: + except OverflowError: oresult[i] = val elif isinstance(val, str): @@ -574,7 +573,7 @@ cpdef array_to_datetime(ndarray[object] values, str errors='raise', # datetimes/strings, then we must coerce) try: iresult[i] = cast_from_unit(val, 'ns') - except: + except OverflowError: iresult[i] = NPY_NAT elif isinstance(val, str): diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 67a323782a836a..8d3b00e4a44b91 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -7,3 +7,6 @@ from .timedeltas import Timedelta, delta_to_nanoseconds, ints_to_pytimedelta from .timestamps import Timestamp from .tzconversion import tz_convert_single + +# import fails if we do 
this before np_datetime +from .c_timestamp import NullFrequencyError # isort:skip diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 906dabba09486c..dfa66d7e2d8626 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -42,6 +42,15 @@ from pandas._libs.tslibs.timezones import UTC from pandas._libs.tslibs.tzconversion cimport tz_convert_single +class NullFrequencyError(ValueError): + """ + Error raised when a null `freq` attribute is used in an operation + that needs a non-null frequency, particularly `DatetimeIndex.shift`, + `TimedeltaIndex.shift`, `PeriodIndex.shift`. + """ + pass + + def maybe_integer_op_deprecated(obj): # GH#22535 add/sub of integers and int-arrays is deprecated if obj.freq is not None: @@ -131,7 +140,8 @@ cdef class _Timestamp(datetime): try: stamp += zone.strftime(' %%Z') - except: + except AttributeError: + # e.g. tzlocal has no `strftime` pass tz = ", tz='{0}'".format(zone) if zone is not None else "" @@ -227,8 +237,8 @@ cdef class _Timestamp(datetime): # to be compat with Period return NaT elif self.freq is None: - raise ValueError("Cannot add integral value to Timestamp " - "without freq.") + raise NullFrequencyError( + "Cannot add integral value to Timestamp without freq.") return self.__class__((self.freq * other).apply(self), freq=self.freq) @@ -246,11 +256,17 @@ cdef class _Timestamp(datetime): result = self.__class__(self.value + nanos, tz=self.tzinfo, freq=self.freq) - if getattr(other, 'normalize', False): - # DateOffset - result = result.normalize() return result + elif is_array(other): + if other.dtype.kind in ['i', 'u']: + maybe_integer_op_deprecated(self) + if self.freq is None: + raise NullFrequencyError( + "Cannot add integer-dtype array " + "to Timestamp without freq.") + return self.freq * other + self + # index/series like elif hasattr(other, '_typ'): return NotImplemented @@ -262,24 +278,27 @@ cdef class _Timestamp(datetime): return result def __sub__(self, other): + if (is_timedelta64_object(other) or is_integer_object(other) or PyDelta_Check(other) or hasattr(other, 'delta')): # `delta` attribute is for offsets.Tick or offsets.Week obj neg_other = -other return self + neg_other - typ = getattr(other, '_typ', None) + elif is_array(other): + if other.dtype.kind in ['i', 'u']: + maybe_integer_op_deprecated(self) + if self.freq is None: + raise NullFrequencyError( + "Cannot subtract integer-dtype array " + "from Timestamp without freq.") + return self - self.freq * other - # a Timestamp-DatetimeIndex -> yields a negative TimedeltaIndex - if typ in ('datetimeindex', 'datetimearray'): - # timezone comparison is performed in DatetimeIndex._sub_datelike - return -other.__sub__(self) - - # a Timestamp-TimedeltaIndex -> yields a negative TimedeltaIndex - elif typ in ('timedeltaindex', 'timedeltaarray'): - return (-other).__add__(self) + typ = getattr(other, '_typ', None) + if typ is not None: + return NotImplemented - elif other is NaT: + if other is NaT: return NaT # coerce if necessary if we are a Timestamp-like @@ -302,10 +321,12 @@ cdef class _Timestamp(datetime): return Timedelta(self.value - other.value) except (OverflowError, OutOfBoundsDatetime): pass + elif is_datetime64_object(self): + # GH#28286 cython semantics for __rsub__, `other` is actually + # the Timestamp + return type(other)(self) - other - # scalar Timestamp/datetime - Timedelta -> yields a Timestamp (with - # same timezone if specified) - return datetime.__sub__(self, other) + return NotImplemented cdef 
int64_t _maybe_convert_value_to_local(self): """Convert UTC i8 value to local i8 value if tz exists""" diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index f2dcd37b191edf..b29c8418960720 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -138,6 +138,10 @@ cpdef get_freq_code(freqstr): ------- return : tuple of base frequency code and stride (mult) + Raises + ------ + TypeError : if passed a tuple witth incorrect types + Examples -------- >>> get_freq_code('3D') @@ -156,16 +160,16 @@ cpdef get_freq_code(freqstr): if is_integer_object(freqstr[0]) and is_integer_object(freqstr[1]): # e.g., freqstr = (2000, 1) return freqstr + elif is_integer_object(freqstr[0]): + # Note: passing freqstr[1] below will raise TypeError if that + # is not a str + code = _period_str_to_code(freqstr[1]) + stride = freqstr[0] + return code, stride else: # e.g., freqstr = ('T', 5) - try: - code = _period_str_to_code(freqstr[0]) - stride = freqstr[1] - except: - if is_integer_object(freqstr[1]): - raise - code = _period_str_to_code(freqstr[1]) - stride = freqstr[0] + code = _period_str_to_code(freqstr[0]) + stride = freqstr[1] return code, stride if is_integer_object(freqstr): @@ -177,7 +181,7 @@ cpdef get_freq_code(freqstr): return code, stride -cpdef _base_and_stride(freqstr): +cpdef _base_and_stride(str freqstr): """ Return base freq and stride info from string representation @@ -207,7 +211,7 @@ cpdef _base_and_stride(freqstr): return base, stride -cpdef _period_str_to_code(freqstr): +cpdef _period_str_to_code(str freqstr): freqstr = _lite_rule_alias.get(freqstr, freqstr) if freqstr not in _dont_uppercase: diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index eb99f090e85657..3da3d1e4b1b414 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -587,15 +587,11 @@ def try_parse_dates(object[:] values, parser=None, else: parse_date = parser - try: - for i in range(n): - if values[i] == '': - result[i] = np.nan - else: - result[i] = parse_date(values[i]) - except Exception: - # raise if passed parser and it failed - raise + for i in range(n): + if values[i] == '': + result[i] = np.nan + else: + result[i] = parse_date(values[i]) return result.base # .base to access underlying ndarray @@ -814,7 +810,7 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, if dt_str_parse is None or dt_str_split is None: return None - if not isinstance(dt_str, (str, unicode)): + if not isinstance(dt_str, str): return None day_attribute_and_format = (('day',), '%d', 2) @@ -840,19 +836,16 @@ def _guess_datetime_format(dt_str, dayfirst=False, dt_str_parse=du_parse, try: parsed_datetime = dt_str_parse(dt_str, dayfirst=dayfirst) - except: + except (ValueError, OverflowError): # In case the datetime can't be parsed, its format cannot be guessed return None if parsed_datetime is None: return None - try: - tokens = dt_str_split(dt_str) - except: - # In case the datetime string can't be split, its format cannot - # be guessed - return None + # the default dt_str_split from dateutil will never raise here; we assume + # that any user-provided function will not either. 
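Stepping back from the Cython details, the ``c_timestamp.pyx`` changes above have two user-visible effects worth sketching (the dates below are arbitrary): ``np.datetime64 - Timestamp`` now returns a ``Timedelta`` instead of raising, and integer-array arithmetic on a ``Timestamp`` without a ``freq`` raises ``NullFrequencyError``.

.. code-block:: python

   import numpy as np
   import pandas as pd

   ts = pd.Timestamp("2019-10-01")

   # GH 28286: previously raised TypeError; now returns Timedelta('1 days')
   np.datetime64("2019-10-02") - ts

   # GH 28268: without a freq, adding an integer array raises
   # NullFrequencyError (a ValueError subclass) instead of a bare ValueError.
   try:
       ts + np.array([1, 2])
   except ValueError as err:
       print(type(err).__name__, err)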
+ tokens = dt_str_split(dt_str) format_guess = [None] * len(tokens) found_attrs = set() diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index d93858cff5e053..fbda5f178e1647 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -341,7 +341,8 @@ def array_strptime(object[:] values, object fmt, return result, result_timezone.base -"""_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored +""" +_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored from the standard library, see https://github.com/python/cpython/blob/master/Lib/_strptime.py The original module-level docstring follows. @@ -363,7 +364,8 @@ def _getlang(): class LocaleTime: - """Stores and handles locale-specific information related to time. + """ + Stores and handles locale-specific information related to time. ATTRIBUTES: f_weekday -- full weekday names (7-item list) @@ -382,7 +384,8 @@ class LocaleTime: """ def __init__(self): - """Set all attributes. + """ + Set all attributes. Order of methods called matters for dependency reasons. @@ -399,7 +402,6 @@ class LocaleTime: Only other possible issue is if someone changed the timezone and did not call tz.tzset . That is an issue for the programmer, though, since changing the timezone is worthless without that call. - """ self.lang = _getlang() self.__calc_weekday() @@ -518,15 +520,16 @@ class TimeRE(dict): """ def __init__(self, locale_time=None): - """Create keys/values. + """ + Create keys/values. Order of execution is important for dependency reasons. - """ if locale_time: self.locale_time = locale_time else: self.locale_time = LocaleTime() + self._Z = None base = super() base.__init__({ # The " \d" part of the regex is to make %c from ANSI C work @@ -555,21 +558,29 @@ class TimeRE(dict): 'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'), 'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'), 'p': self.__seqToRE(self.locale_time.am_pm, 'p'), - 'Z': self.__seqToRE(pytz.all_timezones, 'Z'), + # 'Z' key is generated lazily via __getitem__ '%': '%'}) base.__setitem__('W', base.__getitem__('U').replace('U', 'W')) base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) base.__setitem__('x', self.pattern(self.locale_time.LC_date)) base.__setitem__('X', self.pattern(self.locale_time.LC_time)) + def __getitem__(self, key): + if key == "Z": + # lazy computation + if self._Z is None: + self._Z = self.__seqToRE(pytz.all_timezones, 'Z') + return self._Z + return super().__getitem__(key) + def __seqToRE(self, to_convert, directive): - """Convert a list to a regex string for matching a directive. + """ + Convert a list to a regex string for matching a directive. Want possible matching values to be from longest to shortest. This prevents the possibility of a match occurring for a value that also a substring of a larger value that should have matched (e.g., 'abc' matching when 'abcdef' should have been the match). - """ to_convert = sorted(to_convert, key=len, reverse=True) for value in to_convert: @@ -582,11 +593,11 @@ class TimeRE(dict): return '%s)' % regex def pattern(self, format): - """Return regex pattern for the format string. + """ + Return regex pattern for the format string. Need to make sure that any characters that might be interpreted as regex syntax are escaped. 
- """ processed_format = '' # The sub() call escapes all characters that might be misconstrued @@ -619,7 +630,8 @@ _regex_cache = {} cdef int _calc_julian_from_U_or_W(int year, int week_of_year, int day_of_week, int week_starts_Mon): - """Calculate the Julian day based on the year, week of the year, and day of + """ + Calculate the Julian day based on the year, week of the year, and day of the week, with week_start_day representing whether the week of the year assumes the week starts on Sunday or Monday (6 or 0). @@ -660,8 +672,10 @@ cdef int _calc_julian_from_U_or_W(int year, int week_of_year, return 1 + days_to_week + day_of_week -cdef object _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday): - """Calculate the Julian day based on the ISO 8601 year, week, and weekday. +cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday): + """ + Calculate the Julian day based on the ISO 8601 year, week, and weekday. + ISO weeks start on Mondays, with week 01 being the week containing 4 Jan. ISO week days range from 1 (Monday) to 7 (Sunday). @@ -694,7 +708,7 @@ cdef object _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday): return iso_year, ordinal -cdef parse_timezone_directive(object z): +cdef parse_timezone_directive(str z): """ Parse the '%z' directive and return a pytz.FixedOffset diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index d24aafae0967df..ad7c32ca319405 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -228,8 +228,13 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): # this is where all of the error handling will take place. try: for i in range(n): - result[i] = parse_timedelta_string(values[i]) - except: + if values[i] is NaT: + # we allow this check in the fast-path because NaT is a C-object + # so this is an inexpensive check + iresult[i] = NPY_NAT + else: + result[i] = parse_timedelta_string(values[i]) + except (TypeError, ValueError): unit = parse_timedelta_unit(unit) for i in range(n): try: @@ -309,7 +314,7 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1: return (base * m) + (frac * m) -cdef inline parse_timedelta_string(object ts): +cdef inline int64_t parse_timedelta_string(str ts) except? -1: """ Parse a regular format timedelta string. Return an int64_t (in ns) or raise a ValueError on an invalid parse. diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 07c2805dd0ef61..65f4e98708f47e 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -4,11 +4,7 @@ from cpython cimport PyTypeObject cdef extern from *: """ PyObject* char_to_string(const char* data) { - #if PY_VERSION_HEX >= 0x03000000 return PyUnicode_FromString(data); - #else - return PyString_FromString(data); - #endif } """ object char_to_string(const char* data) @@ -18,7 +14,6 @@ cdef extern from "Python.h": # Note: importing extern-style allows us to declare these as nogil # functions, whereas `from cpython cimport` does not. 
bint PyUnicode_Check(object obj) nogil - bint PyString_Check(object obj) nogil bint PyBool_Check(object obj) nogil bint PyFloat_Check(object obj) nogil bint PyComplex_Check(object obj) nogil diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index d1aecf0a9d2947..e5d78dae9c0233 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -3,11 +3,6 @@ from cython import Py_ssize_t from cpython cimport PyBytes_GET_SIZE, PyUnicode_GET_SIZE -try: - from cpython cimport PyString_GET_SIZE -except ImportError: - from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE - import numpy as np from numpy cimport ndarray, uint8_t @@ -126,11 +121,9 @@ def max_len_string_array(pandas_string[:] arr) -> Py_ssize_t: for i in range(length): val = arr[i] if isinstance(val, str): - l = PyString_GET_SIZE(val) + l = PyUnicode_GET_SIZE(val) elif isinstance(val, bytes): l = PyBytes_GET_SIZE(val) - elif isinstance(val, unicode): - l = PyUnicode_GET_SIZE(val) if l > m: m = l diff --git a/pandas/_typing.py b/pandas/_typing.py index 837a7a89e0b839..37a5d7945955de 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -11,9 +11,9 @@ from pandas.core.arrays.base import ExtensionArray # noqa: F401 from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 from pandas.core.indexes.base import Index # noqa: F401 - from pandas.core.frame import DataFrame # noqa: F401 from pandas.core.series import Series # noqa: F401 from pandas.core.sparse.series import SparseSeries # noqa: F401 + from pandas.core.generic import NDFrame # noqa: F401 AnyArrayLike = TypeVar( @@ -24,7 +24,10 @@ Dtype = Union[str, np.dtype, "ExtensionDtype"] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] -FrameOrSeries = TypeVar("FrameOrSeries", "Series", "DataFrame") +FrameOrSeries = TypeVar("FrameOrSeries", bound="NDFrame") Scalar = Union[str, int, float] Axis = Union[str, int] Ordered = Optional[bool] + +# to maintain type information across generic functions and parametrization +_T = TypeVar("_T") diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index b32da8da3a1fbe..9c778f68727c6b 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -15,6 +15,7 @@ PY35 = sys.version_info[:2] == (3, 5) PY36 = sys.version_info >= (3, 6) PY37 = sys.version_info >= (3, 7) +PY38 = sys.version_info >= (3, 8) PYPY = platform.python_implementation() == "PyPy" diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index bca33513b00698..87240a9f986c33 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -196,10 +196,6 @@ def load_newobj_ex(self): def load(fh, encoding=None, is_verbose=False): """load a pickle, with a provided encoding - if compat is True: - fake the old class hierarchy - if it works, then return the new type objects - Parameters ---------- fh : a filelike object diff --git a/pandas/conftest.py b/pandas/conftest.py index 2cf7bf6a6df41c..b032e14d8f7e1d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -123,18 +123,22 @@ def ip(): @pytest.fixture(params=[True, False, None]) def observed(request): - """ pass in the observed keyword to groupby for [True, False] + """ + Pass in the observed keyword to groupby for [True, False] This indicates whether categoricals should return values for values which are not in the grouper [False / None], or only values which appear in the grouper [True]. 
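Since the ``observed`` fixture docstring above is fairly abstract, a small hedged example of the keyword it parametrises (the categorical and column names are invented for illustration):

.. code-block:: python

   import pandas as pd

   cat = pd.Categorical(["a", "a"], categories=["a", "b"])
   df = pd.DataFrame({"key": cat, "val": [1, 2]})

   # observed=False (or None) keeps the unused category "b" in the result;
   # observed=True only returns groups that actually appear in the data.
   df.groupby("key", observed=False)["val"].sum()
   df.groupby("key", observed=True)["val"].sum()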
[None] is supported for future compatibility if we decide to change the default (and would need to warn if this - parameter is not passed)""" + parameter is not passed). + """ return request.param @pytest.fixture(params=[True, False, None]) def ordered_fixture(request): - """Boolean 'ordered' parameter for Categorical.""" + """ + Boolean 'ordered' parameter for Categorical. + """ return request.param @@ -234,7 +238,8 @@ def cython_table_items(request): def _get_cython_table_params(ndframe, func_names_and_expected): - """combine frame, functions from SelectionMixin._cython_table + """ + Combine frame, functions from SelectionMixin._cython_table keys and expected result. Parameters @@ -242,7 +247,7 @@ def _get_cython_table_params(ndframe, func_names_and_expected): ndframe : DataFrame or Series func_names_and_expected : Sequence of two items The first item is a name of a NDFrame method ('sum', 'prod') etc. - The second item is the expected return value + The second item is the expected return value. Returns ------- @@ -341,7 +346,8 @@ def strict_data_files(pytestconfig): @pytest.fixture def datapath(strict_data_files): - """Get the path to a data file. + """ + Get the path to a data file. Parameters ---------- @@ -375,7 +381,9 @@ def deco(*args): @pytest.fixture def iris(datapath): - """The iris dataset as a DataFrame.""" + """ + The iris dataset as a DataFrame. + """ return pd.read_csv(datapath("data", "iris.csv")) @@ -504,7 +512,8 @@ def tz_aware_fixture(request): @pytest.fixture(params=STRING_DTYPES) def string_dtype(request): - """Parametrized fixture for string dtypes. + """ + Parametrized fixture for string dtypes. * str * 'str' @@ -515,7 +524,8 @@ def string_dtype(request): @pytest.fixture(params=BYTES_DTYPES) def bytes_dtype(request): - """Parametrized fixture for bytes dtypes. + """ + Parametrized fixture for bytes dtypes. * bytes * 'bytes' @@ -525,7 +535,8 @@ def bytes_dtype(request): @pytest.fixture(params=OBJECT_DTYPES) def object_dtype(request): - """Parametrized fixture for object dtypes. + """ + Parametrized fixture for object dtypes. * object * 'object' @@ -535,7 +546,8 @@ def object_dtype(request): @pytest.fixture(params=DATETIME64_DTYPES) def datetime64_dtype(request): - """Parametrized fixture for datetime64 dtypes. + """ + Parametrized fixture for datetime64 dtypes. * 'datetime64[ns]' * 'M8[ns]' @@ -545,7 +557,8 @@ def datetime64_dtype(request): @pytest.fixture(params=TIMEDELTA64_DTYPES) def timedelta64_dtype(request): - """Parametrized fixture for timedelta64 dtypes. + """ + Parametrized fixture for timedelta64 dtypes. 
* 'timedelta64[ns]' * 'm8[ns]' diff --git a/pandas/core/api.py b/pandas/core/api.py index 73323d93b8215a..bd2a57a15bdd2b 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -2,6 +2,16 @@ import numpy as np +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.missing import isna, isnull, notna, notnull + +from pandas.core.algorithms import factorize, unique, value_counts +from pandas.core.arrays import Categorical from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, @@ -12,45 +22,38 @@ UInt32Dtype, UInt64Dtype, ) -from pandas.core.algorithms import factorize, unique, value_counts -from pandas.core.dtypes.missing import isna, isnull, notna, notnull -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - PeriodDtype, - IntervalDtype, - DatetimeTZDtype, -) -from pandas.core.arrays import Categorical from pandas.core.construction import array + from pandas.core.groupby import Grouper, NamedAgg -from pandas.io.formats.format import set_eng_float_format + +# DataFrame needs to be imported after NamedAgg to avoid a circular import +from pandas.core.frame import DataFrame # isort:skip from pandas.core.index import ( - Index, CategoricalIndex, - Int64Index, - UInt64Index, - RangeIndex, + DatetimeIndex, Float64Index, - MultiIndex, + Index, + Int64Index, IntervalIndex, - TimedeltaIndex, - DatetimeIndex, - PeriodIndex, + MultiIndex, NaT, + PeriodIndex, + RangeIndex, + TimedeltaIndex, + UInt64Index, ) +from pandas.core.indexes.datetimes import Timestamp, bdate_range, date_range +from pandas.core.indexes.interval import Interval, interval_range from pandas.core.indexes.period import Period, period_range from pandas.core.indexes.timedeltas import Timedelta, timedelta_range -from pandas.core.indexes.datetimes import Timestamp, date_range, bdate_range -from pandas.core.indexes.interval import Interval, interval_range - -from pandas.core.series import Series -from pandas.core.frame import DataFrame - -# TODO: Remove import when statsmodels updates #18264 -from pandas.core.reshape.reshape import get_dummies - from pandas.core.indexing import IndexSlice -from pandas.core.tools.numeric import to_numeric -from pandas.tseries.offsets import DateOffset +from pandas.core.reshape.reshape import ( + get_dummies, +) # TODO: Remove get_dummies import when statsmodels updates #18264 +from pandas.core.series import Series from pandas.core.tools.datetimes import to_datetime +from pandas.core.tools.numeric import to_numeric from pandas.core.tools.timedeltas import to_timedelta + +from pandas.io.formats.format import set_eng_float_format +from pandas.tseries.offsets import DateOffset diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 5c8599dbb054b6..e6766a33a613b2 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -3,7 +3,7 @@ import numpy as np -from pandas._libs import reduction +from pandas._libs import reduction as libreduction from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( @@ -199,20 +199,21 @@ def apply_empty_result(self): return self.obj.copy() # we may need to infer - reduce = self.result_type == "reduce" + should_reduce = self.result_type == "reduce" from pandas import Series - if not reduce: + if not should_reduce: EMPTY_SERIES = Series([]) try: r = self.f(EMPTY_SERIES, *self.args, **self.kwds) - reduce = not isinstance(r, Series) except Exception: pass + else: + should_reduce = not isinstance(r, Series) - if reduce: + if 
should_reduce: return self.obj._constructor_sliced(np.nan, index=self.agg_axis) else: return self.obj.copy() @@ -221,7 +222,7 @@ def apply_raw(self): """ apply to the values as a numpy array """ try: - result = reduction.compute_reduction(self.values, self.f, axis=self.axis) + result = libreduction.compute_reduction(self.values, self.f, axis=self.axis) except Exception: result = np.apply_along_axis(self.f, self.axis, self.values) @@ -281,7 +282,7 @@ def apply_standard(self): dummy = Series(empty_arr, index=index, dtype=values.dtype) try: - result = reduction.compute_reduction( + result = libreduction.compute_reduction( values, self.f, axis=self.axis, dummy=dummy, labels=labels ) return self.obj._constructor_sliced(result, index=labels) @@ -306,10 +307,11 @@ def apply_series_generator(self): for i, v in enumerate(series_gen): try: results[i] = self.f(v) - keys.append(v.name) - successes.append(i) except Exception: pass + else: + keys.append(v.name) + successes.append(i) # so will work with MultiIndex if len(successes) < len(res_index): diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5c121172d0e4fc..0778b6726d1041 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -514,7 +514,7 @@ def fillna(self, value=None, method=None, limit=None): def dropna(self): """ - Return ExtensionArray without NA values + Return ExtensionArray without NA values. Returns ------- @@ -957,7 +957,7 @@ def _concat_same_type( cls, to_concat: Sequence[ABCExtensionArray] ) -> ABCExtensionArray: """ - Concatenate multiple array + Concatenate multiple array. Parameters ---------- diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a895da6184eeba..5929a8d51fe430 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -471,7 +471,7 @@ def ordered(self) -> Ordered: @property def dtype(self) -> CategoricalDtype: """ - The :class:`~pandas.api.types.CategoricalDtype` for this instance + The :class:`~pandas.api.types.CategoricalDtype` for this instance. """ return self._dtype diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1988726edc79b9..bda5f8f4326f18 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1300,7 +1300,7 @@ def __sub__(self, other): return result def __rsub__(self, other): - if is_datetime64_any_dtype(other) and is_timedelta64_dtype(self): + if is_datetime64_any_dtype(other) and is_timedelta64_dtype(self.dtype): # ndarray[datetime64] cannot be subtracted from self, so # we need to wrap in DatetimeArray/Index and flip the operation if not isinstance(other, DatetimeLikeArrayMixin): @@ -1310,9 +1310,9 @@ def __rsub__(self, other): other = DatetimeArray(other) return other - self elif ( - is_datetime64_any_dtype(self) + is_datetime64_any_dtype(self.dtype) and hasattr(other, "dtype") - and not is_datetime64_any_dtype(other) + and not is_datetime64_any_dtype(other.dtype) ): # GH#19959 datetime - datetime is well-defined as timedelta, # but any other type - datetime is not well-defined. @@ -1321,13 +1321,21 @@ def __rsub__(self, other): cls=type(self).__name__, typ=type(other).__name__ ) ) - elif is_period_dtype(self) and is_timedelta64_dtype(other): + elif is_period_dtype(self.dtype) and is_timedelta64_dtype(other): # TODO: Can we simplify/generalize these cases at all? 
raise TypeError( "cannot subtract {cls} from {dtype}".format( cls=type(self).__name__, dtype=other.dtype ) ) + elif is_timedelta64_dtype(self.dtype): + if lib.is_integer(other) or is_integer_dtype(other): + # need to subtract before negating, since that flips freq + # -self flips self.freq, messing up results + return -(self - other) + + return (-self) + other + return -(self - other) # FIXME: DTA/TDA/PA inplace methods should actually be inplace, GH#24115 diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 093334a815938e..5dff1f93264c3e 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1063,6 +1063,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): Be careful with DST changes. When there is sequential data, pandas can infer the DST time: + >>> s = pd.to_datetime(pd.Series(['2018-10-28 01:30:00', ... '2018-10-28 02:00:00', ... '2018-10-28 02:30:00', @@ -1094,6 +1095,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` or `'shift_backwards'`. + >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', ... '2015-03-29 03:30:00'])) >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') @@ -1158,7 +1160,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise", errors=None): def to_pydatetime(self): """ Return Datetime Array/Index as object ndarray of datetime.datetime - objects + objects. Returns ------- @@ -1283,7 +1285,7 @@ def to_perioddelta(self, freq): """ Calculate TimedeltaArray of difference between index values and index converted to PeriodArray at specified - freq. Used for vectorized offsets + freq. Used for vectorized offsets. Parameters ---------- @@ -2282,7 +2284,8 @@ def _infer_tz_from_endpoints(start, end, tz): """ try: inferred_tz = timezones.infer_tzinfo(start, end) - except Exception: + except AssertionError: + # infer_tzinfo raises AssertionError if passed mismatched timezones raise TypeError( "Start and end cannot both be tz-aware with different timezones" ) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 7a14d6f1b619aa..1f4b76a259f00c 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -129,9 +129,9 @@ ``Interval`` objects: >>> pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]) - IntervalArray([(0, 1], (1, 5]], - closed='right', - dtype='interval[int64]') + + [(0, 1], (1, 5]] + Length: 2, closed: right, dtype: interval[int64] It may also be constructed using one of the constructor methods: :meth:`IntervalArray.from_arrays`, @@ -248,9 +248,8 @@ def _from_factorized(cls, values, original): values = values.astype(original.dtype.subtype) return cls(values, closed=original.closed) - _interval_shared_docs[ - "from_breaks" - ] = """ + _interval_shared_docs["from_breaks"] = textwrap.dedent( + """ Construct an %(klass)s from an array of splits. Parameters @@ -277,24 +276,34 @@ def _from_factorized(cls, values, original): %(klass)s.from_arrays : Construct from a left and right array. %(klass)s.from_tuples : Construct from a sequence of tuples. 
- Examples - -------- - >>> pd.%(qualname)s.from_breaks([0, 1, 2, 3]) - %(klass)s([(0, 1], (1, 2], (2, 3]], - closed='right', - dtype='interval[int64]') + %(examples)s\ """ + ) @classmethod - @Appender(_interval_shared_docs["from_breaks"] % _shared_docs_kwargs) + @Appender( + _interval_shared_docs["from_breaks"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + Examples + -------- + >>> pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3]) + <IntervalArray> + [(0, 1], (1, 2], (2, 3]] + Length: 3, closed: right, dtype: interval[int64] + """ + ), + ) + ) def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): breaks = maybe_convert_platform_interval(breaks) return cls.from_arrays(breaks[:-1], breaks[1:], closed, copy=copy, dtype=dtype) - _interval_shared_docs[ - "from_arrays" - ] = """ + _interval_shared_docs["from_arrays"] = textwrap.dedent( + """ Construct from two arrays defining the left and right bounds. Parameters @@ -340,16 +349,25 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): using an unsupported type for `left` or `right`. At the moment, 'category', 'object', and 'string' subtypes are not supported. - Examples - -------- - >>> %(klass)s.from_arrays([0, 1, 2], [1, 2, 3]) - %(klass)s([(0, 1], (1, 2], (2, 3]], - closed='right', - dtype='interval[int64]') + %(examples)s\ """ + ) @classmethod - @Appender(_interval_shared_docs["from_arrays"] % _shared_docs_kwargs) + @Appender( + _interval_shared_docs["from_arrays"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + >>> pd.arrays.IntervalArray.from_arrays([0, 1, 2], [1, 2, 3]) + <IntervalArray> + [(0, 1], (1, 2], (2, 3]] + Length: 3, closed: right, dtype: interval[int64] + """ + ), + ) + ) def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): left = maybe_convert_platform_interval(left) right = maybe_convert_platform_interval(right) @@ -358,9 +376,8 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): left, right, closed, copy=copy, dtype=dtype, verify_integrity=True ) - _interval_shared_docs[ - "from_tuples" - ] = """ + _interval_shared_docs["from_tuples"] = textwrap.dedent( + """ Construct an %(klass)s from an array-like of tuples. Parameters @@ -389,15 +406,27 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): %(klass)s.from_breaks : Construct an %(klass)s from an array of splits. - Examples - -------- - >>> pd.%(qualname)s.from_tuples([(0, 1), (1, 2)]) - %(klass)s([(0, 1], (1, 2]], - closed='right', dtype='interval[int64]') + %(examples)s\ """ + ) @classmethod - @Appender(_interval_shared_docs["from_tuples"] % _shared_docs_kwargs) + @Appender( + _interval_shared_docs["from_tuples"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + Examples + -------- + >>> pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2)]) + <IntervalArray> + [(0, 1], (1, 2]] + Length: 2, closed: right, dtype: interval[int64] + """ + ), + ) + ) def from_tuples(cls, data, closed="right", copy=False, dtype=None): if len(data): left, right = [], [] @@ -832,16 +861,20 @@ def _format_data(self): return summary def __repr__(self): - tpl = textwrap.dedent( - """\ - {cls}({data}, - {lead}closed='{closed}', - {lead}dtype='{dtype}')""" + template = ( + "{class_name}" + "{data}\n" + "Length: {length}, closed: {closed}, dtype: {dtype}" ) - return tpl.format( - cls=self.__class__.__name__, - data=self._format_data(), - lead=" " * len(self.__class__.__name__) + " ", + # the short repr has no trailing newline, while the truncated + # repr does.
So we include a newline in our template, and strip + # any trailing newlines from format_object_summary + data = self._format_data() + class_name = "<{}>\n".format(self.__class__.__name__) + return template.format( + class_name=class_name, + data=data, + length=len(self), closed=self.closed, dtype=self.dtype, ) @@ -874,9 +907,8 @@ def closed(self): """ return self._closed - _interval_shared_docs[ - "set_closed" - ] = """ + _interval_shared_docs["set_closed"] = textwrap.dedent( + """ Return an %(klass)s identical to the current one, but closed on the specified side. @@ -892,20 +924,31 @@ def closed(self): ------- new_index : %(klass)s + %(examples)s\ + """ + ) + + @Appender( + _interval_shared_docs["set_closed"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ Examples -------- - >>> index = pd.interval_range(0, 3) + >>> index = pd.arrays.IntervalArray.from_breaks(range(4)) >>> index - IntervalIndex([(0, 1], (1, 2], (2, 3]], - closed='right', - dtype='interval[int64]') + <IntervalArray> + [(0, 1], (1, 2], (2, 3]] + Length: 3, closed: right, dtype: interval[int64] >>> index.set_closed('both') - IntervalIndex([[0, 1], [1, 2], [2, 3]], - closed='both', - dtype='interval[int64]') + <IntervalArray> + [[0, 1], [1, 2], [2, 3]] + Length: 3, closed: both, dtype: interval[int64] """ - - @Appender(_interval_shared_docs["set_closed"] % _shared_docs_kwargs) + ), + ) + ) def set_closed(self, closed): if closed not in _VALID_CLOSED: msg = "invalid option for 'closed': {closed}" @@ -1028,9 +1071,8 @@ def repeat(self, repeats, axis=None): right_repeat = self.right.repeat(repeats) return self._shallow_copy(left=left_repeat, right=right_repeat) - _interval_shared_docs[ - "contains" - ] = """ + _interval_shared_docs["contains"] = textwrap.dedent( + """ Check elementwise if the Intervals contain the value. Return a boolean mask whether the value is contained in the Intervals @@ -1055,16 +1097,27 @@ def repeat(self, repeats, axis=None): Examples -------- - >>> intervals = pd.%(qualname)s.from_tuples([(0, 1), (1, 3), (2, 4)]) - >>> intervals - %(klass)s([(0, 1], (1, 3], (2, 4]], - closed='right', - dtype='interval[int64]') + %(examples)s >>> intervals.contains(0.5) array([ True, False, False]) """ + ) - @Appender(_interval_shared_docs["contains"] % _shared_docs_kwargs) + @Appender( + _interval_shared_docs["contains"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + >>> intervals = pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 3), (2, 4)]) + >>> intervals + <IntervalArray> + [(0, 1], (1, 3], (2, 4]] + Length: 3, closed: right, dtype: interval[int64] + """ + ), + ) + ) def contains(self, other): if isinstance(other, Interval): raise NotImplementedError("contains not implemented for two intervals") @@ -1073,9 +1126,8 @@ def contains(self, other): other < self.right if self.open_right else other <= self.right ) - _interval_shared_docs[ - "overlaps" - ] = """ + _interval_shared_docs["overlaps"] = textwrap.dedent( + """ Check elementwise if an Interval overlaps the values in the %(klass)s. Two intervals overlap if they share a common point, including closed @@ -1086,7 +1138,7 @@ def contains(self, other): Parameters ---------- - other : Interval + other : %(klass)s Interval to check against for an overlap.
Returns @@ -1100,11 +1152,7 @@ def contains(self, other): Examples -------- - >>> intervals = pd.%(qualname)s.from_tuples([(0, 1), (1, 3), (2, 4)]) - >>> intervals - %(klass)s([(0, 1], (1, 3], (2, 4]], - closed='right', - dtype='interval[int64]') + %(examples)s >>> intervals.overlaps(pd.Interval(0.5, 1.5)) array([ True, True, False]) @@ -1117,9 +1165,25 @@ def contains(self, other): >>> intervals.overlaps(pd.Interval(1, 2, closed='right')) array([False, True, False]) - """ + """ + ) - @Appender(_interval_shared_docs["overlaps"] % _shared_docs_kwargs) + @Appender( + _interval_shared_docs["overlaps"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + >>> data = [(0, 1), (1, 3), (2, 4)] + >>> intervals = pd.arrays.IntervalArray.from_tuples(data) + >>> intervals + <IntervalArray> + [(0, 1], (1, 3], (2, 4]] + Length: 3, closed: right, dtype: interval[int64] + """ + ), + ) + ) def overlaps(self, other): if isinstance(other, (IntervalArray, ABCIntervalIndex)): raise NotImplementedError diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 20ce11c70c3443..f2d74794eadf53 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -426,7 +426,7 @@ def __array__(self, dtype=None): @property def is_leap_year(self): """ - Logical indicating if the date belongs to a leap year + Logical indicating if the date belongs to a leap year. """ return isleapyear_arr(np.asarray(self.year)) diff --git a/pandas/core/base.py b/pandas/core/base.py index 7d2a62318232c3..2d5ffb5e913923 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -47,7 +47,6 @@ class PandasObject(DirNamesMixin): - """baseclass for various pandas objects""" @property @@ -1462,7 +1461,7 @@ def is_monotonic_decreasing(self): def memory_usage(self, deep=False): """ - Memory usage of the values + Memory usage of the values. Parameters ---------- diff --git a/pandas/core/common.py b/pandas/core/common.py index a507625ccfa01f..cf113c8aecbfe5 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -211,7 +211,7 @@ def try_sort(iterable): listed = list(iterable) try: return sorted(listed) - except Exception: + except TypeError: return listed diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index a58f256cf61d41..45319a4d63d948 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -367,8 +367,8 @@ def f(cls): @disallow(_unsupported_nodes) @add_ops(_op_classes) class BaseExprVisitor(ast.NodeVisitor): - - """Custom ast walker. Parsers of other engines should subclass this class + """ + Custom ast walker. Parsers of other engines should subclass this class if necessary. Parameters @@ -582,6 +582,9 @@ def visit_NameConstant(self, node, **kwargs): def visit_Num(self, node, **kwargs): return self.const_type(node.n, self.env) + def visit_Constant(self, node, **kwargs): + return self.const_type(node.n, self.env) + def visit_Str(self, node, **kwargs): name = self.env.add_tmp(node.s) return self.term_type(name, self.env) @@ -800,8 +803,8 @@ def __init__(self, env, engine, parser, preparser=lambda x: x): class Expr: - - """Object encapsulating an expression. + """ + Object encapsulating an expression.
Parameters ---------- diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 29c8239fa518fc..90bb12b4cd727f 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -62,8 +62,9 @@ def set_numexpr_threads(n=None): ne.set_num_threads(n) -def _evaluate_standard(op, op_str, a, b, **eval_kwargs): +def _evaluate_standard(op, op_str, a, b, reversed=False): """ standard evaluation """ + # `reversed` kwarg is included for compatibility with _evaluate_numexpr if _TEST_MODE: _store_test_result(False) with np.errstate(all="ignore"): @@ -96,7 +97,7 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): return False -def _evaluate_numexpr(op, op_str, a, b, truediv=True, reversed=False, **eval_kwargs): +def _evaluate_numexpr(op, op_str, a, b, reversed=False): result = None if _can_use_numexpr(op, op_str, a, b, "evaluate"): @@ -111,8 +112,6 @@ def _evaluate_numexpr(op, op_str, a, b, truediv=True, reversed=False, **eval_kwa "a_value {op} b_value".format(op=op_str), local_dict={"a_value": a_value, "b_value": b_value}, casting="safe", - truediv=truediv, - **eval_kwargs ) except ValueError as detail: if "unknown type object" in str(detail): @@ -201,7 +200,7 @@ def _bool_arith_check( return True -def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs): +def evaluate(op, op_str, a, b, use_numexpr=True, reversed=False): """ Evaluate and return the expression of the op on a and b. @@ -214,11 +213,12 @@ def evaluate(op, op_str, a, b, use_numexpr=True, **eval_kwargs): b : right operand use_numexpr : bool, default True Whether to try to use numexpr. + reversed : bool, default False """ use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) if use_numexpr: - return _evaluate(op, op_str, a, b, **eval_kwargs) + return _evaluate(op, op_str, a, b, reversed=reversed) return _evaluate_standard(op, op_str, a, b) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 1523eb05ac41dd..81658ab23ba466 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -478,7 +478,6 @@ def _validate_where(w): class Expr(expr.Expr): - """ hold a pytables like expression, comprised of possibly multiple 'terms' Parameters @@ -573,7 +572,6 @@ def evaluate(self): class TermValue: - """ hold a term value the we use to construct a condition/filter """ def __init__(self, value, converted, kind): diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 08dce6aca6e6d1..dfc80140433f8e 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -9,8 +9,6 @@ module is imported, register them here rather then in the module. """ -import importlib - import pandas._config.config as cf from pandas._config.config import ( is_bool, @@ -581,26 +579,12 @@ def use_inf_as_na_cb(key): def register_plotting_backend_cb(key): - backend_str = cf.get_option(key) - if backend_str == "matplotlib": - try: - import pandas.plotting._matplotlib # noqa - except ImportError: - raise ImportError( - "matplotlib is required for plotting when the " - 'default backend "matplotlib" is selected.' - ) - else: - return + if key == "matplotlib": + # We defer matplotlib validation, since it's the default + return + from pandas.plotting._core import _get_plot_backend - try: - importlib.import_module(backend_str) - except ImportError: - raise ValueError( - '"{}" does not seem to be an installed module. 
' - "A pandas plotting backend must be a module that " - "can be imported".format(backend_str) - ) + _get_plot_backend(key) with cf.config_prefix("plotting"): @@ -608,8 +592,7 @@ def register_plotting_backend_cb(key): "backend", defval="matplotlib", doc=plotting_backend_doc, - validator=str, - cb=register_plotting_backend_cb, + validator=register_plotting_backend_cb, ) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 12f3fd2c75dc8a..1094ab22238e97 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -89,10 +89,9 @@ def concat_compat(to_concat, axis=0): # filter empty arrays # 1-d dtypes always are included here def is_nonempty(x): - try: - return x.shape[axis] > 0 - except Exception: + if x.ndim <= axis: return True + return x.shape[axis] > 0 # If all arrays are empty, there's nothing to convert, just short-cut to # the concatenation, #3121. diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index ee1866e60644b8..aa7e6801ba431c 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -23,7 +23,7 @@ ordered_sentinel = object() # type: object -def register_extension_dtype(cls: Type[ExtensionDtype],) -> Type[ExtensionDtype]: +def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]: """ Register an ExtensionType with pandas as class decorator. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f2bb964f35dbd4..16fece1c7eb8ba 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -86,12 +86,7 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.index import ( - Index, - MultiIndex, - ensure_index, - ensure_index_from_sequences, -) +from pandas.core.index import Index, ensure_index, ensure_index_from_sequences from pandas.core.indexes import base as ibase from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.multi import maybe_droplevels @@ -676,10 +671,25 @@ def _repr_html_(self): formatter = fmt.DataFrameFormatter( self, + columns=None, + col_space=None, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + justify=None, + index_names=True, + header=True, + index=True, + bold_rows=True, + escape=True, max_rows=max_rows, min_rows=min_rows, max_cols=max_cols, show_dimensions=show_dimensions, + decimal=".", + table_id=None, + render_links=False, ) return formatter.to_html(notebook=True) else: @@ -1734,7 +1744,7 @@ def to_records( if is_datetime64_any_dtype(self.index) and convert_datetime64: ix_vals = [self.index.to_pydatetime()] else: - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): # array of tuples to numpy cols. copy copy copy ix_vals = list(map(np.array, zip(*self.index.values))) else: @@ -1745,7 +1755,7 @@ def to_records( count = 0 index_names = list(self.index.names) - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): for i, n in enumerate(index_names): if n is None: index_names[i] = "level_%d" % count @@ -2868,7 +2878,7 @@ def __getitem__(self, key): # The behavior is inconsistent. 
It returns a Series, except when # - the key itself is repeated (test on data.shape, #9519), or # - we have a MultiIndex on columns (test on self.columns, #21309) - if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): + if data.shape[1] == 1 and not isinstance(self.columns, ABCMultiIndex): data = data[key] return data @@ -3657,7 +3667,7 @@ def reindexer(value): elif isinstance(value, DataFrame): # align right-hand-side columns if self.columns # is multi-index and self[key] is a sub-frame - if isinstance(self.columns, MultiIndex) and key in self.columns: + if isinstance(self.columns, ABCMultiIndex) and key in self.columns: loc = self.columns.get_loc(key) if isinstance(loc, (slice, Series, np.ndarray, Index)): cols = maybe_droplevels(self.columns[loc], key) @@ -3706,7 +3716,7 @@ def reindexer(value): # broadcast across multiple columns if necessary if broadcast and key in self.columns and value.ndim == 1: - if not self.columns.is_unique or isinstance(self.columns, MultiIndex): + if not self.columns.is_unique or isinstance(self.columns, ABCMultiIndex): existing_piece = self[key] if isinstance(existing_piece, DataFrame): value = np.tile(value, (len(existing_piece.columns), 1)) @@ -4601,7 +4611,7 @@ def _maybe_casted_values(index, labels=None): new_index = self.index.droplevel(level) if not drop: - if isinstance(self.index, MultiIndex): + if isinstance(self.index, ABCMultiIndex): names = [ n if n is not None else ("level_%d" % i) for (i, n) in enumerate(self.index.names) @@ -4612,7 +4622,7 @@ def _maybe_casted_values(index, labels=None): names = [default] if self.index.name is None else [self.index.name] to_insert = ((self.index, None),) - multi_col = isinstance(self.columns, MultiIndex) + multi_col = isinstance(self.columns, ABCMultiIndex) for i, (lev, lab) in reversed(list(enumerate(to_insert))): if not (level is None or i in level): continue @@ -4994,7 +5004,7 @@ def sort_index( level, ascending=ascending, sort_remaining=sort_remaining ) - elif isinstance(labels, MultiIndex): + elif isinstance(labels, ABCMultiIndex): from pandas.core.sorting import lexsort_indexer indexer = lexsort_indexer( @@ -5280,7 +5290,7 @@ def reorder_levels(self, order, axis=0): type of caller (new object) """ axis = self._get_axis_number(axis) - if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover + if not isinstance(self._get_axis(axis), ABCMultiIndex): # pragma: no cover raise TypeError("Can only reorder levels on a hierarchical axis.") result = self.copy() @@ -5298,12 +5308,19 @@ def _combine_frame(self, other, func, fill_value=None, level=None): this, other = self.align(other, join="outer", level=level, copy=False) new_index, new_columns = this.index, this.columns - def _arith_op(left, right): - # for the mixed_type case where we iterate over columns, - # _arith_op(left, right) is equivalent to - # left._binop(right, func, fill_value=fill_value) - left, right = ops.fill_binop(left, right, fill_value) - return func(left, right) + if fill_value is None: + # since _arith_op may be called in a loop, avoid function call + # overhead if possible by doing this check once + _arith_op = func + + else: + + def _arith_op(left, right): + # for the mixed_type case where we iterate over columns, + # _arith_op(left, right) is equivalent to + # left._binop(right, func, fill_value=fill_value) + left, right = ops.fill_binop(left, right, fill_value) + return func(left, right) if ops.should_series_dispatch(this, other, func): # iterate over columns @@ -5318,7 +5335,7 @@ def _arith_op(left, right): 
def _combine_match_index(self, other, func, level=None): left, right = self.align(other, join="outer", axis=0, level=level, copy=False) - assert left.index.equals(right.index) + # at this point we have `left.index.equals(right.index)` if left._is_mixed_type or right._is_mixed_type: # operate column-wise; avoid costly object-casting in `.values` @@ -5331,14 +5348,13 @@ def _combine_match_index(self, other, func, level=None): new_data, index=left.index, columns=self.columns, copy=False ) - def _combine_match_columns(self, other, func, level=None): - assert isinstance(other, Series) + def _combine_match_columns(self, other: Series, func, level=None): left, right = self.align(other, join="outer", axis=1, level=level, copy=False) - assert left.columns.equals(right.index) + # at this point we have `left.columns.equals(right.index)` return ops.dispatch_to_series(left, right, func, axis="columns") def _combine_const(self, other, func): - assert lib.is_scalar(other) or np.ndim(other) == 0 + # scalar other or np.ndim(other) == 0 return ops.dispatch_to_series(self, other, func) def combine(self, other, func, fill_value=None, overwrite=True): @@ -6183,14 +6199,14 @@ def stack(self, level=-1, dropna=True): def explode(self, column: Union[str, Tuple]) -> "DataFrame": """ - Transform each element of a list-like to a row, replicating the - index values. + Transform each element of a list-like to a row, replicating index values. .. versionadded:: 0.25.0 Parameters ---------- column : str or tuple + Column to explode. Returns ------- @@ -6206,8 +6222,8 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame": See Also -------- DataFrame.unstack : Pivot a level of the (necessarily hierarchical) - index labels - DataFrame.melt : Unpivot a DataFrame from wide format to long format + index labels. + DataFrame.melt : Unpivot a DataFrame from wide format to long format. Series.explode : Explode a DataFrame from list-like columns to long format. Notes @@ -7778,7 +7794,7 @@ def _count_level(self, level, axis=0, numeric_only=False): count_axis = frame._get_axis(axis) agg_axis = frame._get_agg_axis(axis) - if not isinstance(count_axis, MultiIndex): + if not isinstance(count_axis, ABCMultiIndex): raise TypeError( "Can only count levels on hierarchical " "{ax}.".format(ax=self._get_axis_name(axis)) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba1c516b9b444e..831543ee660392 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7,7 +7,17 @@ import pickle import re from textwrap import dedent -from typing import Callable, Dict, FrozenSet, List, Optional, Set +from typing import ( + Callable, + Dict, + FrozenSet, + Hashable, + List, + Optional, + Sequence, + Set, + Union, +) import warnings import weakref @@ -50,7 +60,7 @@ from pandas.core.dtypes.missing import isna, notna import pandas as pd -from pandas._typing import Dtype +from pandas._typing import Dtype, FilePathOrBuffer from pandas.core import missing, nanops import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin @@ -122,6 +132,9 @@ def _single_replace(self, to_replace, method, inplace, limit): return result +bool_t = bool # Need alias because NDFrame has def bool: + + class NDFrame(PandasObject, SelectionMixin): """ N-dimensional analogue of DataFrame. Store multi-dimensional in a @@ -1875,7 +1888,7 @@ def __iter__(self): # can we get a better explanation of this? def keys(self): """ - Get the 'info axis' (see Indexing for more) + Get the 'info axis' (see Indexing for more). 
This is index for Series, columns for DataFrame. @@ -2581,13 +2594,14 @@ def to_sql( `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. chunksize : int, optional - Rows will be written in batches of this size at a time. By default, - all rows will be written at once. - dtype : dict, optional - Specifying the datatype for columns. The keys should be the column - names and the values should be the SQLAlchemy types or strings for - the sqlite3 legacy mode. - method : {None, 'multi', callable}, default None + Specify the number of rows in each batch to be written at a time. + By default, all rows will be written at once. + dtype : dict or scalar, optional + Specifying the datatype for columns. If a dictionary is used, the + keys should be the column names and the values should be the + SQLAlchemy types or strings for the sqlite3 legacy mode. If a + scalar is provided, it will be applied to all columns. + method : {None, 'multi', callable}, optional Controls the SQL insertion clause used: * None : Uses standard SQL ``INSERT`` clause (one per row). @@ -2911,15 +2925,21 @@ def to_latex( multicolumn=None, multicolumn_format=None, multirow=None, + caption=None, + label=None, ): r""" - Render an object to a LaTeX tabular environment table. + Render object to a LaTeX tabular, longtable, or nested table/tabular. - Render an object to a tabular environment table. You can splice - this into a LaTeX document. Requires \usepackage{booktabs}. + Requires ``\usepackage{booktabs}``. The output can be copy/pasted + into a main LaTeX document or read from an external file + with ``\input{table.tex}``. .. versionchanged:: 0.20.2 - Added to Series + Added to Series. + + .. versionchanged:: 1.0.0 + Added caption and label arguments. Parameters ---------- @@ -2988,6 +3008,17 @@ def to_latex( from the pandas config module. .. versionadded:: 0.20.0 + + caption : str, optional + The LaTeX caption to be placed inside ``\caption{}`` in the output. + + .. versionadded:: 1.0.0 + + label : str, optional + The LaTeX label to be placed inside ``\label{}`` in the output. + This is used with ``\ref{}`` in the main ``.tex`` file. + + .. versionadded:: 1.0.0 %(returns)s See Also -------- @@ -3000,7 +3031,7 @@ def to_latex( >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], ... 'mask': ['red', 'purple'], ... 
'weapon': ['sai', 'bo staff']}) - >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE + >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE \begin{tabular}{lll} \toprule name & mask & weapon \\ @@ -3047,30 +3078,32 @@ def to_latex( multicolumn=multicolumn, multicolumn_format=multicolumn_format, multirow=multirow, + caption=caption, + label=label, ) def to_csv( self, - path_or_buf=None, - sep=",", - na_rep="", - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - mode="w", - encoding=None, - compression="infer", - quoting=None, - quotechar='"', - line_terminator=None, - chunksize=None, - date_format=None, - doublequote=True, - escapechar=None, - decimal=".", - ): + path_or_buf: Optional[FilePathOrBuffer] = None, + sep: str = ",", + na_rep: str = "", + float_format: Optional[str] = None, + columns: Optional[Sequence[Hashable]] = None, + header: Union[bool_t, List[str]] = True, + index: bool_t = True, + index_label: Optional[Union[bool_t, str, Sequence[Hashable]]] = None, + mode: str = "w", + encoding: Optional[str] = None, + compression: Optional[Union[str, Dict[str, str]]] = "infer", + quoting: Optional[int] = None, + quotechar: str = '"', + line_terminator: Optional[str] = None, + chunksize: Optional[int] = None, + date_format: Optional[str] = None, + doublequote: bool_t = True, + escapechar: Optional[str] = None, + decimal: Optional[str] = ".", + ) -> Optional[str]: r""" Write object to a comma-separated values (csv) file. @@ -3117,16 +3150,21 @@ def to_csv( encoding : str, optional A string representing the encoding to use in the output file, defaults to 'utf-8'. - compression : str, default 'infer' - Compression mode among the following possible values: {'infer', - 'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf` - is path-like, then detect compression from the following - extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no - compression). - - .. versionchanged:: 0.24.0 - - 'infer' option added and set to default. + compression : str or dict, default 'infer' + If str, represents compression mode. If dict, value at 'method' is + the compression mode. Compression mode may be any of the following + possible values: {'infer', 'gzip', 'bz2', 'zip', 'xz', None}. If + compression mode is 'infer' and `path_or_buf` is path-like, then + detect compression mode from the following extensions: '.gz', + '.bz2', '.zip' or '.xz'. (otherwise no compression). If dict given + and mode is 'zip' or inferred as 'zip', other entries passed as + additional compression options. + + .. versionchanged:: 0.25.0 + + May now be a dict with key 'method' as compression mode + and other entries as additional compression options if + compression mode is 'zip'. quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` @@ -3171,6 +3209,13 @@ def to_csv( ... 'weapon': ['sai', 'bo staff']}) >>> df.to_csv(index=False) 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' + + # create 'out.zip' containing 'out.csv' + >>> compression_opts = dict(method='zip', + ... archive_name='out.csv') # doctest: +SKIP + + >>> df.to_csv('out.zip', index=False, + ... 
compression=compression_opts) # doctest: +SKIP """ df = self if isinstance(self, ABCDataFrame) else self.to_frame() @@ -3204,6 +3249,8 @@ def to_csv( if path_or_buf is None: return formatter.path_or_buf.getvalue() + return None + # ---------------------------------------------------------------------- # Fancy Indexing @@ -5733,11 +5780,11 @@ def astype(self, dtype, copy=True, errors="raise", **kwargs): Control raising of exceptions on invalid data for provided dtype. - ``raise`` : allow exceptions to be raised - - ``ignore`` : suppress exceptions. On error return original object + - ``ignore`` : suppress exceptions. On error return original object. .. versionadded:: 0.20.0 - kwargs : keyword arguments to pass on to the constructor + **kwargs : keyword arguments to pass on to the constructor Returns ------- @@ -5798,7 +5845,7 @@ def astype(self, dtype, copy=True, errors="raise", **kwargs): Convert to ordered categorical type with custom ordering: >>> cat_dtype = pd.api.types.CategoricalDtype( - ... categories=[2, 1], ordered=True) + ... categories=[2, 1], ordered=True) >>> ser.astype(cat_dtype) 0 1 1 2 @@ -5808,7 +5855,7 @@ def astype(self, dtype, copy=True, errors="raise", **kwargs): Note that using ``copy=False`` and changing data on a new pandas object may propagate changes: - >>> s1 = pd.Series([1,2]) + >>> s1 = pd.Series([1, 2]) >>> s2 = s1.astype('int64', copy=False) >>> s2[0] = 10 >>> s1 # note that s1[0] has changed too @@ -6642,11 +6689,7 @@ def replace( for k, v in items: keys, values = list(zip(*v.items())) or ([], []) - if set(keys) & set(values): - raise ValueError( - "Replacement not allowed with " - "overlapping keys and values" - ) + to_rep_dict[k] = list(keys) value_dict[k] = list(values) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ea2bd22cccc3d0..e731cffea0671a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -242,15 +242,18 @@ def aggregate(self, func, *args, **kwargs): # grouper specific aggregations if self.grouper.nkeys > 1: return self._python_agg_general(func, *args, **kwargs) + elif args or kwargs: + result = self._aggregate_generic(func, *args, **kwargs) else: # try to treat as if we are passing a list try: - assert not args and not kwargs result = self._aggregate_multiple_funcs( [func], _level=_level, _axis=self.axis ) - + except Exception: + result = self._aggregate_generic(func) + else: result.columns = Index( result.columns.levels[0], name=self._selected_obj.columns.name ) @@ -260,15 +263,15 @@ def aggregate(self, func, *args, **kwargs): # values. concat no longer converts DataFrame[Sparse] # to SparseDataFrame, so we do it here. 
result = SparseDataFrame(result._data) - except Exception: - result = self._aggregate_generic(func, *args, **kwargs) if not self.as_index: self._insert_inaxis_grouper_inplace(result) result.index = np.arange(len(result)) if relabeling: - result = result[order] + + # used reordered index of columns + result = result.iloc[:, order] result.columns = columns return result._convert(datetime=True) @@ -311,17 +314,21 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): cannot_agg = [] errors = None for item in obj: - try: - data = obj[item] - colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) + data = obj[item] + colg = SeriesGroupBy(data, selection=item, grouper=self.grouper) + try: cast = self._transform_should_cast(func) result[item] = colg.aggregate(func, *args, **kwargs) if cast: result[item] = self._try_cast(result[item], data) - except ValueError: + except ValueError as err: + if "Must produce aggregated value" in str(err): + # raised in _aggregate_named, handle at higher level + # see test_apply_with_mutated_index + raise cannot_agg.append(item) continue except TypeError as e: @@ -346,7 +353,7 @@ def _decide_output_index(self, output, labels): output_keys = sorted(output) try: output_keys.sort() - except Exception: # pragma: no cover + except TypeError: pass if isinstance(labels, MultiIndex): @@ -646,20 +653,21 @@ def _choose_path(self, fast_path, slow_path, group): # if we make it here, test if we can use the fast path try: res_fast = fast_path(group) - - # verify fast path does not change columns (and names), otherwise - # its results cannot be joined with those of the slow path - if res_fast.columns != group.columns: - return path, res - # verify numerical equality with the slow path - if res.shape == res_fast.shape: - res_r = res.values.ravel() - res_fast_r = res_fast.values.ravel() - mask = notna(res_r) - if (res_r[mask] == res_fast_r[mask]).all(): - path = fast_path except Exception: - pass + # Hard to know ex-ante what exceptions `fast_path` might raise + return path, res + + # verify fast path does not change columns (and names), otherwise + # its results cannot be joined with those of the slow path + if not isinstance(res_fast, DataFrame): + return path, res + + if not res_fast.columns.equals(group.columns): + return path, res + + if res_fast.equals(res): + path = fast_path + return path, res def _transform_item_by_item(self, obj, wrapper): @@ -682,7 +690,7 @@ def _transform_item_by_item(self, obj, wrapper): return DataFrame(output, index=obj.index, columns=columns) - def filter(self, func, dropna=True, *args, **kwargs): # noqa + def filter(self, func, dropna=True, *args, **kwargs): """ Return a copy of a DataFrame excluding elements from groups that do not satisfy the boolean criterion specified by func. @@ -833,45 +841,45 @@ def apply(self, func, *args, **kwargs): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, func_or_funcs=None, *args, **kwargs): + def aggregate(self, func=None, *args, **kwargs): _level = kwargs.pop("_level", None) - relabeling = func_or_funcs is None + relabeling = func is None columns = None - no_arg_message = "Must provide 'func_or_funcs' or named aggregation **kwargs." + no_arg_message = "Must provide 'func' or named aggregation **kwargs." 
if relabeling: columns = list(kwargs) if not PY36: # sort for 3.5 and earlier columns = list(sorted(columns)) - func_or_funcs = [kwargs[col] for col in columns] + func = [kwargs[col] for col in columns] kwargs = {} if not columns: raise TypeError(no_arg_message) - if isinstance(func_or_funcs, str): - return getattr(self, func_or_funcs)(*args, **kwargs) + if isinstance(func, str): + return getattr(self, func)(*args, **kwargs) - if isinstance(func_or_funcs, abc.Iterable): + if isinstance(func, abc.Iterable): # Catch instances of lists / tuples # but not the class list / tuple itself. - func_or_funcs = _maybe_mangle_lambdas(func_or_funcs) - ret = self._aggregate_multiple_funcs(func_or_funcs, (_level or 0) + 1) + func = _maybe_mangle_lambdas(func) + ret = self._aggregate_multiple_funcs(func, (_level or 0) + 1) if relabeling: ret.columns = columns else: - cyfunc = self._get_cython_func(func_or_funcs) + cyfunc = self._get_cython_func(func) if cyfunc and not args and not kwargs: return getattr(self, cyfunc)() if self.grouper.nkeys > 1: - return self._python_agg_general(func_or_funcs, *args, **kwargs) + return self._python_agg_general(func, *args, **kwargs) try: - return self._python_agg_general(func_or_funcs, *args, **kwargs) + return self._python_agg_general(func, *args, **kwargs) except Exception: - result = self._aggregate_named(func_or_funcs, *args, **kwargs) + result = self._aggregate_named(func, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) ret = Series(result, index=index) @@ -1005,7 +1013,7 @@ def _aggregate_named(self, func, *args, **kwargs): group.name = name output = func(group, *args, **kwargs) if isinstance(output, (Series, Index, np.ndarray)): - raise Exception("Must produce aggregated value") + raise ValueError("Must produce aggregated value") result[name] = self._try_cast(output, group) return result @@ -1143,6 +1151,10 @@ def nunique(self, dropna=True): val = self.obj._internal_get_values() + # GH 27951 + # temporary fix while we wait for NumPy bug 12629 to be fixed + val[isna(val)] = np.datetime64("NaT") + try: sorter = np.lexsort((val, ids)) except TypeError: # catches object dtypes @@ -1464,8 +1476,8 @@ class DataFrameGroupBy(NDFrameGroupBy): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg=None, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func=None, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate @@ -1731,8 +1743,8 @@ def _normalize_keyword_aggregation(kwargs): The transformed kwargs. columns : List[str] The user-provided keys. - order : List[Tuple[str, str]] - Pairs of the input and output column names. + col_idx_order : List[int] + List of columns indices. 
Examples -------- @@ -1759,7 +1771,39 @@ def _normalize_keyword_aggregation(kwargs): else: aggspec[column] = [aggfunc] order.append((column, com.get_callable_name(aggfunc) or aggfunc)) - return aggspec, columns, order + + # uniquify aggfunc name if duplicated in order list + uniquified_order = _make_unique(order) + + # GH 25719, due to aggspec will change the order of assigned columns in aggregation + # uniquified_aggspec will store uniquified order list and will compare it with order + # based on index + aggspec_order = [ + (column, com.get_callable_name(aggfunc) or aggfunc) + for column, aggfuncs in aggspec.items() + for aggfunc in aggfuncs + ] + uniquified_aggspec = _make_unique(aggspec_order) + + # get the new indice of columns by comparison + col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order) + return aggspec, columns, col_idx_order + + +def _make_unique(seq): + """Uniquify aggfunc name of the pairs in the order list + + Examples: + -------- + >>> _make_unique([('a', ''), ('a', ''), ('b', '')]) + [('a', '_0'), ('a', '_1'), ('b', '')] + """ + return [ + (pair[0], "_".join([pair[1], str(seq[:i].count(pair))])) + if seq.count(pair) > 1 + else pair + for i, pair in enumerate(seq) + ] # TODO: Can't use, because mypy doesn't like us setting __name__ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3e8d079e47326b..e010e615e176e6 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -653,7 +653,8 @@ def curried(x): # mark this column as an error try: return self._aggregate_item_by_item(name, *args, **kwargs) - except (AttributeError): + except AttributeError: + # e.g. SparseArray has no flags attr raise ValueError return wrapper @@ -726,8 +727,7 @@ def f(g): with option_context("mode.chained_assignment", None): try: result = self._python_apply_general(f) - except Exception: - + except TypeError: # gh-20949 # try again, with .apply acting as a filtering # operation, by excluding the grouping column @@ -1011,7 +1011,6 @@ def _apply_filter(self, indices, dropna): class GroupBy(_GroupBy): - """ Class for grouping and aggregating relational data. @@ -1947,8 +1946,8 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: arrays = [] for i in range(self.ngroups): - arr = arr + i - arrays.append(arr) + arr2 = arr + i + arrays.append(arr2) indices = np.concatenate(arrays) assert len(indices) == len(result) @@ -2264,26 +2263,28 @@ def _get_cythonized_result( base_func = getattr(libgroupby, how) for name, obj in self._iterate_slices(): + values = obj._data._values + if aggregate: result_sz = ngroups else: - result_sz = len(obj.values) + result_sz = len(values) if not cython_dtype: - cython_dtype = obj.values.dtype + cython_dtype = values.dtype result = np.zeros(result_sz, dtype=cython_dtype) func = partial(base_func, result, labels) inferences = None if needs_values: - vals = obj.values + vals = values if pre_processing: vals, inferences = pre_processing(vals) func = partial(func, vals) if needs_mask: - mask = isna(obj.values).view(np.uint8) + mask = isna(values).view(np.uint8) func = partial(func, mask) if needs_ngroups: @@ -2292,7 +2293,7 @@ def _get_cythonized_result( func(**kwargs) # Call func to modify indexer values in place if result_is_index: - result = algorithms.take_nd(obj.values, result) + result = algorithms.take_nd(values, result) if post_processing: result = post_processing(result, inferences) @@ -2370,8 +2371,9 @@ def head(self, n=5): """ Return first n rows of each group. 
- Essentially equivalent to ``.apply(lambda x: x.head(n))``, - except ignores as_index flag. + Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). Returns ------- @@ -2382,10 +2384,6 @@ def head(self, n=5): >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], ... columns=['A', 'B']) - >>> df.groupby('A', as_index=False).head(1) - A B - 0 1 2 - 2 5 6 >>> df.groupby('A').head(1) A B 0 1 2 @@ -2401,8 +2399,9 @@ def tail(self, n=5): """ Return last n rows of each group. - Essentially equivalent to ``.apply(lambda x: x.tail(n))``, - except ignores as_index flag. + Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows + from the original DataFrame with original index and order preserved + (``as_index`` flag is ignored). Returns ------- @@ -2417,10 +2416,6 @@ def tail(self, n=5): A B 1 a 2 3 b 2 - >>> df.groupby('A').head(1) - A B - 0 a 1 - 2 b 1 """ self._reset_group_selection() mask = self._cumcount_array(ascending=False) < n diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 143755a47b97b3..2ebfbed0b132a2 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -37,7 +37,7 @@ class Grouper: """ A Grouper allows the user to specify a groupby instruction for a target - object + object. This specification will select a column via the key parameter, or if the level and/or axis parameters are given, a level of the index of the target @@ -217,7 +217,6 @@ def __repr__(self): class Grouping: - """ Holds the grouping information for a single key @@ -584,18 +583,22 @@ def _get_grouper( # if the actual grouper should be obj[key] def is_in_axis(key): if not _is_label_like(key): + items = obj._data.items try: - obj._data.items.get_loc(key) - except Exception: + items.get_loc(key) + except (KeyError, TypeError): + # TypeError shows up here if we pass e.g. Int64Index return False return True # if the grouper is obj[name] def is_in_obj(gpr): + if not hasattr(gpr, "name"): + return False try: - return id(gpr) == id(obj[gpr.name]) - except Exception: + return gpr is obj[gpr.name] + except (KeyError, IndexError): return False for i, (gpr, level) in enumerate(zip(keys, levels)): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b0c629f017dd34..40517eefe4d5db 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -12,7 +12,7 @@ from pandas._libs import NaT, iNaT, lib import pandas._libs.groupby as libgroupby -import pandas._libs.reduction as reduction +import pandas._libs.reduction as libreduction from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -207,14 +207,17 @@ def apply(self, f, data, axis=0): if len(result_values) == len(group_keys): return group_keys, result_values, mutated - except reduction.InvalidApply: + except libreduction.InvalidApply: # Cannot fast apply on MultiIndex (_has_complex_internals). # This Exception is also raised if `f` triggers an exception # but it is preferable to raise the exception in Python. 
pass - except Exception: - # raise this error to the caller - pass + except TypeError as err: + if "Cannot convert" in str(err): + # via apply_frame_axis0 if we pass a non-ndarray + pass + else: + raise for key, (i, group) in zip(group_keys, splitter): object.__setattr__(group, "name", key) @@ -463,9 +466,7 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): - raise NotImplementedError( - "{} are not support in cython ops".format(values.dtype) - ) + raise NotImplementedError("{} dtype not supported".format(values.dtype)) elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( @@ -615,14 +616,9 @@ def _aggregate( is_datetimelike, min_count=-1, ): - if values.ndim > 3: + if values.ndim > 2: # punting for now - raise NotImplementedError("number of dimensions is currently limited to 3") - elif values.ndim > 2: - for i, chunk in enumerate(values.transpose(2, 0, 1)): - - chunk = chunk.squeeze() - agg_func(result[:, :, i], counts, chunk, comp_ids, min_count) + raise NotImplementedError("number of dimensions is currently limited to 2") else: agg_func(result, counts, values, comp_ids, min_count) @@ -640,20 +636,9 @@ def _transform( ): comp_ids, _, ngroups = self.group_info - if values.ndim > 3: + if values.ndim > 2: # punting for now - raise NotImplementedError("number of dimensions is currently limited to 3") - elif values.ndim > 2: - for i, chunk in enumerate(values.transpose(2, 0, 1)): - - transform_func( - result[:, :, i], - values, - comp_ids, - ngroups, - is_datetimelike, - **kwargs - ) + raise NotImplementedError("number of dimensions is currently limited to 2") else: transform_func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) @@ -678,7 +663,7 @@ def _aggregate_series_fast(self, obj, func): indexer = get_group_index_sorter(group_index, ngroups) obj = obj.take(indexer) group_index = algorithms.take_nd(group_index, indexer, allow_fill=False) - grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups, dummy) + grouper = libreduction.SeriesGrouper(obj, func, group_index, ngroups, dummy) result, counts = grouper.get_result() return result, counts @@ -706,7 +691,6 @@ def _aggregate_series_pure_python(self, obj, func): class BinGrouper(BaseGrouper): - """ This is an internal Grouper class @@ -852,7 +836,7 @@ def groupings(self): def agg_series(self, obj, func): dummy = obj[:0] - grouper = reduction.SeriesBinGrouper(obj, func, self.bins, dummy) + grouper = libreduction.SeriesBinGrouper(obj, func, self.bins, dummy) return grouper.get_result() @@ -933,14 +917,10 @@ def _chop(self, sdata, slice_obj): class FrameSplitter(DataSplitter): def fast_apply(self, f, names): # must return keys::list, values::list, mutated::bool - try: - starts, ends = lib.generate_slices(self.slabels, self.ngroups) - except Exception: - # fails when all -1 - return [], True + starts, ends = lib.generate_slices(self.slabels, self.ngroups) sdata = self._get_sorted_data() - return reduction.apply_frame_axis0(sdata, f, names, starts, ends) + return libreduction.apply_frame_axis0(sdata, f, names, starts, ends) def _chop(self, sdata, slice_obj): if self.axis == 0: diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 70c48e969172f5..433bca940c0285 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -226,6 +226,7 @@ def length_of_indexer(indexer, target=None) -> int: if 
step is None: step = 1 elif step < 0: + start, stop = stop + 1, start + 1 step = -step return (stop - start + step - 1) // step elif isinstance(indexer, (ABCSeries, ABCIndexClass, np.ndarray, list)): diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 2036728e702f30..cc8ecc0e64684f 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -316,7 +316,7 @@ def __new__(cls, data): # do all the validation here. from pandas import Series - if not isinstance(data, Series): + if not isinstance(data, ABCSeries): raise TypeError( "cannot convert an object of type {0} to a " "datetimelike index".format(type(data)) @@ -326,18 +326,15 @@ def __new__(cls, data): if orig is not None: data = Series(orig.values.categories, name=orig.name, copy=False) - try: - if is_datetime64_dtype(data.dtype): - return DatetimeProperties(data, orig) - elif is_datetime64tz_dtype(data.dtype): - return DatetimeProperties(data, orig) - elif is_timedelta64_dtype(data.dtype): - return TimedeltaProperties(data, orig) - elif is_period_arraylike(data): - return PeriodProperties(data, orig) - elif is_datetime_arraylike(data): - return DatetimeProperties(data, orig) - except Exception: - pass # we raise an attribute error anyway + if is_datetime64_dtype(data.dtype): + return DatetimeProperties(data, orig) + elif is_datetime64tz_dtype(data.dtype): + return DatetimeProperties(data, orig) + elif is_timedelta64_dtype(data.dtype): + return TimedeltaProperties(data, orig) + elif is_period_arraylike(data): + return PeriodProperties(data, orig) + elif is_datetime_arraylike(data): + return DatetimeProperties(data, orig) raise AttributeError("Can only use .dt accessor with datetimelike values") diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 415255cdbad06c..2dbd592fc67873 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -10,6 +10,7 @@ import pandas._libs.join as libjoin from pandas._libs.lib import is_datetime_array from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp +from pandas._libs.tslibs.period import IncompatibleFrequency from pandas._libs.tslibs.timezones import tz_compare from pandas.compat import set_function_name from pandas.compat.numpy import function as nv @@ -262,7 +263,13 @@ def __new__( fastpath=None, tupleize_cols=True, **kwargs - ): + ) -> "Index": + + from .range import RangeIndex + from pandas import PeriodIndex, DatetimeIndex, TimedeltaIndex + from .numeric import Float64Index, Int64Index, UInt64Index + from .interval import IntervalIndex + from .category import CategoricalIndex if name is None and hasattr(data, "name"): name = data.name @@ -277,8 +284,6 @@ def __new__( if fastpath: return cls._simple_new(data, name) - from .range import RangeIndex - if isinstance(data, ABCPandasArray): # ensure users don't accidentally put a PandasArray in an index. 
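Illustrative sketch, not part of the patch: the length_of_indexer hunk above swaps start/stop before negating a negative step, so the closed-form count agrees with what Python's range would select. A minimal standalone check of that arithmetic (the helper name here is hypothetical):

def _slice_len(start, stop, step):
    # mirrors the patched arithmetic for a fully resolved slice
    if step < 0:
        start, stop = stop + 1, start + 1
        step = -step
    return (stop - start + step - 1) // step

# slice(7, 2, -1) selects 7, 6, 5, 4, 3 -> five elements
assert _slice_len(7, 2, -1) == len(range(7, 2, -1)) == 5
# without the swap, the same formula would return -5 for this case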
data = data.to_numpy() @@ -291,16 +296,12 @@ def __new__( # categorical elif is_categorical_dtype(data) or is_categorical_dtype(dtype): - from .category import CategoricalIndex - return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) # interval elif ( is_interval_dtype(data) or is_interval_dtype(dtype) ) and not is_object_dtype(dtype): - from .interval import IntervalIndex - closed = kwargs.get("closed", None) return IntervalIndex(data, dtype=dtype, name=name, copy=copy, closed=closed) @@ -309,8 +310,6 @@ def __new__( or is_datetime64_any_dtype(dtype) or "tz" in kwargs ): - from pandas import DatetimeIndex - if is_dtype_equal(_o_dtype, dtype): # GH#23524 passing `dtype=object` to DatetimeIndex is invalid, # will raise in the where `data` is already tz-aware. So @@ -318,33 +317,24 @@ def __new__( # the DatetimeIndex construction. # Note we can pass copy=False because the .astype below # will always make a copy - result = DatetimeIndex(data, copy=False, name=name, **kwargs) + result = DatetimeIndex( + data, copy=False, name=name, **kwargs + ) # type: "Index" return result.astype(object) else: - result = DatetimeIndex( - data, copy=copy, name=name, dtype=dtype, **kwargs - ) - return result + return DatetimeIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) elif is_timedelta64_dtype(data) or is_timedelta64_dtype(dtype): - from pandas import TimedeltaIndex - if is_dtype_equal(_o_dtype, dtype): # Note we can pass copy=False because the .astype below # will always make a copy result = TimedeltaIndex(data, copy=False, name=name, **kwargs) return result.astype(object) else: - result = TimedeltaIndex( - data, copy=copy, name=name, dtype=dtype, **kwargs - ) - return result + return TimedeltaIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) elif is_period_dtype(data) and not is_object_dtype(dtype): - from pandas import PeriodIndex - - result = PeriodIndex(data, copy=copy, name=name, **kwargs) - return result + return PeriodIndex(data, copy=copy, name=name, **kwargs) # extension dtype elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): @@ -387,8 +377,6 @@ def __new__( pass # Return an actual float index. 
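For reference only (not part of the diff), the dtype dispatch that the Index.__new__ refactor above reorganizes is visible from the public constructor; a small sanity check of the behavior the hunk preserves:

import pandas as pd

# datetime64 data is routed to DatetimeIndex ...
idx = pd.Index(pd.date_range("2019-01-01", periods=3))
assert type(idx).__name__ == "DatetimeIndex"

# ... unless dtype=object is requested, in which case the result is
# cast back to a plain object-dtype Index (the astype(object) branch).
obj_idx = pd.Index(pd.date_range("2019-01-01", periods=3), dtype=object)
assert obj_idx.dtype == object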
- from .numeric import Float64Index - return Float64Index(data, copy=copy, dtype=dtype, name=name) elif inferred == "string": @@ -405,19 +393,11 @@ def __new__( data = np.array(data, dtype=dtype, copy=copy) # maybe coerce to a sub-class - from pandas.core.indexes.period import PeriodIndex, IncompatibleFrequency - if is_signed_integer_dtype(data.dtype): - from .numeric import Int64Index - return Int64Index(data, copy=copy, dtype=dtype, name=name) elif is_unsigned_integer_dtype(data.dtype): - from .numeric import UInt64Index - return UInt64Index(data, copy=copy, dtype=dtype, name=name) elif is_float_dtype(data.dtype): - from .numeric import Float64Index - return Float64Index(data, copy=copy, dtype=dtype, name=name) elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): subarr = data.astype("object") @@ -440,12 +420,8 @@ def __new__( return Index(subarr, copy=copy, dtype=object, name=name) elif inferred in ["floating", "mixed-integer-float", "integer-na"]: # TODO: Returns IntegerArray for integer-na case in the future - from .numeric import Float64Index - return Float64Index(subarr, copy=copy, name=name) elif inferred == "interval": - from .interval import IntervalIndex - try: return IntervalIndex(subarr, name=name, copy=copy) except ValueError: @@ -456,8 +432,6 @@ def __new__( pass elif inferred != "string": if inferred.startswith("datetime"): - from pandas import DatetimeIndex - try: return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) except (ValueError, OutOfBoundsDatetime): @@ -467,8 +441,6 @@ def __new__( pass elif inferred.startswith("timedelta"): - from pandas import TimedeltaIndex - return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) elif inferred == "period": try: @@ -2020,7 +1992,7 @@ def notna(self): _index_shared_docs[ "fillna" ] = """ - Fill NA/NaN values with the specified value + Fill NA/NaN values with the specified value. Parameters ---------- @@ -2051,7 +2023,7 @@ def fillna(self, value=None, downcast=None): _index_shared_docs[ "dropna" ] = """ - Return Index without NA/NaN values + Return Index without NA/NaN values. Parameters ---------- diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 51daad3b426493..cce390d98c0378 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -661,7 +661,7 @@ def _get_time_micros(self): def to_series(self, keep_tz=None, index=None, name=None): """ Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index + useful with map for returning an indexer based on an index. Parameters ---------- @@ -687,10 +687,10 @@ def to_series(self, keep_tz=None, index=None, name=None): behaviour and silence the warning. index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. If None, defaults to name of original - index + Index of resulting Series. If None, defaults to original index. + name : str, optional + Name of resulting Series. If None, defaults to name of original + index. Returns ------- @@ -735,7 +735,7 @@ def to_series(self, keep_tz=None, index=None, name=None): def snap(self, freq="S"): """ - Snap time stamps to nearest occurring frequency + Snap time stamps to nearest occurring frequency. Returns ------- @@ -1594,7 +1594,7 @@ def bdate_range( ): """ Return a fixed frequency DatetimeIndex, with business day as the default - frequency + frequency. 
Parameters ---------- diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 2e5b3ff8ef502d..a6c39d049c50cf 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -22,7 +22,6 @@ class FrozenList(PandasObject, list): - """ Container that doesn't allow setting item *but* because it's technically non-hashable, will be used @@ -71,12 +70,7 @@ def difference(self, other): # TODO: Consider deprecating these in favor of `union` (xref gh-15506) __add__ = __iadd__ = union - # Python 2 compat - def __getslice__(self, i, j): - return self.__class__(super().__getslice__(i, j)) - def __getitem__(self, n): - # Python 3 compat if isinstance(n, slice): return self.__class__(super().__getitem__(n)) return super().__getitem__(n) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 9361408290bb16..29e297cb28a3b8 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -250,7 +250,22 @@ def _simple_new(cls, array, name, closed=None): return result @classmethod - @Appender(_interval_shared_docs["from_breaks"] % _index_doc_kwargs) + @Appender( + _interval_shared_docs["from_breaks"] + % dict( + klass="IntervalIndex", + examples=textwrap.dedent( + """\ + Examples + -------- + >>> pd.IntervalIndex.from_breaks([0, 1, 2, 3]) + IntervalIndex([(0, 1], (1, 2], (2, 3]], + closed='right', + dtype='interval[int64]') + """ + ), + ) + ) def from_breaks(cls, breaks, closed="right", name=None, copy=False, dtype=None): with rewrite_exception("IntervalArray", cls.__name__): array = IntervalArray.from_breaks( @@ -259,7 +274,22 @@ def from_breaks(cls, breaks, closed="right", name=None, copy=False, dtype=None): return cls._simple_new(array, name=name) @classmethod - @Appender(_interval_shared_docs["from_arrays"] % _index_doc_kwargs) + @Appender( + _interval_shared_docs["from_arrays"] + % dict( + klass="IntervalIndex", + examples=textwrap.dedent( + """\ + Examples + -------- + >>> pd.IntervalIndex.from_arrays([0, 1, 2], [1, 2, 3]) + IntervalIndex([(0, 1], (1, 2], (2, 3]], + closed='right', + dtype='interval[int64]') + """ + ), + ) + ) def from_arrays( cls, left, right, closed="right", name=None, copy=False, dtype=None ): @@ -270,7 +300,22 @@ def from_arrays( return cls._simple_new(array, name=name) @classmethod - @Appender(_interval_shared_docs["from_tuples"] % _index_doc_kwargs) + @Appender( + _interval_shared_docs["from_tuples"] + % dict( + klass="IntervalIndex", + examples=textwrap.dedent( + """\ + Examples + -------- + >>> pd.IntervalIndex.from_tuples([(0, 1), (1, 2)]) + IntervalIndex([(0, 1], (1, 2]], + closed='right', + dtype='interval[int64]') + """ + ), + ) + ) def from_tuples(cls, data, closed="right", name=None, copy=False, dtype=None): with rewrite_exception("IntervalArray", cls.__name__): arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, dtype=dtype) @@ -331,7 +376,8 @@ def __contains__(self, key): >>> idx.to_tuples() Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object') >>> idx.to_tuples(na_tuple=False) - Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object')""", + Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object') + """, ) ) def to_tuples(self, na_tuple=True): @@ -366,7 +412,27 @@ def closed(self): """ return self._data._closed - @Appender(_interval_shared_docs["set_closed"] % _index_doc_kwargs) + @Appender( + _interval_shared_docs["set_closed"] + % dict( + klass="IntervalIndex", + examples=textwrap.dedent( + """\ + Examples + -------- + >>> index = pd.interval_range(0, 3) + >>> index + 
IntervalIndex([(0, 1], (1, 2], (2, 3]], + closed='right', + dtype='interval[int64]') + >>> index.set_closed('both') + IntervalIndex([[0, 1], [1, 2], [2, 3]], + closed='both', + dtype='interval[int64]') + """ + ), + ) + ) def set_closed(self, closed): if closed not in _VALID_CLOSED: msg = "invalid option for 'closed': {closed}" @@ -788,7 +854,7 @@ def _find_non_overlapping_monotonic_bounds(self, key): return start, stop def get_loc( - self, key: Any, method: Optional[str] = None + self, key: Any, method: Optional[str] = None, tolerance=None ) -> Union[int, slice, np.ndarray]: """ Get integer location, slice or boolean mask for requested label. @@ -982,7 +1048,7 @@ def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray: List of indices. """ if self.is_overlapping: - return self.get_indexer_non_unique(target, **kwargs)[0] + return self.get_indexer_non_unique(target)[0] return self.get_indexer(target, **kwargs) @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) @@ -1095,12 +1161,8 @@ def _format_with_header(self, header, **kwargs): return header + list(self._format_native_types(**kwargs)) def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): - """ actually format my specific types """ - from pandas.io.formats.format import ExtensionArrayFormatter - - return ExtensionArrayFormatter( - values=self, na_rep=na_rep, justify="all", leading_space=False - ).get_result() + # GH 28210: use base method but with different default na_rep + return super()._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) def _format_data(self, name=None): @@ -1171,11 +1233,41 @@ def equals(self, other): and self.closed == other.closed ) - @Appender(_interval_shared_docs["contains"] % _index_doc_kwargs) + @Appender( + _interval_shared_docs["contains"] + % dict( + klass="IntervalIndex", + examples=textwrap.dedent( + """\ + >>> intervals = pd.IntervalIndex.from_tuples([(0, 1), (1, 3), (2, 4)]) + >>> intervals + IntervalIndex([(0, 1], (1, 3], (2, 4]], + closed='right', + dtype='interval[int64]') + >>> intervals.contains(0.5) + array([ True, False, False]) + """ + ), + ) + ) def contains(self, other): return self._data.contains(other) - @Appender(_interval_shared_docs["overlaps"] % _index_doc_kwargs) + @Appender( + _interval_shared_docs["overlaps"] + % dict( + klass="IntervalIndex", + examples=textwrap.dedent( + """\ + >>> intervals = pd.IntervalIndex.from_tuples([(0, 1), (1, 3), (2, 4)]) + >>> intervals + IntervalIndex([(0, 1], (1, 3], (2, 4]], + closed='right', + dtype='interval[int64]') + """ + ), + ) + ) def overlaps(self, other): return self._data.overlaps(other) @@ -1310,7 +1402,7 @@ def interval_range( start=None, end=None, periods=None, freq=None, name=None, closed="right" ): """ - Return a fixed frequency IntervalIndex + Return a fixed frequency IntervalIndex. 
Parameters ---------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 37c91bd9f2b6f4..8d7800ebcf6758 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1250,7 +1250,7 @@ def _set_names(self, names, level=None, validate=True): self.levels[l].rename(name, inplace=True) names = property( - fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex\n""" + fset=_set_names, fget=_get_names, doc="""\nNames of levels in MultiIndex.\n""" ) @Appender(_index_shared_docs["_get_grouper_for_level"]) @@ -1765,7 +1765,7 @@ def is_all_dates(self): def is_lexsorted(self): """ - Return True if the codes are lexicographically sorted + Return True if the codes are lexicographically sorted. Returns ------- @@ -2249,7 +2249,7 @@ def swaplevel(self, i=-2, j=-1): def reorder_levels(self, order): """ - Rearrange levels using input order. May not drop or duplicate levels + Rearrange levels using input order. May not drop or duplicate levels. Parameters ---------- diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 5a2ca109597e85..f7bf77928bdc7c 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -994,7 +994,7 @@ def memory_usage(self, deep=False): def period_range(start=None, end=None, periods=None, freq=None, name=None): """ Return a fixed frequency PeriodIndex, with day (calendar) as the default - frequency + frequency. Parameters ---------- diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 43ed6e7b122eae..8783351cc74d1c 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -236,7 +236,7 @@ def _format_with_header(self, header, na_rep="NaN", **kwargs): @cache_readonly def start(self): """ - The value of the `start` parameter (``0`` if this was not supplied) + The value of the `start` parameter (``0`` if this was not supplied). """ # GH 25710 return self._range.start @@ -244,7 +244,7 @@ def start(self): @property def _start(self): """ - The value of the `start` parameter (``0`` if this was not supplied) + The value of the `start` parameter (``0`` if this was not supplied). .. deprecated:: 0.25.0 Use ``start`` instead. @@ -259,14 +259,14 @@ def _start(self): @cache_readonly def stop(self): """ - The value of the `stop` parameter + The value of the `stop` parameter. """ return self._range.stop @property def _stop(self): """ - The value of the `stop` parameter + The value of the `stop` parameter. .. deprecated:: 0.25.0 Use ``stop`` instead. @@ -282,7 +282,7 @@ def _stop(self): @cache_readonly def step(self): """ - The value of the `step` parameter (``1`` if this was not supplied) + The value of the `step` parameter (``1`` if this was not supplied). """ # GH 25710 return self._range.step @@ -290,7 +290,7 @@ def step(self): @property def _step(self): """ - The value of the `step` parameter (``1`` if this was not supplied) + The value of the `step` parameter (``1`` if this was not supplied). .. deprecated:: 0.25.0 Use ``step`` instead. diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index d06afa3daa792f..b03d60c7b5b371 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -68,20 +68,20 @@ class TimedeltaIndex( ): """ Immutable ndarray of timedelta64 data, represented internally as int64, and - which can be boxed to timedelta objects + which can be boxed to timedelta objects. 
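As an aside (example only), the RangeIndex properties whose docstrings are reworded above simply expose the underlying range; the underscored variants they supersede are deprecated as of 0.25.0:

import pandas as pd

idx = pd.RangeIndex(start=0, stop=10, step=2)
assert (idx.start, idx.stop, idx.step) == (0, 10, 2)
assert list(idx) == [0, 2, 4, 6, 8]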
Parameters ---------- data : array-like (1-dimensional), optional - Optional timedelta-like data to construct index with + Optional timedelta-like data to construct index with. unit : unit of the arg (D,h,m,s,ms,us,ns) denote the unit, optional - which is an integer/float number - freq : string or pandas offset object, optional + Which is an integer/float number. + freq : str or pandas offset object, optional One of pandas date offset strings or corresponding objects. The string 'infer' can be passed in order to set the frequency of the index as the - inferred frequency upon creation + inferred frequency upon creation. copy : bool - Make a copy of input ndarray + Make a copy of input ndarray. start : starting value, timedelta-like, optional If data is None, start is used as the start point in generating regular timedelta data. @@ -90,24 +90,24 @@ class TimedeltaIndex( periods : int, optional, > 0 Number of periods to generate, if generating index. Takes precedence - over end argument + over end argument. .. deprecated:: 0.24.0 end : end time, timedelta-like, optional If periods is none, generated index will extend to first conforming - time on or just past end argument + time on or just past end argument. .. deprecated:: 0.24. 0 - closed : string or None, default None + closed : str or None, default None Make the interval closed with respect to the given frequency to - the 'left', 'right', or both sides (None) + the 'left', 'right', or both sides (None). .. deprecated:: 0.24. 0 name : object - Name to be stored in the index + Name to be stored in the index. Attributes ---------- @@ -713,7 +713,7 @@ def timedelta_range( ): """ Return a fixed frequency TimedeltaIndex, with day as the default - frequency + frequency. Parameters ---------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 7bb5e2fa3018d1..3d495eeb8c885b 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -22,11 +22,11 @@ is_sparse, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com -from pandas.core.index import Index, InvalidIndexError, MultiIndex +from pandas.core.index import Index, InvalidIndexError from pandas.core.indexers import is_list_like_indexer, length_of_indexer @@ -49,7 +49,7 @@ def get_indexers_list(): # the public IndexSlicerMaker class _IndexSlice: """ - Create an object to more easily perform multi-index slicing + Create an object to more easily perform multi-index slicing. 
See Also -------- @@ -172,7 +172,7 @@ def _get_setitem_indexer(self, key): ax = self.obj._get_axis(0) - if isinstance(ax, MultiIndex) and self.name != "iloc": + if isinstance(ax, ABCMultiIndex) and self.name != "iloc": try: return ax.get_loc(key) except Exception: @@ -241,7 +241,7 @@ def _has_valid_tuple(self, key: Tuple): ) def _is_nested_tuple_indexer(self, tup: Tuple): - if any(isinstance(ax, MultiIndex) for ax in self.obj.axes): + if any(isinstance(ax, ABCMultiIndex) for ax in self.obj.axes): return any(is_nested_tuple(tup, ax) for ax in self.obj.axes) return False @@ -329,7 +329,7 @@ def _setitem_with_indexer(self, indexer, value): # GH 10360, GH 27841 if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes): for i, ax in zip(indexer, self.obj.axes): - if isinstance(ax, MultiIndex) and not ( + if isinstance(ax, ABCMultiIndex) and not ( is_integer(i) or com.is_null_slice(i) ): take_split_path = True @@ -422,7 +422,9 @@ def _setitem_with_indexer(self, indexer, value): # if we have a partial multiindex, then need to adjust the plane # indexer here - if len(labels) == 1 and isinstance(self.obj[labels[0]].axes[0], MultiIndex): + if len(labels) == 1 and isinstance( + self.obj[labels[0]].axes[0], ABCMultiIndex + ): item = labels[0] obj = self.obj[item] index = obj.index @@ -495,7 +497,7 @@ def setter(item, v): # we have an equal len Frame if isinstance(value, ABCDataFrame): sub_indexer = list(indexer) - multiindex_indexer = isinstance(labels, MultiIndex) + multiindex_indexer = isinstance(labels, ABCMultiIndex) for item in labels: if item in value: @@ -777,8 +779,8 @@ def _align_frame(self, indexer, df: ABCDataFrame): # we have a multi-index and are trying to align # with a particular, level GH3738 if ( - isinstance(ax, MultiIndex) - and isinstance(df.index, MultiIndex) + isinstance(ax, ABCMultiIndex) + and isinstance(df.index, ABCMultiIndex) and ax.nlevels != df.index.nlevels ): raise TypeError( @@ -904,7 +906,7 @@ def _getitem_lowerdim(self, tup: Tuple): ax0 = self.obj._get_axis(0) # ...but iloc should handle the tuple as simple integer-location # instead of checking it as multiindex representation (GH 13797) - if isinstance(ax0, MultiIndex) and self.name != "iloc": + if isinstance(ax0, ABCMultiIndex) and self.name != "iloc": result = self._handle_lowerdim_multi_index_axis0(tup) if result is not None: return result @@ -1004,7 +1006,7 @@ def _getitem_axis(self, key, axis: int): if isinstance(key, slice): return self._get_slice_axis(key, axis=axis) elif is_list_like_indexer(key) and not ( - isinstance(key, tuple) and isinstance(labels, MultiIndex) + isinstance(key, tuple) and isinstance(labels, ABCMultiIndex) ): if hasattr(key, "ndim") and key.ndim > 1: @@ -1017,7 +1019,7 @@ def _getitem_axis(self, key, axis: int): key = labels._maybe_cast_indexer(key) if is_integer(key): - if axis == 0 and isinstance(labels, MultiIndex): + if axis == 0 and isinstance(labels, ABCMultiIndex): try: return self._get_label(key, axis=axis) except (KeyError, TypeError): @@ -1228,7 +1230,7 @@ def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): try: return labels.get_loc(obj) except LookupError: - if isinstance(obj, tuple) and isinstance(labels, MultiIndex): + if isinstance(obj, tuple) and isinstance(labels, ABCMultiIndex): if len(obj) == labels.nlevels: return {"key": obj} raise @@ -1248,7 +1250,7 @@ def _convert_to_indexer(self, obj, axis: int, raise_missing: bool = False): # always valid return {"key": obj} - if obj >= self.obj.shape[axis] and not isinstance(labels, MultiIndex): 
+ if obj >= self.obj.shape[axis] and not isinstance(labels, ABCMultiIndex): # a positional raise ValueError("cannot set by positional indexing with enlargement") @@ -1715,7 +1717,7 @@ def _is_scalar_access(self, key: Tuple): return False ax = self.obj.axes[i] - if isinstance(ax, MultiIndex): + if isinstance(ax, ABCMultiIndex): return False if isinstance(k, str) and ax._supports_partial_string_indexing: @@ -1737,7 +1739,7 @@ def _getitem_scalar(self, key): def _get_partial_string_timestamp_match_key(self, key, labels): """Translate any partial string timestamp matches in key, returning the new key (GH 10331)""" - if isinstance(labels, MultiIndex): + if isinstance(labels, ABCMultiIndex): if ( isinstance(key, str) and labels.levels[0]._supports_partial_string_indexing @@ -1781,7 +1783,7 @@ def _getitem_axis(self, key, axis: int): # to a list of keys # we will use the *values* of the object # and NOT the index if its a PandasObject - if isinstance(labels, MultiIndex): + if isinstance(labels, ABCMultiIndex): if isinstance(key, (ABCSeries, np.ndarray)) and key.ndim <= 1: # Series, or 0,1 ndim ndarray @@ -1809,7 +1811,7 @@ def _getitem_axis(self, key, axis: int): key = tuple([key]) # an iterable multi-selection - if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)): + if not (isinstance(key, tuple) and isinstance(labels, ABCMultiIndex)): if hasattr(key, "ndim") and key.ndim > 1: raise ValueError("Cannot index with multidimensional key") @@ -2474,7 +2476,7 @@ def is_nested_tuple(tup, labels): for i, k in enumerate(tup): if is_list_like(k) or isinstance(k, slice): - return isinstance(labels, MultiIndex) + return isinstance(labels, ABCMultiIndex) return False diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e24e6e088b92aa..2a44177d445df8 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -7,7 +7,7 @@ import numpy as np -from pandas._libs import NaT, Timestamp, lib, tslib, tslibs +from pandas._libs import NaT, Timestamp, lib, tslib import pandas._libs.internals as libinternals from pandas._libs.tslibs import Timedelta, conversion from pandas._libs.tslibs.timezones import tz_compare @@ -407,7 +407,7 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): return self.copy() if self._can_hold_element(value): - # equivalent: self._try_coerce_args(value) would not raise + # equivalent: _try_coerce_args(value) would not raise blocks = self.putmask(mask, value, inplace=inplace) return self._maybe_downcast(blocks, downcast) @@ -416,15 +416,16 @@ def fillna(self, value, limit=None, inplace=False, downcast=None): return self if inplace else self.copy() # operate column-by-column - def f(m, v, i): + def f(mask, val, idx): block = self.coerce_to_target_dtype(value) # slice out our block - if i is not None: - block = block.getitem_block(slice(i, i + 1)) + if idx is not None: + # i.e. 
self.ndim == 2 + block = block.getitem_block(slice(idx, idx + 1)) return block.fillna(value, limit=limit, inplace=inplace, downcast=None) - return self.split_and_operate(mask, f, inplace) + return self.split_and_operate(None, f, inplace) def split_and_operate(self, mask, f, inplace: bool): """ @@ -444,7 +445,8 @@ def split_and_operate(self, mask, f, inplace: bool): """ if mask is None: - mask = np.ones(self.shape, dtype=bool) + mask = np.broadcast_to(True, shape=self.shape) + new_values = self.values def make_a_block(nv, ref_loc): @@ -523,19 +525,14 @@ def downcast(self, dtypes=None): raise ValueError( "downcast must have a dictionary or 'infer' as its argument" ) + elif dtypes != "infer": + raise AssertionError("dtypes as dict is not supported yet") # operate column-by-column # this is expensive as it splits the blocks items-by-item - def f(m, v, i): - - if dtypes == "infer": - dtype = "infer" - else: - raise AssertionError("dtypes as dict is not supported yet") - - if dtype is not None: - v = maybe_downcast_to_dtype(v, dtype) - return v + def f(mask, val, idx): + val = maybe_downcast_to_dtype(val, dtype="infer") + return val return self.split_and_operate(None, f, False) @@ -669,7 +666,7 @@ def convert( return self.copy() if copy else self - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: """ require the same dtype as ourselves """ dtype = self.values.dtype.type tipo = maybe_infer_dtype_type(element) @@ -743,6 +740,26 @@ def replace( return [self] return [self.copy()] + to_replace = [x for x in to_replace if self._can_hold_element(x)] + if not len(to_replace): + # GH#28084 avoid costly checks since we can infer + # that there is nothing to replace in this block + if inplace: + return [self] + return [self.copy()] + + if len(to_replace) == 1: + # _can_hold_element checks have reduced this back to the + # scalar case and we can avoid a costly object cast + return self.replace( + to_replace[0], + value, + inplace=inplace, + filter=filter, + regex=regex, + convert=convert, + ) + # GH 22083, TypeError or ValueError occurred within error handling # causes infinite loop. Cast and retry only if not objectblock. 
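A quick illustration, not in the patch itself, of the GH 28084 fast path added above: to_replace values that the block's dtype cannot even hold are dropped up front, so replacing a string in an integer block becomes a cheap no-op instead of forcing a costly object cast:

import pandas as pd

s = pd.Series([1, 2, 3])
# neither "a" nor "b" can be held by an int64 block, so nothing to replace
result = s.replace(["a", "b"], 99)
assert result.equals(s)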
if is_object_dtype(self): @@ -751,7 +768,7 @@ def replace( # try again with a compatible block block = self.astype(object) return block.replace( - to_replace=original_to_replace, + to_replace=to_replace, value=value, inplace=inplace, filter=filter, @@ -837,12 +854,6 @@ def setitem(self, indexer, value): if self._can_hold_element(value): value = self._try_coerce_args(value) - # can keep its own dtype - if hasattr(value, "dtype") and is_dtype_equal(values.dtype, value.dtype): - dtype = self.dtype - else: - dtype = "infer" - else: # current dtype cannot store value, coerce to common dtype find_dtype = False @@ -851,15 +862,9 @@ def setitem(self, indexer, value): dtype = value.dtype find_dtype = True - elif lib.is_scalar(value): - if isna(value): - # NaN promotion is handled in latter path - dtype = False - else: - dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) - find_dtype = True - else: - dtype = "infer" + elif lib.is_scalar(value) and not isna(value): + dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) + find_dtype = True if find_dtype: dtype = find_common_type([values.dtype, dtype]) @@ -994,15 +999,15 @@ def putmask(self, mask, new, align=True, inplace=False, axis=0, transpose=False) new = new.reshape(tuple(new_shape)) # operate column-by-column - def f(m, v, i): + def f(mask, val, idx): - if i is None: + if idx is None: # ndim==1 case. n = new else: if isinstance(new, np.ndarray): - n = np.squeeze(new[i % new.shape[0]]) + n = np.squeeze(new[idx % new.shape[0]]) else: n = np.array(new) @@ -1012,7 +1017,7 @@ def f(m, v, i): # we need to explicitly astype here to make a copy n = n.astype(dtype) - nv = _putmask_smart(v, m, n) + nv = _putmask_smart(val, mask, n) return nv new_blocks = self.split_and_operate(mask, f, inplace) @@ -1068,7 +1073,7 @@ def coerce_to_target_dtype(self, other): mytz = getattr(self.dtype, "tz", None) othertz = getattr(dtype, "tz", None) - if str(mytz) != str(othertz): + if not tz_compare(mytz, othertz): return self.astype(object) raise AssertionError( @@ -1288,7 +1293,7 @@ def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None): else: return self.make_block_same_class(new_values, new_mgr_locs) - def diff(self, n, axis=1): + def diff(self, n: int, axis: int = 1) -> List["Block"]: """ return block for the diff of the values """ new_values = algos.diff(self.values, n, axis=axis) return [self.make_block(values=new_values)] @@ -1377,7 +1382,7 @@ def func(cond, values, other): if not ( (self.is_integer or self.is_bool) - and lib.is_scalar(other) + and lib.is_float(other) and np.isnan(other) ): # np.where will cast integer array to floats in this case @@ -1430,7 +1435,7 @@ def func(cond, values, other): return result_blocks - def equals(self, other): + def equals(self, other) -> bool: if self.dtype != other.dtype or self.shape != other.shape: return False return array_equivalent(self.values, other.values) @@ -1810,7 +1815,7 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): return self.make_block_same_class(new_values, new_mgr_locs) - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: # XXX: We may need to think about pushing this onto the array. # We're doing the same as CategoricalBlock here. 
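For context (illustrative, not part of the change), the is_scalar -> is_float guard above concerns the case where np.where would have to introduce NaN into an integer or boolean block, which forces an upcast; the user-visible effect looks like this:

import numpy as np
import pandas as pd

s = pd.Series([1, 2, 3])
# masking an int64 Series with no fill value introduces NaN,
# so the result is upcast to float64
masked = s.where(s > 1)
assert masked.dtype == np.dtype("float64")
assert np.isnan(masked.iloc[0])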
return True @@ -1980,7 +1985,7 @@ class NumericBlock(Block): class FloatOrComplexBlock(NumericBlock): __slots__ = () - def equals(self, other): + def equals(self, other) -> bool: if self.dtype != other.dtype or self.shape != other.shape: return False left, right = self.values, other.values @@ -1991,7 +1996,7 @@ class FloatBlock(FloatOrComplexBlock): __slots__ = () is_float = True - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass( @@ -2055,7 +2060,7 @@ class ComplexBlock(FloatOrComplexBlock): __slots__ = () is_complex = True - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, (np.floating, np.integer, np.complexfloating)) @@ -2072,7 +2077,7 @@ class IntBlock(NumericBlock): is_integer = True _can_hold_na = False - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return ( @@ -2162,7 +2167,7 @@ def _astype(self, dtype, **kwargs): # delegate return super()._astype(dtype=dtype, **kwargs) - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: if self.is_datetimetz: @@ -2352,41 +2357,19 @@ def _slice(self, slicer): return self.values[slicer] def _try_coerce_args(self, other): - """ - localize and return i8 for the values - - Parameters - ---------- - other : ndarray-like or scalar - - Returns - ------- - base-type other - """ - if is_valid_nat_for_dtype(other, self.dtype): - other = np.datetime64("NaT", "ns") - elif isinstance(other, self._holder): - if not tz_compare(other.tz, self.values.tz): - raise ValueError("incompatible or non tz-aware value") - - elif isinstance(other, (np.datetime64, datetime, date)): - other = tslibs.Timestamp(other) - - # test we can have an equal time zone - if not tz_compare(other.tz, self.values.tz): - raise ValueError("incompatible or non tz-aware value") - else: - raise TypeError(other) - + # DatetimeArray handles this for us return other - def diff(self, n, axis=0): - """1st discrete difference + def diff(self, n: int, axis: int = 0) -> List["Block"]: + """ + 1st discrete difference. Parameters ---------- - n : int, number of periods to diff - axis : int, axis to diff upon. default 0 + n : int + Number of periods to diff. + axis : int, default 0 + Axis to diff upon. 
Returns ------- @@ -2448,7 +2431,7 @@ def setitem(self, indexer, value): ) return newb.setitem(indexer, value) - def equals(self, other): + def equals(self, other) -> bool: # override for significant performance improvement if self.dtype != other.dtype or self.shape != other.shape: return False @@ -2487,7 +2470,7 @@ def __init__(self, values, placement, ndim=None): def _holder(self): return TimedeltaArray - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.timedelta64) @@ -2580,7 +2563,7 @@ class BoolBlock(NumericBlock): is_bool = True _can_hold_na = False - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: tipo = maybe_infer_dtype_type(element) if tipo is not None: return issubclass(tipo.type, np.bool_) @@ -2641,10 +2624,10 @@ def convert( """ # operate column-by-column - def f(m, v, i): - shape = v.shape + def f(mask, val, idx): + shape = val.shape values = soft_convert_objects( - v.ravel(), + val.ravel(), datetime=datetime, numeric=numeric, timedelta=timedelta, @@ -2674,7 +2657,7 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"] # split and convert the blocks return _extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) - def _can_hold_element(self, element): + def _can_hold_element(self, element: Any) -> bool: return True def _try_coerce_args(self, other): @@ -2830,9 +2813,9 @@ def _replace_single( regex = regex_re or to_rep_re # try to get the pattern attribute (compiled re) or it's a string - try: + if is_re(to_replace): pattern = to_replace.pattern - except AttributeError: + else: pattern = to_replace # if the pattern is not empty and to_replace is either a string or a @@ -2853,18 +2836,18 @@ def _replace_single( if isna(value) or not isinstance(value, str): def re_replacer(s): - try: + if is_re(rx) and isinstance(s, str): return value if rx.search(s) is not None else s - except TypeError: + else: return s else: # value is guaranteed to be a string here, s can be either a string # or null if it's null it gets returned def re_replacer(s): - try: + if is_re(rx) and isinstance(s, str): return rx.sub(value, s) - except TypeError: + else: return s f = np.vectorize(re_replacer, otypes=[self.dtype]) @@ -3186,14 +3169,15 @@ def _safe_reshape(arr, new_shape): return arr -def _putmask_smart(v, m, n): +def _putmask_smart(v, mask, n): """ Return a new ndarray, try to preserve dtype if possible. Parameters ---------- v : `values`, updated in-place (array like) - m : `mask`, applies to both sides (array like) + mask : np.ndarray + Applies to both sides (array like). 
n : `new values` either scalar or an array like aligned with `values` Returns @@ -3211,12 +3195,12 @@ def _putmask_smart(v, m, n): # n should be the length of the mask or a scalar here if not is_list_like(n): - n = np.repeat(n, len(m)) + n = np.repeat(n, len(mask)) # see if we are only masking values that if putted # will work in the current dtype try: - nn = n[m] + nn = n[mask] except TypeError: # TypeError: only integer scalar arrays can be converted to a scalar index pass @@ -3241,16 +3225,16 @@ def _putmask_smart(v, m, n): comp = nn == nn_at if is_list_like(comp) and comp.all(): nv = v.copy() - nv[m] = nn_at + nv[mask] = nn_at return nv n = np.asarray(n) def _putmask_preserve(nv, n): try: - nv[m] = n[m] + nv[mask] = n[mask] except (IndexError, ValueError): - nv[m] = n + nv[mask] = n return nv # preserves dtype if possible diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 7e03b9544ee727..f1f4777cedbc57 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -5,7 +5,7 @@ """ import datetime import operator -from typing import Any, Callable, Tuple +from typing import Any, Callable, Tuple, Union import numpy as np @@ -34,10 +34,11 @@ ABCIndexClass, ABCSeries, ABCSparseSeries, + ABCTimedeltaArray, + ABCTimedeltaIndex, ) from pandas.core.dtypes.missing import isna, notna -import pandas as pd from pandas._typing import ArrayLike from pandas.core.construction import array, extract_array from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY, define_na_arithmetic_op @@ -148,6 +149,8 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): Be careful to call this *after* determining the `name` attribute to be attached to the result of the arithmetic operation. """ + from pandas.core.arrays import TimedeltaArray + if type(obj) is datetime.timedelta: # GH#22390 cast up to Timedelta to rely on Timedelta # implementation; otherwise operation against numeric-dtype @@ -157,23 +160,21 @@ def maybe_upcast_for_op(obj, shape: Tuple[int, ...]): if isna(obj): # wrapping timedelta64("NaT") in Timedelta returns NaT, # which would incorrectly be treated as a datetime-NaT, so - # we broadcast and wrap in a Series + # we broadcast and wrap in a TimedeltaArray + obj = obj.astype("timedelta64[ns]") right = np.broadcast_to(obj, shape) - - # Note: we use Series instead of TimedeltaIndex to avoid having - # to worry about catching NullFrequencyError. - return pd.Series(right) + return TimedeltaArray(right) # In particular non-nanosecond timedelta64 needs to be cast to # nanoseconds, or else we get undesired behavior like # np.timedelta64(3, 'D') / 2 == np.timedelta64(1, 'D') return Timedelta(obj) - elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj): + elif isinstance(obj, np.ndarray) and is_timedelta64_dtype(obj.dtype): # GH#22390 Unfortunately we need to special-case right-hand # timedelta64 dtypes because numpy casts integer dtypes to # timedelta64 when operating with timedelta64 - return pd.TimedeltaIndex(obj) + return TimedeltaArray._from_sequence(obj) return obj @@ -212,12 +213,6 @@ def _gen_eval_kwargs(name): # Exclude commutative operations kwargs["reversed"] = True - if name in ["truediv", "rtruediv"]: - kwargs["truediv"] = True - - if name in ["ne"]: - kwargs["masker"] = True - return kwargs @@ -246,7 +241,7 @@ def _get_frame_op_default_axis(name): return "columns" -def _get_opstr(op, cls): +def _get_opstr(op): """ Find the operation string, if any, to pass to numexpr for this operation. 
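An illustrative aside on why maybe_upcast_for_op above wraps a bare np.timedelta64 in Timedelta before dispatching: per the comment in the hunk, plain numpy division can truncate to the stored unit, whereas Timedelta keeps nanosecond resolution:

import numpy as np
import pandas as pd

# 3 days / 2 stays exact as a Timedelta (1 day 12 hours), rather than
# truncating to a whole number of days as the raw timedelta64 would
half = pd.Timedelta(np.timedelta64(3, "D")) / 2
assert half == pd.Timedelta("1 days 12:00:00")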
@@ -254,19 +249,11 @@ def _get_opstr(op, cls): Parameters ---------- op : binary operator - cls : class Returns ------- op_str : string or None """ - # numexpr is available for non-sparse classes - subtyp = getattr(cls, "_subtyp", "") - use_numexpr = "sparse" not in subtyp - - if not use_numexpr: - # if we're not using numexpr, then don't pass a str_rep - return None return { operator.add: "+", @@ -415,7 +402,7 @@ def should_extension_dispatch(left: ABCSeries, right: Any) -> bool: ): return True - if is_extension_array_dtype(right) and not is_scalar(right): + if not is_scalar(right) and is_extension_array_dtype(right): # GH#22378 disallow scalar to exclude e.g. "category", "Int64" return True @@ -520,13 +507,34 @@ def column_op(a, b): return result -def dispatch_to_extension_op(op, left, right): +def dispatch_to_extension_op( + op, + left: Union[ABCExtensionArray, np.ndarray], + right: Any, + keep_null_freq: bool = False, +): """ Assume that left or right is a Series backed by an ExtensionArray, apply the operator defined by op. + + Parameters + ---------- + op : binary operator + left : ExtensionArray or np.ndarray + right : object + keep_null_freq : bool, default False + Whether to re-raise a NullFrequencyError unchanged, as opposed to + catching and raising TypeError. + + Returns + ------- + ExtensionArray or np.ndarray + 2-tuple of these if op is divmod or rdivmod """ + # NB: left and right should already be unboxed, so neither should be + # a Series or Index. - if left.dtype.kind in "mM": + if left.dtype.kind in "mM" and isinstance(left, np.ndarray): # We need to cast datetime64 and timedelta64 ndarrays to # DatetimeArray/TimedeltaArray. But we avoid wrapping others in # PandasArray as that behaves poorly with e.g. IntegerArray. @@ -535,15 +543,15 @@ def dispatch_to_extension_op(op, left, right): # The op calls will raise TypeError if the op is not defined # on the ExtensionArray - # unbox Series and Index to arrays - new_left = extract_array(left, extract_numpy=True) - new_right = extract_array(right, extract_numpy=True) - try: - res_values = op(new_left, new_right) + res_values = op(left, right) except NullFrequencyError: # DatetimeIndex and TimedeltaIndex with freq == None raise ValueError # on add/sub of integers (or int-like). We re-raise as a TypeError. + if keep_null_freq: + # TODO: remove keep_null_freq after Timestamp+int deprecation + # GH#22535 is enforced + raise raise TypeError( "incompatible type for a datetime/timedelta " "operation [{name}]".format(name=op.__name__) @@ -602,7 +610,7 @@ def _arith_method_SERIES(cls, op, special): Wrapper function for Series arithmetic operations, to avoid code duplication. 
""" - str_rep = _get_opstr(op, cls) + str_rep = _get_opstr(op) op_name = _get_op_name(op, special) eval_kwargs = _gen_eval_kwargs(op_name) construct_result = ( @@ -615,25 +623,29 @@ def wrapper(left, right): if isinstance(right, ABCDataFrame): return NotImplemented + keep_null_freq = isinstance( + right, + (ABCDatetimeIndex, ABCDatetimeArray, ABCTimedeltaIndex, ABCTimedeltaArray), + ) + left, right = _align_method_SERIES(left, right) res_name = get_op_result_name(left, right) - right = maybe_upcast_for_op(right, left.shape) - if should_extension_dispatch(left, right): - result = dispatch_to_extension_op(op, left, right) + lvalues = extract_array(left, extract_numpy=True) + rvalues = extract_array(right, extract_numpy=True) - elif is_timedelta64_dtype(right) or isinstance( - right, (ABCDatetimeArray, ABCDatetimeIndex) - ): - # We should only get here with td64 right with non-scalar values - # for right upcast by maybe_upcast_for_op - assert not isinstance(right, (np.timedelta64, np.ndarray)) - result = op(left._values, right) + rvalues = maybe_upcast_for_op(rvalues, lvalues.shape) - else: - lvalues = extract_array(left, extract_numpy=True) - rvalues = extract_array(right, extract_numpy=True) + if should_extension_dispatch(lvalues, rvalues): + result = dispatch_to_extension_op(op, lvalues, rvalues, keep_null_freq) + + elif is_timedelta64_dtype(rvalues) or isinstance(rvalues, ABCDatetimeArray): + # We should only get here with td64 rvalues with non-scalar values + # for rvalues upcast by maybe_upcast_for_op + assert not isinstance(rvalues, (np.timedelta64, np.ndarray)) + result = dispatch_to_extension_op(op, lvalues, rvalues, keep_null_freq) + else: with np.errstate(all="ignore"): result = na_op(lvalues, rvalues) @@ -672,10 +684,7 @@ def na_op(x, y): return result - def wrapper(self, other, axis=None): - # Validate the axis parameter - if axis is not None: - self._get_axis_number(axis) + def wrapper(self, other): res_name = get_op_result_name(self, other) other = lib.item_from_zerodim(other) @@ -708,25 +717,25 @@ def wrapper(self, other, axis=None): if len(self) != len(other): raise ValueError("Lengths must match to compare") - if should_extension_dispatch(self, other): - res_values = dispatch_to_extension_op(op, self, other) + lvalues = extract_array(self, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True) + + if should_extension_dispatch(lvalues, rvalues): + res_values = dispatch_to_extension_op(op, lvalues, rvalues) - elif is_scalar(other) and isna(other): + elif is_scalar(rvalues) and isna(rvalues): # numpy does not like comparisons vs None if op is operator.ne: - res_values = np.ones(len(self), dtype=bool) + res_values = np.ones(len(lvalues), dtype=bool) else: - res_values = np.zeros(len(self), dtype=bool) + res_values = np.zeros(len(lvalues), dtype=bool) else: - lvalues = extract_array(self, extract_numpy=True) - rvalues = extract_array(other, extract_numpy=True) - with np.errstate(all="ignore"): res_values = na_op(lvalues, rvalues) if is_scalar(res_values): raise TypeError( - "Could not compare {typ} type with Series".format(typ=type(other)) + "Could not compare {typ} type with Series".format(typ=type(rvalues)) ) result = self._constructor(res_values, index=self.index) @@ -755,7 +764,7 @@ def na_op(x, y): assert not isinstance(y, (list, ABCSeries, ABCIndexClass)) if isinstance(y, np.ndarray): # bool-bool dtype operations should be OK, should not get here - assert not (is_bool_dtype(x) and is_bool_dtype(y)) + assert not (is_bool_dtype(x.dtype) and 
is_bool_dtype(y.dtype)) x = ensure_object(x) y = ensure_object(y) result = libops.vec_binop(x, y, op) @@ -783,7 +792,13 @@ def na_op(x, y): return result fill_int = lambda x: x.fillna(0) - fill_bool = lambda x: x.fillna(False).astype(bool) + + def fill_bool(x, left=None): + # if `left` is specifically not-boolean, we do not cast to bool + x = x.fillna(False) + if left is None or is_bool_dtype(left.dtype): + x = x.astype(bool) + return x def wrapper(self, other): is_self_int_dtype = is_integer_dtype(self.dtype) @@ -791,28 +806,40 @@ def wrapper(self, other): self, other = _align_method_SERIES(self, other, align_asobject=True) res_name = get_op_result_name(self, other) + # TODO: shouldn't we be applying finalize whenever + # not isinstance(other, ABCSeries)? + finalizer = ( + lambda x: x.__finalize__(self) + if not isinstance(other, (ABCSeries, ABCIndexClass)) + else x + ) + if isinstance(other, ABCDataFrame): # Defer to DataFrame implementation; fail early return NotImplemented + elif should_extension_dispatch(self, other): + lvalues = extract_array(self, extract_numpy=True) + rvalues = extract_array(other, extract_numpy=True) + res_values = dispatch_to_extension_op(op, lvalues, rvalues) + result = self._constructor(res_values, index=self.index, name=res_name) + return finalizer(result) + elif isinstance(other, (ABCSeries, ABCIndexClass)): is_other_int_dtype = is_integer_dtype(other.dtype) - other = fill_int(other) if is_other_int_dtype else fill_bool(other) - - ovalues = other.values - finalizer = lambda x: x + other = other if is_other_int_dtype else fill_bool(other, self) else: # scalars, list, tuple, np.array - is_other_int_dtype = is_integer_dtype(np.asarray(other)) + is_other_int_dtype = is_integer_dtype(np.asarray(other).dtype) if is_list_like(other) and not isinstance(other, np.ndarray): # TODO: Can we do this before the is_integer_dtype check? # could the is_integer_dtype check be checking the wrong # thing? e.g. other = [[0, 1], [2, 3], [4, 5]]? other = construct_1d_object_array_from_listlike(other) - ovalues = other - finalizer = lambda x: x.__finalize__(self) + # TODO: use extract_array once we handle EA correctly, see GH#27959 + ovalues = lib.values_from_object(other) # For int vs int `^`, `|`, `&` are bitwise operators and return # integer dtypes. 
Otherwise these are boolean ops @@ -958,7 +985,7 @@ def to_series(right): def _arith_method_FRAME(cls, op, special): - str_rep = _get_opstr(op, cls) + str_rep = _get_opstr(op) op_name = _get_op_name(op, special) eval_kwargs = _gen_eval_kwargs(op_name) default_axis = _get_frame_op_default_axis(op_name) @@ -988,10 +1015,10 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): self, other, pass_op, fill_value=fill_value, axis=axis, level=level ) else: + # in this case we always have `np.ndim(other) == 0` if fill_value is not None: self = self.fillna(fill_value) - assert np.ndim(other) == 0 return self._combine_const(other, op) f.__name__ = op_name @@ -1000,7 +1027,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): def _flex_comp_method_FRAME(cls, op, special): - str_rep = _get_opstr(op, cls) + str_rep = _get_opstr(op) op_name = _get_op_name(op, special) default_axis = _get_frame_op_default_axis(op_name) @@ -1032,7 +1059,7 @@ def f(self, other, axis=default_axis, level=None): self, other, na_op, fill_value=None, axis=axis, level=level ) else: - assert np.ndim(other) == 0, other + # in this case we always have `np.ndim(other) == 0` return self._combine_const(other, na_op) f.__name__ = op_name @@ -1041,7 +1068,7 @@ def f(self, other, axis=default_axis, level=None): def _comp_method_FRAME(cls, func, special): - str_rep = _get_opstr(func, cls) + str_rep = _get_opstr(func) op_name = _get_op_name(func, special) @Appender("Wrapper for comparison method {name}".format(name=op_name)) @@ -1066,7 +1093,7 @@ def f(self, other): # straight boolean comparisons we want to allow all columns # (regardless of dtype to pass thru) See #4537 for discussion. res = self._combine_const(other, func) - return res.fillna(True).astype(bool) + return res f.__name__ = op_name diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 523ba5d42a69cf..f5f6d77676f1f3 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -11,7 +11,7 @@ find_common_type, maybe_upcast_putmask, ) -from pandas.core.dtypes.common import is_object_dtype, is_period_dtype, is_scalar +from pandas.core.dtypes.common import is_object_dtype, is_scalar from pandas.core.dtypes.generic import ABCIndex, ABCSeries from pandas.core.dtypes.missing import notna @@ -57,9 +57,9 @@ def masked_arith_op(x, y, op): dtype = find_common_type([x.dtype, y.dtype]) result = np.empty(x.size, dtype=dtype) - # PeriodIndex.ravel() returns int64 dtype, so we have - # to work around that case. See GH#19956 - yrav = y if is_period_dtype(y) else y.ravel() + # NB: ravel() is only safe since y is ndarray; for e.g. PeriodIndex + # we would get int64 dtype, see GH#19956 + yrav = y.ravel() mask = notna(xrav) & notna(yrav) if yrav.shape != mask.shape: @@ -82,9 +82,9 @@ def masked_arith_op(x, y, op): mask = notna(xrav) # 1 ** np.nan is 1. So we have to unmask those. - if op == pow: + if op is pow: mask = np.where(x == 1, False, mask) - elif op == rpow: + elif op is rpow: mask = np.where(y == 1, False, mask) if mask.any(): diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index 01bc345a40b83c..45fa6a2830af64 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -40,7 +40,7 @@ def fill_zeros(result, x, y, name, fill): Mask the nan's from x. 
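For orientation (example only): the zero-division masking in pandas/core/ops/missing.py, of which fill_zeros above is part, is what gives integer division by zero float results instead of numpy's zeros-plus-warning; the dtype-based checks in the hunk do not change that behavior:

import numpy as np
import pandas as pd

s = pd.Series([1, 0, -1])
out = s // 0
# positive/zero/negative numerators map to inf, NaN and -inf
assert np.isinf(out.iloc[0]) and out.iloc[0] > 0
assert np.isnan(out.iloc[1])
assert np.isinf(out.iloc[2]) and out.iloc[2] < 0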
""" - if fill is None or is_float_dtype(result): + if fill is None or is_float_dtype(result.dtype): return result if name.startswith(("r", "__r")): @@ -55,7 +55,7 @@ def fill_zeros(result, x, y, name, fill): if is_scalar_type: y = np.array(y) - if is_integer_dtype(y): + if is_integer_dtype(y.dtype): if (y == 0).any(): diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 225de3f11cf7d7..d7fbe464cb1e52 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -178,7 +178,7 @@ def merge_ordered( """ Perform merge with optional filling/interpolation designed for ordered data like time series data. Optionally perform group-wise merge (see - examples) + examples). Parameters ---------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 8b6c963e40e9d7..10d50e89ca92eb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1114,9 +1114,6 @@ def __getitem__(self, key): return self.__getitem__(new_key) raise - except Exception: - raise - if is_iterator(key): key = list(key) @@ -3620,7 +3617,7 @@ def explode(self) -> "Series": Series.str.split : Split string values on specified separator. Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame. - DataFrame.melt : Unpivot a DataFrame from wide format to long format + DataFrame.melt : Unpivot a DataFrame from wide format to long format. DataFrame.explode : Explode a DataFrame from list-like columns to long format. diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 5db31fe6664eaf..e6edad656d430e 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -271,7 +271,6 @@ def nargsort(items, kind="quicksort", ascending=True, na_position="last"): class _KeyMapper: - """ Ease my suffering. Map compressed group id -> key tuple """ diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index f5add426297a73..3d6ba0b8d97745 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -569,15 +569,15 @@ def _combine_frame(self, other, func, fill_value=None, level=None): ).__finalize__(self) def _combine_match_index(self, other, func, level=None): - new_data = {} if level is not None: raise NotImplementedError("'level' argument is not supported") this, other = self.align(other, join="outer", axis=0, level=level, copy=False) - for col, series in this.items(): - new_data[col] = func(series.values, other.values) + new_data = {} + for col in this.columns: + new_data[col] = func(this[col], other) fill_value = self._get_op_result_fill_value(other, func) @@ -603,7 +603,7 @@ def _combine_match_columns(self, other, func, level=None): new_data = {} for col in left.columns: - new_data[col] = func(left[col], float(right[col])) + new_data[col] = func(left[col], right[col]) return self._constructor( new_data, diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 73e126cf230a5e..bcdbf0855cbb49 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -58,7 +58,7 @@ def hash_pandas_object( obj, index=True, encoding="utf8", hash_key=None, categorize=True ): """ - Return a data hash of the Index/Series/DataFrame + Return a data hash of the Index/Series/DataFrame. 
Parameters ---------- diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 0ce6d5ddec2ad7..40e6c679ba72d8 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -206,8 +206,8 @@ def _constructor(self): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index c43ca6b0565f36..47bd8f2ec593b5 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -136,8 +136,8 @@ def _get_window(self, other=None, **kwargs): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 323089b3fdf6b4..29ef2e917ae57f 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -901,12 +901,12 @@ def func(arg, window, min_periods=None, closed=None): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - result, how = self._aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + result, how = self._aggregate(func, *args, **kwargs) if result is None: # these must apply directly - result = arg(self) + result = func(self) return result @@ -1653,7 +1653,10 @@ def is_datetimelike(self): def _on(self): if self.on is None: - return self.obj.index + if self.axis == 0: + return self.obj.index + elif self.axis == 1: + return self.obj.columns elif isinstance(self.obj, ABCDataFrame) and self.on in self.obj.columns: return Index(self.obj[self.on]) else: @@ -1788,8 +1791,8 @@ def _validate_freq(self): axis="", ) @Appender(_shared_docs["aggregate"]) - def aggregate(self, arg, *args, **kwargs): - return super().aggregate(arg, *args, **kwargs) + def aggregate(self, func, *args, **kwargs): + return super().aggregate(func, *args, **kwargs) agg = aggregate diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 3177937ac4ba19..a85fc8bfb14142 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -4,7 +4,7 @@ Expose public exceptions & warnings """ -from pandas._libs.tslibs import OutOfBoundsDatetime +from pandas._libs.tslibs import NullFrequencyError, OutOfBoundsDatetime class PerformanceWarning(Warning): @@ -157,14 +157,6 @@ class MergeError(ValueError): """ -class NullFrequencyError(ValueError): - """ - Error raised when a null `freq` attribute is used in an operation - that needs a non-null frequency, particularly `DatetimeIndex.shift`, - `TimedeltaIndex.shift`, `PeriodIndex.shift`. - """ - - class AccessorRegistrationWarning(Warning): """Warning for attribute conflicts in accessor registration.""" diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index d38221d7842739..76c01535a26e79 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -9,8 +9,7 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover r""" - Read text from clipboard and pass to read_csv. See read_csv for the - full argument list + Read text from clipboard and pass to read_csv. 
Parameters ---------- @@ -18,9 +17,13 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover A string or regex delimiter. The default of '\s+' denotes one or more whitespace characters. + **kwargs + See read_csv for the full argument list. + Returns ------- - parsed : DataFrame + DataFrame + A parsed DataFrame object. """ encoding = kwargs.pop("encoding", "utf-8") diff --git a/pandas/io/common.py b/pandas/io/common.py index 26b68dda7b464a..0bbac8a8b7c1cf 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -4,13 +4,23 @@ import codecs import csv import gzip -from http.client import HTTPException # noqa from io import BufferedIOBase, BytesIO import mmap import os import pathlib -from typing import IO, AnyStr, BinaryIO, Optional, TextIO, Type -from urllib.error import URLError # noqa +from typing import ( + IO, + Any, + AnyStr, + BinaryIO, + Dict, + List, + Optional, + TextIO, + Tuple, + Type, + Union, +) from urllib.parse import ( # noqa urlencode, urljoin, @@ -19,7 +29,6 @@ uses_params, uses_relative, ) -from urllib.request import pathname2url, urlopen import zipfile from pandas.compat import _get_lzma_file, _import_lzma @@ -81,7 +90,8 @@ def __next__(self): def _is_url(url) -> bool: - """Check to see if a URL has a valid protocol. + """ + Check to see if a URL has a valid protocol. Parameters ---------- @@ -92,10 +102,9 @@ def _is_url(url) -> bool: isurl : bool If `url` has a valid protocol return True otherwise False. """ - try: - return parse_url(url).scheme in _VALID_URLS - except Exception: + if not isinstance(url, str): return False + return parse_url(url).scheme in _VALID_URLS def _expand_user( @@ -162,18 +171,26 @@ def _stringify_path( def is_s3_url(url) -> bool: """Check for an s3, s3n, or s3a url""" - try: - return parse_url(url).scheme in ["s3", "s3n", "s3a"] - except Exception: + if not isinstance(url, str): return False + return parse_url(url).scheme in ["s3", "s3n", "s3a"] def is_gcs_url(url) -> bool: """Check for a gcs url""" - try: - return parse_url(url).scheme in ["gcs", "gs"] - except Exception: + if not isinstance(url, str): return False + return parse_url(url).scheme in ["gcs", "gs"] + + +def urlopen(*args, **kwargs): + """ + Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of + the stdlib. + """ + import urllib.request + + return urllib.request.urlopen(*args, **kwargs) def get_filepath_or_buffer( @@ -249,12 +266,49 @@ def file_path_to_url(path: str) -> str: ------- a valid FILE URL """ + # lazify expensive import (~30ms) + from urllib.request import pathname2url + return urljoin("file:", pathname2url(path)) _compression_to_extension = {"gzip": ".gz", "bz2": ".bz2", "zip": ".zip", "xz": ".xz"} +def _get_compression_method( + compression: Optional[Union[str, Dict[str, str]]] +) -> Tuple[Optional[str], Dict[str, str]]: + """ + Simplifies a compression argument to a compression method string and + a dict containing additional arguments. + + Parameters + ---------- + compression : str or dict + If string, specifies the compression method. If dict, value at key + 'method' specifies compression method. 
+ + Returns + ------- + tuple of ({compression method}, Optional[str] + {compression arguments}, Dict[str, str]) + + Raises + ------ + ValueError on dict missing 'method' key + """ + # Handle dict + if isinstance(compression, dict): + compression_args = compression.copy() + try: + compression = compression_args.pop("method") + except KeyError: + raise ValueError("If dict, compression must have key 'method'") + else: + compression_args = {} + return compression, compression_args + + def _infer_compression( filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] ) -> Optional[str]: @@ -266,8 +320,8 @@ def _infer_compression( Parameters ---------- - filepath_or_buffer : - a path (str) or buffer + filepath_or_buffer : str or file handle + File path or object. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None} If 'infer' and `filepath_or_buffer` is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', @@ -275,12 +329,11 @@ def _infer_compression( Returns ------- - string or None : - compression method + string or None Raises ------ - ValueError on invalid compression specified + ValueError on invalid compression specified. """ # No compression has been explicitly specified @@ -312,32 +365,49 @@ def _infer_compression( def _get_handle( - path_or_buf, mode, encoding=None, compression=None, memory_map=False, is_text=True + path_or_buf, + mode: str, + encoding=None, + compression: Optional[Union[str, Dict[str, Any]]] = None, + memory_map: bool = False, + is_text: bool = True, ): """ Get file handle for given path/buffer and mode. Parameters ---------- - path_or_buf : - a path (str) or buffer + path_or_buf : str or file handle + File path or object. mode : str - mode to open path_or_buf with + Mode to open path_or_buf with. encoding : str or None - compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None - If 'infer' and `filepath_or_buffer` is path-like, then detect - compression from the following extensions: '.gz', '.bz2', '.zip', - or '.xz' (otherwise no compression). + Encoding to use. + compression : str or dict, default None + If string, specifies compression mode. If dict, value at key 'method' + specifies compression mode. Compression mode must be one of {'infer', + 'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer' + and `filepath_or_buffer` is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise + no compression). If dict and compression mode is 'zip' or inferred as + 'zip', other entries passed as additional compression options. + + .. versionchanged:: 1.0.0 + + May now be a dict with key 'method' as compression mode + and other keys as compression options if compression + mode is 'zip'. + memory_map : boolean, default False See parsers._parser_params for more information. is_text : boolean, default True whether file/buffer is in text format (csv, json, etc.), or in binary - mode (pickle, etc.) + mode (pickle, etc.). Returns ------- f : file-like - A file-like object + A file-like object. handles : list of file-like objects A list of file-like object that were opened in this function. 
""" @@ -346,15 +416,16 @@ def _get_handle( need_text_wrapping = (BufferedIOBase, S3File) except ImportError: - need_text_wrapping = BufferedIOBase + need_text_wrapping = BufferedIOBase # type: ignore - handles = list() + handles = list() # type: List[IO] f = path_or_buf # Convert pathlib.Path/py.path.local or string path_or_buf = _stringify_path(path_or_buf) is_path = isinstance(path_or_buf, str) + compression, compression_args = _get_compression_method(compression) if is_path: compression = _infer_compression(path_or_buf, compression) @@ -376,7 +447,7 @@ def _get_handle( # ZIP Compression elif compression == "zip": - zf = BytesZipFile(path_or_buf, mode) + zf = BytesZipFile(path_or_buf, mode, **compression_args) # Ensure the container is closed as well. handles.append(zf) if zf.mode == "w": @@ -429,9 +500,9 @@ def _get_handle( if memory_map and hasattr(f, "fileno"): try: - g = MMapWrapper(f) + wrapped = MMapWrapper(f) f.close() - f = g + f = wrapped except Exception: # we catch any errors that may have occurred # because that is consistent with the lower-level @@ -456,15 +527,19 @@ def __init__( self, file: FilePathOrBuffer, mode: str, - compression: int = zipfile.ZIP_DEFLATED, + archive_name: Optional[str] = None, **kwargs ): if mode in ["wb", "rb"]: mode = mode.replace("b", "") - super().__init__(file, mode, compression, **kwargs) + self.archive_name = archive_name + super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs) def write(self, data): - super().writestr(self.filename, data) + archive_name = self.filename + if self.archive_name is not None: + archive_name = self.archive_name + super().writestr(archive_name, data) @property def closed(self): @@ -509,7 +584,6 @@ def __next__(self) -> str: class UTF8Recoder(BaseIterator): - """ Iterator that reads an encoded stream and re-encodes the input to UTF-8 """ diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 154656fbb250b5..6dba5e042562b7 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -4,7 +4,6 @@ from io import BytesIO import os from textwrap import fill -from urllib.request import urlopen from pandas._config import config @@ -21,6 +20,7 @@ _stringify_path, _validate_header_arg, get_filepath_or_buffer, + urlopen, ) from pandas.io.excel._util import ( _fill_mi_header, @@ -112,7 +112,7 @@ engine : str, default None If io is not a buffer or path, this must be set to identify io. - Acceptable values are None or xlrd. + Acceptable values are None, "xlrd", "openpyxl" or "odf". converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one @@ -783,11 +783,12 @@ class ExcelFile: Parameters ---------- io : string, path object (pathlib.Path or py._path.local.LocalPath), - file-like object or xlrd workbook - If a string or path object, expected to be a path to xls or xlsx file. + a file-like object, xlrd workbook or openpypl workbook. + If a string or path object, expected to be a path to xls, xlsx or odf file. engine : string, default None If io is not a buffer or path, this must be set to identify io. - Acceptable values are None or ``xlrd``. + Acceptable values are None, ``xlrd``, ``openpyxl`` or ``odf``. + Note that ``odf`` reads tables out of OpenDocument formatted files. """ from pandas.io.excel._odfreader import _ODFReader @@ -837,10 +838,10 @@ def parse( **kwds ): """ - Parse specified sheet(s) into a DataFrame + Parse specified sheet(s) into a DataFrame. 
Equivalent to read_excel(ExcelFile, ...) See the read_excel - docstring for more info on accepted parameters + docstring for more info on accepted parameters. Returns ------- diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 60daf311397e80..e25862537cbfc5 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -22,6 +22,7 @@ from pandas.io.common import ( UnicodeWriter, + _get_compression_method, _get_handle, _infer_compression, get_filepath_or_buffer, @@ -58,6 +59,9 @@ def __init__( if path_or_buf is None: path_or_buf = StringIO() + # Extract compression mode as given, if dict + compression, self.compression_args = _get_compression_method(compression) + self.path_or_buf, _, _, _ = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, mode=mode ) @@ -178,7 +182,7 @@ def save(self): self.path_or_buf, self.mode, encoding=self.encoding, - compression=self.compression, + compression=dict(self.compression_args, method=self.compression), ) close = True @@ -206,11 +210,13 @@ def save(self): if hasattr(self.path_or_buf, "write"): self.path_or_buf.write(buf) else: + compression = dict(self.compression_args, method=self.compression) + f, handles = _get_handle( self.path_or_buf, self.mode, encoding=self.encoding, - compression=self.compression, + compression=compression, ) f.write(buf) close = True diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 61af935bd82276..4a66ad48d13185 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -5,6 +5,7 @@ import codecs from contextlib import contextmanager +from datetime import tzinfo import decimal from functools import partial from io import StringIO @@ -27,8 +28,6 @@ ) from unicodedata import east_asian_width -from dateutil.tz.tz import tzutc -from dateutil.zoneinfo import tzfile import numpy as np from pandas._config.config import get_option, set_option @@ -549,7 +548,8 @@ def __init__( decimal: str = ".", table_id: Optional[str] = None, render_links: bool = False, - **kwds + bold_rows: bool = False, + escape: bool = True, ): self.frame = frame self.show_index_names = index_names @@ -580,7 +580,8 @@ def __init__( else: self.justify = justify - self.kwds = kwds + self.bold_rows = bold_rows + self.escape = escape if columns is not None: self.columns = ensure_index(columns) @@ -886,6 +887,8 @@ def to_latex( multicolumn: bool = False, multicolumn_format: Optional[str] = None, multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, ) -> Optional[str]: """ Render a DataFrame to a LaTeX tabular/longtable environment output. 
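# --- Illustrative sketch, not part of the diff: the new ``caption``/``label``
# options threaded through ``DataFrameFormatter.to_latex`` wrap the tabular in
# a table environment. Assuming the options are also exposed on
# ``DataFrame.to_latex`` (an assumption, not shown in this diff), usage would
# look roughly like this:
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
tex = df.to_latex(caption="Demo table", label="tab:demo")
# Expected shape of the rendered output:
#   \begin{table}
#   \centering
#   \caption{Demo table}
#   \label{tab:demo}
#   \begin{tabular}{lrr}
#   ...
#   \end{tabular}
#   \end{table}
print(tex)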
@@ -900,6 +903,8 @@ def to_latex( multicolumn=multicolumn, multicolumn_format=multicolumn_format, multirow=multirow, + caption=caption, + label=label, ).get_result(buf=buf, encoding=encoding) def _format_col(self, i: int) -> List[str]: @@ -1546,9 +1551,7 @@ def _is_dates_only( def _format_datetime64( - x: Union[NaTType, Timestamp], - tz: Optional[Union[tzfile, tzutc]] = None, - nat_rep: str = "NaT", + x: Union[NaTType, Timestamp], tz: Optional[tzinfo] = None, nat_rep: str = "NaT" ) -> str: if x is None or (is_scalar(x) and isna(x)): return nat_rep diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 4b44893df70ed5..8c4a7f4a1213d9 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -37,7 +37,7 @@ class HTMLFormatter(TableFormatter): def __init__( self, formatter: DataFrameFormatter, - classes: Optional[Union[str, List, Tuple]] = None, + classes: Optional[Union[str, List[str], Tuple[str, ...]]] = None, border: Optional[int] = None, ) -> None: self.fmt = formatter @@ -46,11 +46,11 @@ def __init__( self.frame = self.fmt.frame self.columns = self.fmt.tr_frame.columns self.elements = [] # type: List[str] - self.bold_rows = self.fmt.kwds.get("bold_rows", False) - self.escape = self.fmt.kwds.get("escape", True) + self.bold_rows = self.fmt.bold_rows + self.escape = self.fmt.escape self.show_dimensions = self.fmt.show_dimensions if border is None: - border = get_option("display.html.border") + border = cast(int, get_option("display.html.border")) self.border = border self.table_id = self.fmt.table_id self.render_links = self.fmt.render_links diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index c60e15b733f0a9..ca9db88ae7be46 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -36,19 +36,25 @@ def __init__( multicolumn: bool = False, multicolumn_format: Optional[str] = None, multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, ): self.fmt = formatter self.frame = self.fmt.frame - self.bold_rows = self.fmt.kwds.get("bold_rows", False) + self.bold_rows = self.fmt.bold_rows self.column_format = column_format self.longtable = longtable self.multicolumn = multicolumn self.multicolumn_format = multicolumn_format self.multirow = multirow + self.caption = caption + self.label = label + self.escape = self.fmt.escape def write_result(self, buf: IO[str]) -> None: """ - Render a DataFrame to a LaTeX tabular/longtable environment output. + Render a DataFrame to a LaTeX tabular, longtable, or table/tabular + environment output. 
""" # string representation of the columns @@ -113,12 +119,12 @@ def pad_empties(x): "not {typ}".format(typ=type(column_format)) ) - if not self.longtable: - buf.write("\\begin{{tabular}}{{{fmt}}}\n".format(fmt=column_format)) - buf.write("\\toprule\n") + if self.longtable: + self._write_longtable_begin(buf, column_format) else: - buf.write("\\begin{{longtable}}{{{fmt}}}\n".format(fmt=column_format)) - buf.write("\\toprule\n") + self._write_tabular_begin(buf, column_format) + + buf.write("\\toprule\n") ilevels = self.frame.index.nlevels clevels = self.frame.columns.nlevels @@ -142,7 +148,7 @@ def pad_empties(x): buf.write("\\endfoot\n\n") buf.write("\\bottomrule\n") buf.write("\\endlastfoot\n") - if self.fmt.kwds.get("escape", True): + if self.escape: # escape backslashes first crow = [ ( @@ -182,11 +188,10 @@ def pad_empties(x): if self.multirow and i < len(strrows) - 1: self._print_cline(buf, i, len(strcols)) - if not self.longtable: - buf.write("\\bottomrule\n") - buf.write("\\end{tabular}\n") + if self.longtable: + self._write_longtable_end(buf) else: - buf.write("\\end{longtable}\n") + self._write_tabular_end(buf) def _format_multicolumn(self, row: List[str], ilevels: int) -> List[str]: r""" @@ -267,3 +272,107 @@ def _print_cline(self, buf: IO[str], i: int, icol: int) -> None: buf.write("\\cline{{{cl:d}-{icol:d}}}\n".format(cl=cl[1], icol=icol)) # remove entries that have been written to buffer self.clinebuf = [x for x in self.clinebuf if x[0] != i] + + def _write_tabular_begin(self, buf, column_format): + """ + Write the beginning of a tabular environment or + nested table/tabular environments including caption and label. + + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. + column_format : str, default None + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' + for 3 columns + + """ + if self.caption is not None or self.label is not None: + # then write output in a nested table/tabular environment + if self.caption is None: + caption_ = "" + else: + caption_ = "\n\\caption{{{}}}".format(self.caption) + + if self.label is None: + label_ = "" + else: + label_ = "\n\\label{{{}}}".format(self.label) + + buf.write("\\begin{{table}}\n\\centering{}{}\n".format(caption_, label_)) + else: + # then write output only in a tabular environment + pass + + buf.write("\\begin{{tabular}}{{{fmt}}}\n".format(fmt=column_format)) + + def _write_tabular_end(self, buf): + """ + Write the end of a tabular environment or nested table/tabular + environment. + + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. + + """ + buf.write("\\bottomrule\n") + buf.write("\\end{tabular}\n") + if self.caption is not None or self.label is not None: + buf.write("\\end{table}\n") + else: + pass + + def _write_longtable_begin(self, buf, column_format): + """ + Write the beginning of a longtable environment including caption and + label if provided by user. + + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. 
+ column_format : str, default None + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' + for 3 columns + + """ + buf.write("\\begin{{longtable}}{{{fmt}}}\n".format(fmt=column_format)) + + if self.caption is not None or self.label is not None: + if self.caption is None: + pass + else: + buf.write("\\caption{{{}}}".format(self.caption)) + + if self.label is None: + pass + else: + buf.write("\\label{{{}}}".format(self.label)) + + # a double-backslash is required at the end of the line + # as discussed here: + # https://tex.stackexchange.com/questions/219138 + buf.write("\\\\\n") + else: + pass + + @staticmethod + def _write_longtable_end(buf): + """ + Write the end of a longtable environment. + + Parameters + ---------- + buf : string or file handle + File path or object. If not specified, the result is returned as + a string. + + """ + buf.write("\\end{longtable}\n") diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 4ec9094ce4abe4..ead51693da7919 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -3,12 +3,14 @@ """ import sys -from typing import Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple, Union from pandas._config import get_option from pandas.core.dtypes.inference import is_sequence +EscapeChars = Union[Dict[str, str], Iterable[str]] + def adjoin(space: int, *lists: List[str], **kwargs) -> str: """ @@ -148,19 +150,16 @@ def _pprint_dict( def pprint_thing( - thing, + thing: Any, _nest_lvl: int = 0, - escape_chars: Optional[Union[Dict[str, str], Iterable[str]]] = None, + escape_chars: Optional[EscapeChars] = None, default_escapes: bool = False, quote_strings: bool = False, max_seq_items: Optional[int] = None, ) -> str: """ This function is the sanctioned way of converting objects - to a unicode representation. - - properly handles nested sequences containing unicode strings - (unicode(object) does not) + to a string representation and properly handles nested sequences. Parameters ---------- @@ -178,21 +177,13 @@ def pprint_thing( Returns ------- - result - unicode str + str """ - def as_escaped_unicode(thing, escape_chars=escape_chars): - # Unicode is fine, else we try to decode using utf-8 and 'replace' - # if that's not it either, we have no way of knowing and the user - # should deal with it himself. 
- - try: - result = str(thing) # we should try this first - except UnicodeDecodeError: - # either utf-8 or we replace errors - result = str(thing).decode("utf-8", "replace") - + def as_escaped_string( + thing: Any, escape_chars: Optional[EscapeChars] = escape_chars + ) -> str: translate = {"\t": r"\t", "\n": r"\n", "\r": r"\r"} if isinstance(escape_chars, dict): if default_escapes: @@ -202,10 +193,11 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): escape_chars = list(escape_chars.keys()) else: escape_chars = escape_chars or tuple() + + result = str(thing) for c in escape_chars: result = result.replace(c, translate[c]) - - return str(result) + return result if hasattr(thing, "__next__"): return str(thing) @@ -224,11 +216,11 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): max_seq_items=max_seq_items, ) elif isinstance(thing, str) and quote_strings: - result = "'{thing}'".format(thing=as_escaped_unicode(thing)) + result = "'{thing}'".format(thing=as_escaped_string(thing)) else: - result = as_escaped_unicode(thing) + result = as_escaped_string(thing) - return str(result) # always unicode + return result def pprint_thing_encoded( diff --git a/pandas/io/html.py b/pandas/io/html.py index 9d2647f226f009..490c574463b9bd 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -1,4 +1,5 @@ -""":mod:`pandas.io.html` is a module containing functionality for dealing with +""" +:mod:`pandas.io.html` is a module containing functionality for dealing with HTML IO. """ @@ -58,7 +59,8 @@ def _importers(): def _remove_whitespace(s, regex=_RE_WHITESPACE): - """Replace extra whitespace inside of a string with a single space. + """ + Replace extra whitespace inside of a string with a single space. Parameters ---------- @@ -77,7 +79,8 @@ def _remove_whitespace(s, regex=_RE_WHITESPACE): def _get_skiprows(skiprows): - """Get an iterator given an integer, slice or container. + """ + Get an iterator given an integer, slice or container. Parameters ---------- @@ -107,7 +110,8 @@ def _get_skiprows(skiprows): def _read(obj): - """Try to read from a url, file or string. + """ + Try to read from a url, file or string. Parameters ---------- @@ -136,7 +140,8 @@ def _read(obj): class _HtmlFrameParser: - """Base class for parsers that parse HTML into DataFrames. + """ + Base class for parsers that parse HTML into DataFrames. Parameters ---------- @@ -515,7 +520,8 @@ def _handle_hidden_tables(self, tbl_list, attr_name): class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): - """HTML to DataFrame parser that uses BeautifulSoup under the hood. + """ + HTML to DataFrame parser that uses BeautifulSoup under the hood. See Also -------- @@ -622,7 +628,8 @@ def _build_xpath_expr(attrs): class _LxmlFrameParser(_HtmlFrameParser): - """HTML to DataFrame parser that uses lxml under the hood. + """ + HTML to DataFrame parser that uses lxml under the hood. Warning ------- @@ -937,7 +944,8 @@ def read_html( keep_default_na=True, displayed_only=True, ): - r"""Read HTML tables into a ``list`` of ``DataFrame`` objects. + r""" + Read HTML tables into a ``list`` of ``DataFrame`` objects. 
Parameters ---------- diff --git a/pandas/io/msgpack/__init__.py b/pandas/io/msgpack/__init__.py index 9b09cffd83f755..7107263c180cb1 100644 --- a/pandas/io/msgpack/__init__.py +++ b/pandas/io/msgpack/__init__.py @@ -2,8 +2,8 @@ from collections import namedtuple -from pandas.io.msgpack.exceptions import * # noqa -from pandas.io.msgpack._version import version # noqa +from pandas.io.msgpack.exceptions import * # noqa: F401,F403 isort:skip +from pandas.io.msgpack._version import version # noqa: F401 isort:skip class ExtType(namedtuple("ExtType", "code data")): @@ -19,10 +19,14 @@ def __new__(cls, code, data): return super().__new__(cls, code, data) -import os # noqa +import os # noqa: F401,E402 isort:skip -from pandas.io.msgpack._packer import Packer # noqa -from pandas.io.msgpack._unpacker import unpack, unpackb, Unpacker # noqa +from pandas.io.msgpack._unpacker import ( # noqa: F401,E402 isort:skip + Unpacker, + unpack, + unpackb, +) +from pandas.io.msgpack._packer import Packer # noqa: E402 isort:skip def pack(o, stream, **kwargs): diff --git a/pandas/io/msgpack/_packer.pyi b/pandas/io/msgpack/_packer.pyi new file mode 100644 index 00000000000000..e95a1622c56153 --- /dev/null +++ b/pandas/io/msgpack/_packer.pyi @@ -0,0 +1,22 @@ +# flake8: noqa + +class Packer: + def __cinit__(self): ... + def __init__( + self, + default=..., + encoding=..., + unicode_errors=..., + use_single_float=..., + autoreset: int = ..., + use_bin_type: int = ..., + ): ... + def __dealloc__(self): ... + def _pack(self, o, nest_limit: int = ...) -> int: ... + def pack(self, obj): ... + def pack_ext_type(self, typecode, data): ... + def pack_array_header(self, size): ... + def pack_map_header(self, size): ... + def pack_map_pairs(self, pairs): ... + def reset(self) -> None: ... + def bytes(self): ... diff --git a/pandas/io/msgpack/_unpacker.pyi b/pandas/io/msgpack/_unpacker.pyi new file mode 100644 index 00000000000000..9910895947fb64 --- /dev/null +++ b/pandas/io/msgpack/_unpacker.pyi @@ -0,0 +1,59 @@ +# flake8: noqa + +def unpackb( + packed, + object_hook=..., + list_hook=..., + use_list=..., + encoding=..., + unicode_errors=..., + object_pairs_hook=..., + ext_hook=..., + max_str_len=..., + max_bin_len=..., + max_array_len=..., + max_map_len=..., + max_ext_len=..., +): ... +def unpack( + stream, + object_hook=..., + list_hook=..., + use_list=..., + encoding=..., + unicode_errors=..., + object_pairs_hook=..., +): ... + +class Unpacker: + def __cinit__(self): ... + def __dealloc__(self): ... + def __init__( + self, + file_like=..., + read_size=..., + use_list=..., + object_hook=..., + object_pairs_hook=..., + list_hook=..., + encoding=..., + unicode_errors=..., + max_buffer_size: int = ..., + ext_hook=..., + max_str_len=..., + max_bin_len=..., + max_array_len=..., + max_map_len=..., + max_ext_len=..., + ): ... + def feed(self, next_bytes): ... + def append_buffer(self, _buf, _buf_len): ... + def read_from_file(self): ... + def _unpack(self, execute, write_bytes, iter=...): ... + def read_bytes(self, nbytes): ... + def unpack(self, write_bytes=...): ... + def skip(self, write_bytes=...): ... + def read_array_header(self, write_bytes=...): ... + def read_map_header(self, write_bytes=...): ... + def __iter__(self): ... + def __next__(self): ... 
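# --- Illustrative sketch, not part of the diff: a round trip through the
# ``Packer``/``unpackb`` signatures described by the new ``_packer.pyi`` and
# ``_unpacker.pyi`` stub files. ``pandas.io.msgpack`` is an internal, vendored
# module; the exact runtime defaults used below are assumed, not guaranteed.
from pandas.io.msgpack import Packer, unpackb

packed = Packer().pack([1, 2, 3])  # returns the packed bytes (autoreset on)
assert unpackb(packed, use_list=True) == [1, 2, 3]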
diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 04e49708ff082b..ad47ba23b9221d 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -846,7 +846,6 @@ def __init__( class Iterator: - """ manage the unpacking iteration, close the file on completion """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a3ff837bc7f52c..72f1adf0aad3dc 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1064,7 +1064,6 @@ def _clean_options(self, options, engine): ) if result.get(arg, depr_default) != depr_default: - # raise Exception(result.get(arg, depr_default), depr_default) depr_warning += msg + "\n\n" else: result[arg] = parser_default diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 4e390de87fc607..4b9a52a1fb8f33 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -153,10 +153,10 @@ def read_pickle(path, compression="infer"): # We want to silence any warnings about, e.g. moved modules. warnings.simplefilter("ignore", Warning) return pickle.load(f) - except Exception: # noqa: E722 + except Exception: try: return pc.load(f, encoding=None) - except Exception: # noqa: E722 + except Exception: return pc.load(f, encoding="latin1") finally: f.close() diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6af5dd6f1bf372..1ff3400323e54a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -429,10 +429,10 @@ def _is_metadata_of(group, parent_group): class HDFStore: - """ - Dict-like IO interface for storing pandas objects in PyTables - either Fixed or Table format. + Dict-like IO interface for storing pandas objects in PyTables. + + Either Fixed or Table format. Parameters ---------- @@ -564,13 +564,12 @@ def __exit__(self, exc_type, exc_value, traceback): def keys(self): """ - Return a (potentially unordered) list of the keys corresponding to the - objects stored in the HDFStore. These are ABSOLUTE path-names (e.g. - have the leading '/' + Return a list of keys corresponding to objects stored in HDFStore. Returns ------- list + List of ABSOLUTE path-names (e.g. have the leading '/'). """ return [n._v_pathname for n in self.groups()] @@ -703,7 +702,7 @@ def flush(self, fsync=False): def get(self, key): """ - Retrieve pandas object stored in file + Retrieve pandas object stored in file. Parameters ---------- @@ -711,7 +710,8 @@ def get(self, key): Returns ------- - obj : same type as object stored in file + object + Same type as object stored in file. """ group = self.get_node(key) if group is None: @@ -731,25 +731,31 @@ def select( **kwargs ): """ - Retrieve pandas object stored in file, optionally based on where - criteria + Retrieve pandas object stored in file, optionally based on where criteria. Parameters ---------- key : object - where : list of Term (or convertible) objects, optional - start : integer (defaults to None), row number to start selection - stop : integer (defaults to None), row number to stop selection - columns : a list of columns that if not None, will limit the return - columns - iterator : boolean, return an iterator, default False - chunksize : nrows to include in iteration, return an iterator - auto_close : boolean, should automatically close the store when - finished, default is False + Object being retrieved from file. + where : list, default None + List of Term (or convertible) objects, optional. + start : int, default None + Row number to start selection. + stop : int, default None + Row number to stop selection. 
+ columns : list, default None + A list of columns that if not None, will limit the return columns. + iterator : bool, default False + Returns an iterator. + chunksize : int, default None + Number or rows to include in iteration, return an iterator. + auto_close : bool, default False + Should automatically close the store when finished. Returns ------- - The selected object + object + Retrieved object from file. """ group = self.get_node(key) if group is None: @@ -929,28 +935,30 @@ def func(_start, _stop, _where): def put(self, key, value, format=None, append=False, **kwargs): """ - Store object in HDFStore + Store object in HDFStore. Parameters ---------- - key : object - value : {Series, DataFrame} - format : 'fixed(f)|table(t)', default is 'fixed' + key : object + value : {Series, DataFrame} + format : 'fixed(f)|table(t)', default is 'fixed' fixed(f) : Fixed format - Fast writing/reading. Not-appendable, nor searchable + Fast writing/reading. Not-appendable, nor searchable. table(t) : Table format Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching - / selecting subsets of the data - append : boolean, default False + / selecting subsets of the data. + append : bool, default False This will force Table format, append the input data to the existing. - data_columns : list of columns to create as data columns, or True to + data_columns : list, default None + List of columns to create as data columns, or True to use all columns. See `here `__. - encoding : default None, provide an encoding for strings - dropna : boolean, default False, do not write an ALL nan row to - the store settable by the option 'io.hdf.dropna_table' + encoding : str, default None + Provide an encoding for strings. + dropna : bool, default False, do not write an ALL nan row to + The store settable by the option 'io.hdf.dropna_table'. """ if format is None: format = get_option("io.hdf.default_format") or "fixed" @@ -1165,12 +1173,15 @@ def create_table_index(self, key, **kwargs): s.create_index(**kwargs) def groups(self): - """return a list of all the top-level nodes (that are not themselves a - pandas storage object) + """ + Return a list of all the top-level nodes. + + Each node returned is not a pandas storage object. Returns ------- list + List of objects. """ _tables() self._check_if_open() @@ -1188,10 +1199,12 @@ def groups(self): ] def walk(self, where="/"): - """ Walk the pytables group hierarchy for pandas objects + """ + Walk the pytables group hierarchy for pandas objects. This generator will yield the group path, subgroups and pandas object names for each group. + Any non-pandas PyTables objects that are not a group will be ignored. The `where` group itself is listed first (preorder), then each of its @@ -1202,18 +1215,17 @@ def walk(self, where="/"): Parameters ---------- - where : str, optional + where : str, default "/" Group where to start walking. - If not supplied, the root group is used. Yields ------ path : str - Full path to a group (without trailing '/') - groups : list of str - names of the groups contained in `path` - leaves : list of str - names of the pandas objects contained in `path` + Full path to a group (without trailing '/'). + groups : list + Names (strings) of the groups contained in `path`. + leaves : list + Names (strings) of the pandas objects contained in `path`. 
""" _tables() self._check_if_open() @@ -1533,7 +1545,6 @@ def _read_group(self, group, **kwargs): class TableIterator: - """ define the iteration interface on a table Parameters @@ -1641,7 +1652,6 @@ def get_result(self, coordinates=False): class IndexCol: - """ an index column description class Parameters @@ -1955,7 +1965,6 @@ def write_metadata(self, handler): class GenericIndexCol(IndexCol): - """ an index which is not represented in the data of the table """ @property @@ -1993,7 +2002,6 @@ def set_attr(self): class DataCol(IndexCol): - """ a data holding column, by definition this is not indexable Parameters @@ -2443,7 +2451,6 @@ def set_attr(self): class DataIndexableCol(DataCol): - """ represent a data column that can be indexed """ is_data_indexable = True @@ -2466,7 +2473,6 @@ def get_atom_timedelta64(self, block): class GenericDataIndexableCol(DataIndexableCol): - """ represent a generic pytables data column """ def get_attr(self): @@ -2474,7 +2480,6 @@ def get_attr(self): class Fixed: - """ represent an object in my store facilitate read/write of various types of objects this is an abstract base class @@ -2642,7 +2647,6 @@ def delete(self, where=None, start=None, stop=None, **kwargs): class GenericFixed(Fixed): - """ a generified fixed version """ _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"} @@ -2898,7 +2902,12 @@ def read_index_node(self, node, start=None, stop=None): kwargs["freq"] = node._v_attrs["freq"] if "tz" in node._v_attrs: - kwargs["tz"] = node._v_attrs["tz"] + if isinstance(node._v_attrs["tz"], bytes): + # created by python2 + kwargs["tz"] = node._v_attrs["tz"].decode("utf-8") + else: + # created by python3 + kwargs["tz"] = node._v_attrs["tz"] if kind in ("date", "datetime"): index = factory( @@ -3239,7 +3248,6 @@ class FrameFixed(BlockManagerFixed): class Table(Fixed): - """ represent a table: facilitate read/write of various types of tables @@ -4114,7 +4122,6 @@ def read_column(self, column, where=None, start=None, stop=None): class WORMTable(Table): - """ a write-once read-many table: this format DOES NOT ALLOW appending to a table. writing is a one-time operation the data are stored in a format that allows for searching the data on disk @@ -4136,7 +4143,6 @@ def write(self, **kwargs): class LegacyTable(Table): - """ an appendable table: allow append/query/delete operations to a (possibly) already existing appendable table this table ALLOWS append (but doesn't require them), and stores the data in a format @@ -4590,7 +4596,6 @@ def write(self, **kwargs): class AppendableMultiFrameTable(AppendableFrameTable): - """ a frame with a multi-index """ table_type = "appendable_multiframe" @@ -4949,7 +4954,6 @@ def _need_convert(kind): class Selection: - """ Carries out a selection operation on a tables.Table object. diff --git a/pandas/io/sql.py b/pandas/io/sql.py index f1f52a9198d29d..44cb399336d62f 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -269,7 +269,8 @@ def read_sql_query( parse_dates=None, chunksize=None, ): - """Read SQL query into a DataFrame. + """ + Read SQL query into a DataFrame. Returns a DataFrame corresponding to the result set of the query string. Optionally provide an `index_col` parameter to use one of the @@ -455,14 +456,14 @@ def to_sql( Parameters ---------- frame : DataFrame, Series - name : string + name : str Name of SQL table. con : SQLAlchemy connectable(engine/connection) or database string URI or sqlite3 DBAPI2 connection Using SQLAlchemy makes it possible to use any DB supported by that library. 
If a DBAPI2 object, only sqlite3 is supported. - schema : string, default None + schema : str, optional Name of SQL schema in database to write to (if database flavor supports this). If None, use default schema (default). if_exists : {'fail', 'replace', 'append'}, default 'fail' @@ -471,18 +472,19 @@ def to_sql( - append: If table exists, insert data. Create if does not exist. index : boolean, default True Write DataFrame index as a column. - index_label : string or sequence, default None + index_label : str or sequence, optional Column label for index column(s). If None is given (default) and `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. - chunksize : int, default None - If not None, then rows will be written in batches of this size at a - time. If None, all rows will be written at once. - dtype : single SQLtype or dict of column name to SQL type, default None - Optional specifying the datatype for columns. The SQL type should - be a SQLAlchemy type, or a string for sqlite3 fallback connection. - If all columns are of the same type, one single value can be used. - method : {None, 'multi', callable}, default None + chunksize : int, optional + Specify the number of rows in each batch to be written at a time. + By default, all rows will be written at once. + dtype : dict or scalar, optional + Specifying the datatype for columns. If a dictionary is used, the + keys should be the column names and the values should be the + SQLAlchemy types or strings for the sqlite3 fallback mode. If a + scalar is provided, it will be applied to all columns. + method : {None, 'multi', callable}, optional Controls the SQL insertion clause used: - None : Uses standard SQL ``INSERT`` clause (one per row). diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 69bafc77492587..31fdaa5cc67359 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -138,7 +138,7 @@ _iterator_params, ) -_data_method_doc = """\ +_data_method_doc = """ Read observations from Stata file, converting them into a dataframe .. deprecated:: diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 2e6a401b49efc4..837b01974be930 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1,22 +1,20 @@ import importlib -from typing import List, Type # noqa import warnings +from pandas._config import get_option + +from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_integer, is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -import pandas from pandas.core.base import PandasObject # Trigger matplotlib import, which implicitly registers our # converts. Implicit registration is deprecated, and when enforced # we can lazily import matplotlib. -try: - import pandas.plotting._matplotlib # noqa -except ImportError: - pass +import_optional_dependency("pandas.plotting._matplotlib", raise_on_missing=False) def hist_series( @@ -732,7 +730,7 @@ def __call__(self, *args, **kwargs): # `x` parameter, and return a Series with the parameter `y` as values. data = self._parent.copy() - if isinstance(data, pandas.core.dtypes.generic.ABCSeries): + if isinstance(data, ABCSeries): kwargs["reuse_plot"] = True if kind in self._dataframe_kinds: @@ -1576,10 +1574,18 @@ def _find_backend(backend: str): # We re-raise later on. 
pass else: - _backends[backend] = module - return module - - raise ValueError("No backend {}".format(backend)) + if hasattr(module, "plot"): + # Validate that the interface is implemented when the option + # is set, rather than at plot time. + _backends[backend] = module + return module + + msg = ( + "Could not find plotting backend '{name}'. Ensure that you've installed the " + "package providing the '{name}' entrypoint, or that the package has a" + "top-level `.plot` method." + ) + raise ValueError(msg.format(name=backend)) def _get_plot_backend(backend=None): @@ -1595,12 +1601,18 @@ def _get_plot_backend(backend=None): The backend is imported lazily, as matplotlib is a soft dependency, and pandas can be used without it being installed. """ - backend = backend or pandas.get_option("plotting.backend") + backend = backend or get_option("plotting.backend") if backend == "matplotlib": # Because matplotlib is an optional dependency and first-party backend, # we need to attempt an import here to raise an ImportError if needed. - import pandas.plotting._matplotlib as module + try: + import pandas.plotting._matplotlib as module + except ImportError: + raise ImportError( + "matplotlib is required for plotting when the " + 'default backend "matplotlib" is selected.' + ) from None _backends["matplotlib"] = module diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 893854ab26e37d..446350cb5d9152 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -329,7 +329,7 @@ def __init__(self, locator, tz=None, defaultfmt="%Y-%m-%d"): class PandasAutoDateLocator(dates.AutoDateLocator): def get_locator(self, dmin, dmax): - "Pick the best locator based on a distance." + """Pick the best locator based on a distance.""" _check_implicitly_registered() delta = relativedelta(dmax, dmin) @@ -382,6 +382,7 @@ def __call__(self): dmax, dmin = dmin, dmax # We need to cap at the endpoints of valid datetime + # FIXME: dont leave commented-out # TODO(wesm) unused? # delta = relativedelta(dmax, dmin) # try: @@ -448,6 +449,7 @@ def autoscale(self): # We need to cap at the endpoints of valid datetime + # FIXME: dont leave commented-out # TODO(wesm): unused? # delta = relativedelta(dmax, dmin) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 6ff3f284403039..346949cb82c4d0 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1,5 +1,5 @@ import re -from typing import Optional # noqa +from typing import Optional import warnings import numpy as np diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 1cba0e73541826..a8e86d9dfa997d 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -329,7 +329,8 @@ def parallel_coordinates( sort_labels=False, **kwds ): - """Parallel coordinates plotting. + """ + Parallel coordinates plotting. Parameters ---------- @@ -392,7 +393,8 @@ def parallel_coordinates( def lag_plot(series, lag=1, ax=None, **kwds): - """Lag plot for time series. + """ + Lag plot for time series. 
Parameters ---------- @@ -415,8 +417,8 @@ def autocorrelation_plot(series, ax=None, **kwds): Parameters ---------- - series: Time series - ax: Matplotlib axis object, optional + series : Time series + ax : Matplotlib axis object, optional kwds : keywords Options to pass to matplotlib plotting method diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index f047154f2c6362..774ff14398bdb4 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -190,7 +190,12 @@ def box(request): @pytest.fixture( - params=[pd.Index, pd.Series, pytest.param(pd.DataFrame, marks=pytest.mark.xfail)], + params=[ + pd.Index, + pd.Series, + pytest.param(pd.DataFrame, marks=pytest.mark.xfail), + tm.to_array, + ], ids=id_func, ) def box_df_fail(request): @@ -206,6 +211,7 @@ def box_df_fail(request): (pd.Series, False), (pd.DataFrame, False), pytest.param((pd.DataFrame, True), marks=pytest.mark.xfail), + (tm.to_array, False), ], ids=id_func, ) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 5931cd93cc8c5a..bc7b979d2c7d03 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -348,28 +348,6 @@ def test_dt64arr_timestamp_equality(self, box_with_array): expected = tm.box_expected([False, False], xbox) tm.assert_equal(result, expected) - @pytest.mark.parametrize( - "op", - [operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le], - ) - def test_comparison_tzawareness_compat(self, op): - # GH#18162 - dr = pd.date_range("2016-01-01", periods=6) - dz = dr.tz_localize("US/Pacific") - - # Check that there isn't a problem aware-aware and naive-naive do not - # raise - naive_series = Series(dr) - aware_series = Series(dz) - msg = "Cannot compare tz-naive and tz-aware" - with pytest.raises(TypeError, match=msg): - op(dz, naive_series) - with pytest.raises(TypeError, match=msg): - op(dr, aware_series) - - # TODO: implement _assert_tzawareness_compat for the reverse - # comparison with the Series on the left-hand side - class TestDatetimeIndexComparisons: @@ -599,15 +577,18 @@ def test_comparison_tzawareness_compat(self, op, box_df_fail): with pytest.raises(TypeError, match=msg): op(dz, np.array(list(dr), dtype=object)) - # Check that there isn't a problem aware-aware and naive-naive do not - # raise + # The aware==aware and naive==naive comparisons should *not* raise assert_all(dr == dr) - assert_all(dz == dz) + assert_all(dr == list(dr)) + assert_all(list(dr) == dr) + assert_all(np.array(list(dr), dtype=object) == dr) + assert_all(dr == np.array(list(dr), dtype=object)) - # FIXME: DataFrame case fails to raise for == and !=, wrong - # message for inequalities - assert (dr == list(dr)).all() - assert (dz == list(dz)).all() + assert_all(dz == dz) + assert_all(dz == list(dz)) + assert_all(list(dz) == dz) + assert_all(np.array(list(dz), dtype=object) == dz) + assert_all(dz == np.array(list(dz), dtype=object)) @pytest.mark.parametrize( "op", @@ -844,6 +825,7 @@ def test_dt64arr_isub_timedeltalike_scalar( rng -= two_hours tm.assert_equal(rng, expected) + # TODO: redundant with test_dt64arr_add_timedeltalike_scalar def test_dt64arr_add_td64_scalar(self, box_with_array): # scalar timedeltas/np.timedelta64 objects # operate with np.timedelta64 correctly @@ -1709,14 +1691,12 @@ def test_operators_datetimelike(self): dt1 - dt2 dt2 - dt1 - # ## datetime64 with timetimedelta ### + # datetime64 with timetimedelta dt1 + td1 td1 + dt1 dt1 - 
td1 - # TODO: Decide if this ought to work. - # td1 - dt1 - # ## timetimedelta with datetime64 ### + # timetimedelta with datetime64 td1 + dt1 dt1 + td1 @@ -1914,7 +1894,7 @@ def test_dt64_series_add_intlike(self, tz, op): with pytest.raises(TypeError, match=msg): method(other) with pytest.raises(TypeError, match=msg): - method(other.values) + method(np.array(other)) with pytest.raises(TypeError, match=msg): method(pd.Index(other)) @@ -2380,34 +2360,34 @@ def test_ufunc_coercions(self): idx = date_range("2011-01-01", periods=3, freq="2D", name="x") delta = np.timedelta64(1, "D") + exp = date_range("2011-01-02", periods=3, freq="2D", name="x") for result in [idx + delta, np.add(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = date_range("2011-01-02", periods=3, freq="2D", name="x") tm.assert_index_equal(result, exp) assert result.freq == "2D" + exp = date_range("2010-12-31", periods=3, freq="2D", name="x") for result in [idx - delta, np.subtract(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = date_range("2010-12-31", periods=3, freq="2D", name="x") tm.assert_index_equal(result, exp) assert result.freq == "2D" delta = np.array( [np.timedelta64(1, "D"), np.timedelta64(2, "D"), np.timedelta64(3, "D")] ) + exp = DatetimeIndex( + ["2011-01-02", "2011-01-05", "2011-01-08"], freq="3D", name="x" + ) for result in [idx + delta, np.add(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = DatetimeIndex( - ["2011-01-02", "2011-01-05", "2011-01-08"], freq="3D", name="x" - ) tm.assert_index_equal(result, exp) assert result.freq == "3D" + exp = DatetimeIndex( + ["2010-12-31", "2011-01-01", "2011-01-02"], freq="D", name="x" + ) for result in [idx - delta, np.subtract(idx, delta)]: assert isinstance(result, DatetimeIndex) - exp = DatetimeIndex( - ["2010-12-31", "2011-01-01", "2011-01-02"], freq="D", name="x" - ) tm.assert_index_equal(result, exp) assert result.freq == "D" diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index d686d9f90a5a4a..8e7e72fcdc5800 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -561,9 +561,9 @@ def test_div_int(self, numeric_idx): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("op", [operator.mul, ops.rmul, operator.floordiv]) - def test_mul_int_identity(self, op, numeric_idx, box): + def test_mul_int_identity(self, op, numeric_idx, box_with_array): idx = numeric_idx - idx = tm.box_expected(idx, box) + idx = tm.box_expected(idx, box_with_array) result = op(idx, 1) tm.assert_equal(result, idx) @@ -615,8 +615,9 @@ def test_mul_size_mismatch_raises(self, numeric_idx): idx * np.array([1, 2]) @pytest.mark.parametrize("op", [operator.pow, ops.rpow]) - def test_pow_float(self, op, numeric_idx, box): + def test_pow_float(self, op, numeric_idx, box_with_array): # test power calculations both ways, GH#14973 + box = box_with_array idx = numeric_idx expected = pd.Float64Index(op(idx.values, 2.0)) @@ -626,8 +627,9 @@ def test_pow_float(self, op, numeric_idx, box): result = op(idx, 2.0) tm.assert_equal(result, expected) - def test_modulo(self, numeric_idx, box): + def test_modulo(self, numeric_idx, box_with_array): # GH#9244 + box = box_with_array idx = numeric_idx expected = Index(idx.values % 2) @@ -1041,7 +1043,8 @@ class TestObjectDtypeEquivalence: # Tests that arithmetic operations match operations executed elementwise @pytest.mark.parametrize("dtype", [None, object]) - def test_numarr_with_dtype_add_nan(self, dtype, box): + def 
test_numarr_with_dtype_add_nan(self, dtype, box_with_array): + box = box_with_array ser = pd.Series([1, 2, 3], dtype=dtype) expected = pd.Series([np.nan, np.nan, np.nan], dtype=dtype) @@ -1055,7 +1058,8 @@ def test_numarr_with_dtype_add_nan(self, dtype, box): tm.assert_equal(result, expected) @pytest.mark.parametrize("dtype", [None, object]) - def test_numarr_with_dtype_add_int(self, dtype, box): + def test_numarr_with_dtype_add_int(self, dtype, box_with_array): + box = box_with_array ser = pd.Series([1, 2, 3], dtype=dtype) expected = pd.Series([2, 3, 4], dtype=dtype) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index fd9db806713603..f9c1de115b3a4f 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -89,7 +89,7 @@ def test_pow_ops_object(self): @pytest.mark.parametrize("op", [operator.add, ops.radd]) @pytest.mark.parametrize("other", ["category", "Int64"]) - def test_add_extension_scalar(self, other, box, op): + def test_add_extension_scalar(self, other, box_with_array, op): # GH#22378 # Check that scalars satisfying is_extension_array_dtype(obj) # do not incorrectly try to dispatch to an ExtensionArray operation @@ -97,8 +97,8 @@ def test_add_extension_scalar(self, other, box, op): arr = pd.Series(["a", "b", "c"]) expected = pd.Series([op(x, other) for x in arr]) - arr = tm.box_expected(arr, box) - expected = tm.box_expected(expected, box) + arr = tm.box_expected(arr, box_with_array) + expected = tm.box_expected(expected, box_with_array) result = op(arr, other) tm.assert_equal(result, expected) @@ -133,16 +133,17 @@ def test_objarr_radd_str(self, box): ], ) @pytest.mark.parametrize("dtype", [None, object]) - def test_objarr_radd_str_invalid(self, dtype, data, box): + def test_objarr_radd_str_invalid(self, dtype, data, box_with_array): ser = Series(data, dtype=dtype) - ser = tm.box_expected(ser, box) + ser = tm.box_expected(ser, box_with_array) with pytest.raises(TypeError): "foo_" + ser @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub]) - def test_objarr_add_invalid(self, op, box): + def test_objarr_add_invalid(self, op, box_with_array): # invalid ops + box = box_with_array obj_ser = tm.makeObjectSeries() obj_ser.name = "objects" diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 6d6b85a1e81e1c..d480b26e30fff6 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -241,10 +241,7 @@ def test_subtraction_ops(self): with pytest.raises(TypeError, match=msg): tdi - dti - msg = ( - r"descriptor '__sub__' requires a 'datetime\.datetime' object" - " but received a 'Timedelta'" - ) + msg = r"unsupported operand type\(s\) for -" with pytest.raises(TypeError, match=msg): td - dt @@ -968,71 +965,37 @@ def test_td64arr_add_datetime64_nat(self, box_with_array): # ------------------------------------------------------------------ # Operations with int-like others - def test_td64arr_add_int_series_invalid(self, box): - tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") - tdser = tm.box_expected(tdser, box) - err = TypeError if box is not pd.Index else NullFrequencyError - int_ser = Series([2, 3, 4]) - - with pytest.raises(err): - tdser + int_ser - with pytest.raises(err): - int_ser + tdser - with pytest.raises(err): - tdser - int_ser - with pytest.raises(err): - int_ser - tdser - - def test_td64arr_add_intlike(self, box_with_array): - # GH#19123 - tdi = 
TimedeltaIndex(["59 days", "59 days", "NaT"]) - ser = tm.box_expected(tdi, box_with_array) - - err = TypeError - if box_with_array in [pd.Index, tm.to_array]: - err = NullFrequencyError - - other = Series([20, 30, 40], dtype="uint8") - - # TODO: separate/parametrize - with pytest.raises(err): - ser + 1 - with pytest.raises(err): - ser - 1 - - with pytest.raises(err): - ser + other - with pytest.raises(err): - ser - other - - with pytest.raises(err): - ser + np.array(other) - with pytest.raises(err): - ser - np.array(other) - - with pytest.raises(err): - ser + pd.Index(other) - with pytest.raises(err): - ser - pd.Index(other) - - @pytest.mark.parametrize("scalar", [1, 1.5, np.array(2)]) - def test_td64arr_add_sub_numeric_scalar_invalid(self, box_with_array, scalar): + @pytest.mark.parametrize( + "other", + [ + # GH#19123 + 1, + Series([20, 30, 40], dtype="uint8"), + np.array([20, 30, 40], dtype="uint8"), + pd.UInt64Index([20, 30, 40]), + pd.Int64Index([20, 30, 40]), + Series([2, 3, 4]), + 1.5, + np.array(2), + ], + ) + def test_td64arr_addsub_numeric_invalid(self, box_with_array, other): box = box_with_array - tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") tdser = tm.box_expected(tdser, box) + err = TypeError - if box in [pd.Index, tm.to_array] and not isinstance(scalar, float): + if box in [pd.Index, tm.to_array] and not isinstance(other, float): err = NullFrequencyError with pytest.raises(err): - tdser + scalar + tdser + other with pytest.raises(err): - scalar + tdser + other + tdser with pytest.raises(err): - tdser - scalar + tdser - other with pytest.raises(err): - scalar - tdser + other - tdser @pytest.mark.parametrize( "dtype", @@ -1059,11 +1022,12 @@ def test_td64arr_add_sub_numeric_scalar_invalid(self, box_with_array, scalar): ], ids=lambda x: type(x).__name__, ) - def test_td64arr_add_sub_numeric_arr_invalid(self, box, vec, dtype): + def test_td64arr_add_sub_numeric_arr_invalid(self, box_with_array, vec, dtype): + box = box_with_array tdser = pd.Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]") tdser = tm.box_expected(tdser, box) err = TypeError - if box is pd.Index and not dtype.startswith("float"): + if box in [pd.Index, tm.to_array] and not dtype.startswith("float"): err = NullFrequencyError vector = vec.astype(dtype) @@ -1080,14 +1044,6 @@ def test_td64arr_add_sub_numeric_arr_invalid(self, box, vec, dtype): # Operations with timedelta-like others # TODO: this was taken from tests.series.test_ops; de-duplicate - @pytest.mark.parametrize( - "scalar_td", - [ - timedelta(minutes=5, seconds=4), - Timedelta(minutes=5, seconds=4), - Timedelta("5m4s").to_timedelta64(), - ], - ) def test_operators_timedelta64_with_timedelta(self, scalar_td): # smoke tests td1 = Series([timedelta(minutes=5, seconds=3)] * 3) @@ -1141,7 +1097,8 @@ def test_timedelta64_operations_with_timedeltas(self): # roundtrip tm.assert_series_equal(result + td2, td1) - def test_td64arr_add_td64_array(self, box): + def test_td64arr_add_td64_array(self, box_with_array): + box = box_with_array dti = pd.date_range("2016-01-01", periods=3) tdi = dti - dti.shift(1) tdarr = tdi.values @@ -1155,7 +1112,8 @@ def test_td64arr_add_td64_array(self, box): result = tdarr + tdi tm.assert_equal(result, expected) - def test_td64arr_sub_td64_array(self, box): + def test_td64arr_sub_td64_array(self, box_with_array): + box = box_with_array dti = pd.date_range("2016-01-01", periods=3) tdi = dti - dti.shift(1) tdarr = tdi.values @@ -1229,8 +1187,9 @@ def test_td64arr_add_sub_tdi(self, box, names): else: assert 
result.dtypes[0] == "timedelta64[ns]" - def test_td64arr_add_sub_td64_nat(self, box): + def test_td64arr_add_sub_td64_nat(self, box_with_array): # GH#23320 special handling for timedelta64("NaT") + box = box_with_array tdi = pd.TimedeltaIndex([NaT, Timedelta("1s")]) other = np.timedelta64("NaT") expected = pd.TimedeltaIndex(["NaT"] * 2) @@ -1247,8 +1206,9 @@ def test_td64arr_add_sub_td64_nat(self, box): result = other - obj tm.assert_equal(result, expected) - def test_td64arr_sub_NaT(self, box): + def test_td64arr_sub_NaT(self, box_with_array): # GH#18808 + box = box_with_array ser = Series([NaT, Timedelta("1s")]) expected = Series([NaT, NaT], dtype="timedelta64[ns]") @@ -1258,8 +1218,9 @@ def test_td64arr_sub_NaT(self, box): res = ser - pd.NaT tm.assert_equal(res, expected) - def test_td64arr_add_timedeltalike(self, two_hours, box): + def test_td64arr_add_timedeltalike(self, two_hours, box_with_array): # only test adding/sub offsets as + is now numeric + box = box_with_array rng = timedelta_range("1 days", "10 days") expected = timedelta_range("1 days 02:00:00", "10 days 02:00:00", freq="D") rng = tm.box_expected(rng, box) @@ -1268,8 +1229,9 @@ def test_td64arr_add_timedeltalike(self, two_hours, box): result = rng + two_hours tm.assert_equal(result, expected) - def test_td64arr_sub_timedeltalike(self, two_hours, box): + def test_td64arr_sub_timedeltalike(self, two_hours, box_with_array): # only test adding/sub offsets as - is now numeric + box = box_with_array rng = timedelta_range("1 days", "10 days") expected = timedelta_range("0 days 22:00:00", "9 days 22:00:00") @@ -1352,8 +1314,9 @@ def test_td64arr_add_offset_index(self, names, box): # TODO: combine with test_td64arr_add_offset_index by parametrizing # over second box? - def test_td64arr_add_offset_array(self, box): + def test_td64arr_add_offset_array(self, box_with_array): # GH#18849 + box = box_with_array tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"]) other = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)]) @@ -1433,13 +1396,12 @@ def test_td64arr_with_offset_series(self, names, box_df_fail): # GH#18849 box = box_df_fail box2 = Series if box in [pd.Index, tm.to_array] else box + exname = names[2] if box is not tm.to_array else names[1] tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) other = Series([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) - expected_add = Series( - [tdi[n] + other[n] for n in range(len(tdi))], name=names[2] - ) + expected_add = Series([tdi[n] + other[n] for n in range(len(tdi))], name=exname) tdi = tm.box_expected(tdi, box) expected_add = tm.box_expected(expected_add, box2) @@ -1452,9 +1414,7 @@ def test_td64arr_with_offset_series(self, names, box_df_fail): tm.assert_equal(res2, expected_add) # TODO: separate/parametrize add/sub test? 
- expected_sub = Series( - [tdi[n] - other[n] for n in range(len(tdi))], name=names[2] - ) + expected_sub = Series([tdi[n] - other[n] for n in range(len(tdi))], name=exname) expected_sub = tm.box_expected(expected_sub, box2) with tm.assert_produces_warning(PerformanceWarning): @@ -2055,6 +2015,8 @@ def test_td64arr_div_numeric_array(self, box_with_array, vector, dtype): def test_td64arr_mul_int_series(self, box_df_fail, names): # GH#19042 test for correct name attachment box = box_df_fail # broadcasts along wrong axis, but doesn't raise + exname = names[2] if box is not tm.to_array else names[1] + tdi = TimedeltaIndex( ["0days", "1day", "2days", "3days", "4days"], name=names[0] ) @@ -2064,11 +2026,11 @@ def test_td64arr_mul_int_series(self, box_df_fail, names): expected = Series( ["0days", "1day", "4days", "9days", "16days"], dtype="timedelta64[ns]", - name=names[2], + name=exname, ) tdi = tm.box_expected(tdi, box) - box = Series if (box is pd.Index and type(ser) is Series) else box + box = Series if (box is pd.Index or box is tm.to_array) else box expected = tm.box_expected(expected, box) result = ser * tdi @@ -2119,7 +2081,11 @@ def test_float_series_rdiv_td64arr(self, box_with_array, names): tm.assert_equal(result, expected) -class TestTimedeltaArraylikeInvalidArithmeticOps: +class TestTimedelta64ArrayLikeArithmetic: + # Arithmetic tests for timedelta64[ns] vectors fully parametrized over + # DataFrame/Series/TimedeltaIndex/TimedeltaArray. Ideally all arithmetic + # tests will eventually end up here. + def test_td64arr_pow_invalid(self, scalar_td, box_with_array): td1 = Series([timedelta(minutes=5, seconds=3)] * 3) td1.iloc[2] = np.nan diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 6a86289b6fcc60..655a6e717119b1 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -93,8 +93,13 @@ def test_set_na(self, left_right_dtypes): tm.assert_extension_array_equal(result, expected) -def test_repr_matches(): - idx = IntervalIndex.from_breaks([1, 2, 3]) - a = repr(idx) - b = repr(idx.values) - assert a.replace("Index", "Array") == b +def test_repr(): + # GH 25022 + arr = IntervalArray.from_tuples([(0, 1), (1, 2)]) + result = repr(arr) + expected = ( + "\n" + "[(0, 1], (1, 2]]\n" + "Length: 2, closed: right, dtype: interval[int64]" + ) + assert result == expected diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index c500760fa1390a..b6ffd8a83e409d 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -14,7 +14,7 @@ from pandas.core.dtypes.common import is_bool, is_list_like, is_scalar import pandas as pd -from pandas import DataFrame, Series, date_range +from pandas import DataFrame, Series, compat, date_range from pandas.core.computation import pytables from pandas.core.computation.check import _NUMEXPR_VERSION from pandas.core.computation.engines import NumExprClobberingError, _engines @@ -1267,7 +1267,10 @@ def test_assignment_column(self): msg = "left hand side of an assignment must be a single name" with pytest.raises(SyntaxError, match=msg): df.eval("d,c = a + b") - msg = "can't assign to function call" + if compat.PY38: + msg = "cannot assign to function call" + else: + msg = "can't assign to function call" with pytest.raises(SyntaxError, match=msg): df.eval('Timestamp("20131001") = a + b') @@ -1967,6 +1970,26 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): 
pd.eval(ex, engine=engine, parser=parser) +@pytest.mark.parametrize( + "other", + [ + "'x'", + pytest.param( + "...", marks=pytest.mark.xfail(not compat.PY38, reason="GH-28116") + ), + ], +) +def test_equals_various(other): + df = DataFrame({"A": ["a", "b", "c"]}) + result = df.eval("A == {}".format(other)) + expected = Series([False, False, False], name="A") + if _USE_NUMEXPR: + # https://github.com/pandas-dev/pandas/issues/10239 + # lose name with numexpr engine. Remove when that's fixed. + expected.name = None + tm.assert_series_equal(result, expected) + + def test_inf(engine, parser): s = "inf + 1" expected = np.inf diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 2862615ef8585a..b341ed6a52ca57 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -1069,18 +1069,24 @@ def test_replace_truthy(self): e = df assert_frame_equal(r, e) - def test_replace_int_to_int_chain(self): + def test_nested_dict_overlapping_keys_replace_int(self): + # GH 27660 keep behaviour consistent for simple dictionary and + # nested dictionary replacement df = DataFrame({"a": list(range(1, 5))}) - with pytest.raises(ValueError, match="Replacement not allowed .+"): - df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) - def test_replace_str_to_str_chain(self): + result = df.replace({"a": dict(zip(range(1, 5), range(2, 6)))}) + expected = df.replace(dict(zip(range(1, 5), range(2, 6)))) + assert_frame_equal(result, expected) + + def test_nested_dict_overlapping_keys_replace_str(self): + # GH 27660 a = np.arange(1, 5) astr = a.astype(str) bstr = np.arange(2, 6).astype(str) df = DataFrame({"a": astr}) - with pytest.raises(ValueError, match="Replacement not allowed .+"): - df.replace({"a": dict(zip(astr, bstr))}) + result = df.replace(dict(zip(astr, bstr))) + expected = df.replace({"a": dict(zip(astr, bstr))}) + assert_frame_equal(result, expected) def test_replace_swapping_bug(self): df = pd.DataFrame({"a": [True, False, True]}) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index f3452e9a85fb3e..84e343f07f990d 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -984,7 +984,7 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels): df = DataFrame([[10, 11, 12]], columns=cidx) result = df.stack() - # `MutliIndex.from_product` preserves categorical dtype - + # `MultiIndex.from_product` preserves categorical dtype - # it's tested elsewhere. 
midx = pd.MultiIndex.from_product([df.index, cidx]) expected = Series([10, 11, 12], index=midx) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index e2e4a82ff581cf..8fb028a0f0326d 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -695,6 +695,20 @@ def _make_frame(names=None): tm.assert_index_equal(recons.columns, exp.columns) assert len(recons) == 0 + def test_to_csv_interval_index(self): + # GH 28210 + df = DataFrame({"A": list("abc"), "B": range(3)}, index=pd.interval_range(0, 3)) + + with ensure_clean("__tmp_to_csv_interval_index__.csv") as path: + df.to_csv(path) + result = self.read_csv(path, index_col=0) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = df.copy() + expected.index = expected.index.astype(str) + + assert_frame_equal(result, expected) + def test_to_csv_float32_nanrep(self): df = DataFrame(np.random.randn(1, 4).astype(np.float32)) df[1] = np.nan diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 52d4fa76bf8794..aa80c461a00e79 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, compat, concat from pandas.core.base import SpecificationError -from pandas.core.groupby.generic import _maybe_mangle_lambdas +from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping import pandas.util.testing as tm @@ -560,3 +560,150 @@ def test_with_kwargs(self): result = pd.Series([1, 2]).groupby([0, 0]).agg([f1, f2], 0, b=10) expected = pd.DataFrame({"": [13], "": [30]}) tm.assert_frame_equal(result, expected) + + def test_agg_with_one_lambda(self): + # GH 25719, write tests for DataFrameGroupby.agg with only one lambda + df = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) + + # sort for 35 and earlier + columns = ["height_sqr_min", "height_max", "weight_max"] + if compat.PY35: + columns = ["height_max", "height_sqr_min", "weight_max"] + expected = pd.DataFrame( + { + "height_sqr_min": [82.81, 36.00], + "height_max": [9.5, 34.0], + "weight_max": [9.9, 198.0], + }, + index=pd.Index(["cat", "dog"], name="kind"), + columns=columns, + ) + + # check pd.NameAgg case + result1 = df.groupby(by="kind").agg( + height_sqr_min=pd.NamedAgg( + column="height", aggfunc=lambda x: np.min(x ** 2) + ), + height_max=pd.NamedAgg(column="height", aggfunc="max"), + weight_max=pd.NamedAgg(column="weight", aggfunc="max"), + ) + tm.assert_frame_equal(result1, expected) + + # check agg(key=(col, aggfunc)) case + result2 = df.groupby(by="kind").agg( + height_sqr_min=("height", lambda x: np.min(x ** 2)), + height_max=("height", "max"), + weight_max=("weight", "max"), + ) + tm.assert_frame_equal(result2, expected) + + def test_agg_multiple_lambda(self): + # GH25719, test for DataFrameGroupby.agg with multiple lambdas + # with mixed aggfunc + df = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) + # sort for 35 and earlier + columns = [ + "height_sqr_min", + "height_max", + "weight_max", + "height_max_2", + "weight_min", + ] + if compat.PY35: + columns = [ + "height_max", + "height_max_2", + "height_sqr_min", + "weight_max", + "weight_min", 
+ ] + expected = pd.DataFrame( + { + "height_sqr_min": [82.81, 36.00], + "height_max": [9.5, 34.0], + "weight_max": [9.9, 198.0], + "height_max_2": [9.5, 34.0], + "weight_min": [7.9, 7.5], + }, + index=pd.Index(["cat", "dog"], name="kind"), + columns=columns, + ) + + # check agg(key=(col, aggfunc)) case + result1 = df.groupby(by="kind").agg( + height_sqr_min=("height", lambda x: np.min(x ** 2)), + height_max=("height", "max"), + weight_max=("weight", "max"), + height_max_2=("height", lambda x: np.max(x)), + weight_min=("weight", lambda x: np.min(x)), + ) + tm.assert_frame_equal(result1, expected) + + # check pd.NamedAgg case + result2 = df.groupby(by="kind").agg( + height_sqr_min=pd.NamedAgg( + column="height", aggfunc=lambda x: np.min(x ** 2) + ), + height_max=pd.NamedAgg(column="height", aggfunc="max"), + weight_max=pd.NamedAgg(column="weight", aggfunc="max"), + height_max_2=pd.NamedAgg(column="height", aggfunc=lambda x: np.max(x)), + weight_min=pd.NamedAgg(column="weight", aggfunc=lambda x: np.min(x)), + ) + tm.assert_frame_equal(result2, expected) + + @pytest.mark.parametrize( + "order, expected_reorder", + [ + ( + [ + ("height", ""), + ("height", "max"), + ("weight", "max"), + ("height", ""), + ("weight", ""), + ], + [ + ("height", "_0"), + ("height", "max"), + ("weight", "max"), + ("height", "_1"), + ("weight", ""), + ], + ), + ( + [ + ("col2", "min"), + ("col1", ""), + ("col1", ""), + ("col1", ""), + ], + [ + ("col2", "min"), + ("col1", "_0"), + ("col1", "_1"), + ("col1", "_2"), + ], + ), + ( + [("col", ""), ("col", ""), ("col", "")], + [("col", "_0"), ("col", "_1"), ("col", "_2")], + ), + ], + ) + def test_make_unique(self, order, expected_reorder): + # GH 27519, test if make_unique function reorders correctly + result = _make_unique(order) + + assert result == expected_reorder diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 2195686ee9c7f6..b8f9ecd42bae3c 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -2,7 +2,7 @@ from numpy import nan import pytest -from pandas._libs import groupby, lib, reduction +from pandas._libs import groupby, lib, reduction as libreduction from pandas.core.dtypes.common import ensure_int64 @@ -18,7 +18,7 @@ def test_series_grouper(): labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) - grouper = reduction.SeriesGrouper(obj, np.mean, labels, 2, dummy) + grouper = libreduction.SeriesGrouper(obj, np.mean, labels, 2, dummy) result, counts = grouper.get_result() expected = np.array([obj[3:6].mean(), obj[6:].mean()]) @@ -34,7 +34,7 @@ def test_series_bin_grouper(): bins = np.array([3, 6]) - grouper = reduction.SeriesBinGrouper(obj, np.mean, bins, dummy) + grouper = libreduction.SeriesBinGrouper(obj, np.mean, bins, dummy) result, counts = grouper.get_result() expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) @@ -120,31 +120,31 @@ class TestMoments: class TestReducer: def test_int_index(self): arr = np.random.randn(100, 4) - result = reduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) + result = libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) expected = arr.sum(0) assert_almost_equal(result, expected) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, axis=1, labels=Index(np.arange(100)) ) expected = arr.sum(1) assert_almost_equal(result, expected) dummy = Series(0.0, index=np.arange(100)) - result = reduction.compute_reduction( + 
result = libreduction.compute_reduction( arr, np.sum, dummy=dummy, labels=Index(np.arange(4)) ) expected = arr.sum(0) assert_almost_equal(result, expected) dummy = Series(0.0, index=np.arange(4)) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) expected = arr.sum(1) assert_almost_equal(result, expected) - result = reduction.compute_reduction( + result = libreduction.compute_reduction( arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) ) assert_almost_equal(result, expected) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 509d7c33b643b5..afb22a732691cd 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1,4 +1,5 @@ import builtins +import datetime as dt from io import StringIO from itertools import product from string import ascii_lowercase @@ -9,7 +10,16 @@ from pandas.errors import UnsupportedFunctionCall import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, isna +from pandas import ( + DataFrame, + Index, + MultiIndex, + NaT, + Series, + Timestamp, + date_range, + isna, +) import pandas.core.nanops as nanops from pandas.util import _test_decorators as td, testing as tm @@ -1015,6 +1025,42 @@ def test_nunique_with_timegrouper(): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "key, data, dropna, expected", + [ + ( + ["x", "x", "x"], + [Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + True, + Series([1], index=pd.Index(["x"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "y", "y"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + False, + Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ( + ["x", "x", "x", "x", "y"], + [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)], + False, + Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"), + ), + ], +) +def test_nunique_with_NaT(key, data, dropna, expected): + # GH 27951 + df = pd.DataFrame({"key": key, "data": data}) + result = df.groupby(["key"])["data"].nunique(dropna=dropna) + tm.assert_series_equal(result, expected) + + def test_nunique_preserves_column_level_names(): # GH 23222 test = pd.DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0")) @@ -1257,6 +1303,24 @@ def test_quantile_array(): tm.assert_frame_equal(result, expected) +def test_quantile_array2(): + # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959 + df = pd.DataFrame( + np.random.RandomState(0).randint(0, 5, size=(10, 3)), columns=list("ABC") + ) + result = df.groupby("A").quantile([0.3, 0.7]) + expected = pd.DataFrame( + { + "B": [0.9, 2.1, 2.2, 3.4, 1.6, 2.4, 2.3, 2.7, 0.0, 0.0], + "C": [1.2, 2.8, 1.8, 3.0, 0.0, 0.0, 1.9, 3.1, 3.0, 3.0], + }, + index=pd.MultiIndex.from_product( + [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None] + ), + ) + tm.assert_frame_equal(result, expected) + + def test_quantile_array_no_sort(): df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75]) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4556b22b572797..bec5cbc5fecb8b 100644 --- a/pandas/tests/groupby/test_groupby.py 
+++ b/pandas/tests/groupby/test_groupby.py @@ -1882,3 +1882,69 @@ def test_groupby_axis_1(group_name): results = df.groupby(group_name, axis=1).sum() expected = df.T.groupby(group_name).sum().T assert_frame_equal(results, expected) + + +@pytest.mark.parametrize( + "op, expected", + [ + ( + "shift", + { + "time": [ + None, + None, + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + None, + None, + ] + }, + ), + ( + "bfill", + { + "time": [ + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + Timestamp("2019-01-01 14:00:00"), + Timestamp("2019-01-01 14:30:00"), + Timestamp("2019-01-01 14:00:00"), + Timestamp("2019-01-01 14:30:00"), + ] + }, + ), + ( + "ffill", + { + "time": [ + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + Timestamp("2019-01-01 14:00:00"), + Timestamp("2019-01-01 14:30:00"), + ] + }, + ), + ], +) +def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected): + # GH19995, GH27992: Check that timezone does not drop in shift, bfill, and ffill + tz = tz_naive_fixture + data = { + "id": ["A", "B", "A", "B", "A", "B"], + "time": [ + Timestamp("2019-01-01 12:00:00"), + Timestamp("2019-01-01 12:30:00"), + None, + None, + Timestamp("2019-01-01 14:00:00"), + Timestamp("2019-01-01 14:30:00"), + ], + } + df = DataFrame(data).assign(time=lambda x: x.time.dt.tz_localize(tz)) + + grouped = df.groupby("id") + result = getattr(grouped, op)() + expected = DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz)) + assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index c1a21e6a7f1527..eeb0f43f4b9003 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -417,6 +417,46 @@ def test_repr_missing(self, constructor, expected): result = repr(obj) assert result == expected + @pytest.mark.parametrize( + "tuples, closed, expected_data", + [ + ([(0, 1), (1, 2), (2, 3)], "left", ["[0, 1)", "[1, 2)", "[2, 3)"]), + ( + [(0.5, 1.0), np.nan, (2.0, 3.0)], + "right", + ["(0.5, 1.0]", "NaN", "(2.0, 3.0]"], + ), + ( + [ + (Timestamp("20180101"), Timestamp("20180102")), + np.nan, + ((Timestamp("20180102"), Timestamp("20180103"))), + ], + "both", + ["[2018-01-01, 2018-01-02]", "NaN", "[2018-01-02, 2018-01-03]"], + ), + ( + [ + (Timedelta("0 days"), Timedelta("1 days")), + (Timedelta("1 days"), Timedelta("2 days")), + np.nan, + ], + "neither", + [ + "(0 days 00:00:00, 1 days 00:00:00)", + "(1 days 00:00:00, 2 days 00:00:00)", + "NaN", + ], + ), + ], + ) + def test_to_native_types(self, tuples, closed, expected_data): + # GH 28210 + index = IntervalIndex.from_tuples(tuples, closed=closed) + result = index.to_native_types() + expected = np.array(expected_data) + tm.assert_numpy_array_equal(result, expected) + def test_get_item(self, closed): i = IntervalIndex.from_arrays((0, 1, np.nan), (1, 2, np.nan), closed=closed) assert i[0] == Interval(0.0, 1.0, closed=closed) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index abe0cd86c90d7d..9845b1ac3a4b9a 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1070,6 +1070,16 @@ def test_series_indexing_zerodim_np_array(self): result = s.loc[np.array(0)] assert result == 1 + def test_loc_reverse_assignment(self): + # GH26939 + data = [1, 2, 3, 4, 5, 6] + [None] * 4 + expected = Series(data, 
index=range(2010, 2020)) + + result = pd.Series(index=range(2010, 2020)) + result.loc[2015:2010:-1] = [6, 5, 4, 3, 2, 1] + + tm.assert_series_equal(result, expected) + def test_series_loc_getitem_label_list_missing_values(): # gh-11428 diff --git a/pandas/tests/io/data/legacy_hdf/gh26443.h5 b/pandas/tests/io/data/legacy_hdf/gh26443.h5 new file mode 100644 index 00000000000000..45aa64324530f9 Binary files /dev/null and b/pandas/tests/io/data/legacy_hdf/gh26443.h5 differ diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index a39cface0e0157..5326f2df68972f 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -3,6 +3,7 @@ from datetime import datetime, time from functools import partial import os +from urllib.error import URLError import warnings import numpy as np @@ -14,8 +15,6 @@ from pandas import DataFrame, Index, MultiIndex, Series import pandas.util.testing as tm -from pandas.io.common import URLError - @contextlib.contextmanager def ignore_xlrd_time_clock_warning(): diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html new file mode 100644 index 00000000000000..4eb3f5319749d9 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_12.html @@ -0,0 +1,70 @@ +
[elided HTML table markup; recoverable content: column "a", rows 0-4, "...", rows 56-60, footer "61 rows × 1 columns"]
diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html new file mode 100644 index 00000000000000..2b1d97aec517c5 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_10_min_rows_4.html @@ -0,0 +1,46 @@ +
[elided HTML table markup; recoverable content: column "a", rows 0-1, "...", rows 59-60, footer "61 rows × 1 columns"]
diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html new file mode 100644 index 00000000000000..a539e5a4884a12 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_12_min_rows_None.html @@ -0,0 +1,78 @@ +
[elided HTML table markup; recoverable content: column "a", rows 0-5, "...", rows 55-60, footer "61 rows × 1 columns"]
diff --git a/pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html b/pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html new file mode 100644 index 00000000000000..3e680a505c6d68 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_max_rows_None_min_rows_12.html @@ -0,0 +1,269 @@ +
[elided HTML table markup; recoverable content: column "a", all rows 0-60, no truncation, no dimensions footer]
diff --git a/pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html new file mode 100644 index 00000000000000..10f6247e37deff --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_no_truncation.html @@ -0,0 +1,105 @@ +
[elided HTML table markup; recoverable content: column "a", all rows 0-19, no truncation, no dimensions footer]
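Not part of the patch: a minimal sketch, assuming a pandas build that already exposes the display.min_rows option, of how reprs like the fixture files above and below are produced and compared in the test_to_html tests further down.

import pandas as pd

# Same shape as the fixtures: 61 rows, a single column named "a".
df = pd.DataFrame({"a": range(61)})

# max_rows=10, min_rows=4: 61 > max_rows triggers truncation, and min_rows
# caps the display at 2 head + 2 tail rows (rows 0, 1, ..., 59, 60).
with pd.option_context("display.max_rows", 10, "display.min_rows", 4):
    truncated_html = df._repr_html_()

# max_rows=None: never truncate, so all 61 rows are rendered.
with pd.option_context("display.max_rows", None, "display.min_rows", 12):
    full_html = df._repr_html_()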
diff --git a/pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html new file mode 100644 index 00000000000000..4eb3f5319749d9 --- /dev/null +++ b/pandas/tests/io/formats/data/html/html_repr_min_rows_default_truncated.html @@ -0,0 +1,70 @@ +
[elided HTML table markup; recoverable content: column "a", rows 0-4, "...", rows 56-60, footer "61 rows × 1 columns"]
diff --git a/pandas/tests/io/formats/test_console.py b/pandas/tests/io/formats/test_console.py index f4bee99296a834..e56d14885f11e3 100644 --- a/pandas/tests/io/formats/test_console.py +++ b/pandas/tests/io/formats/test_console.py @@ -1,3 +1,5 @@ +import locale + import pytest from pandas._config import detect_console_encoding @@ -50,11 +52,11 @@ def test_detect_console_encoding_fallback_to_locale(monkeypatch, encoding): "std,locale", [ ["ascii", "ascii"], - ["ascii", Exception], + ["ascii", locale.Error], [AttributeError, "ascii"], - [AttributeError, Exception], + [AttributeError, locale.Error], [IOError, "ascii"], - [IOError, Exception], + [IOError, locale.Error], ], ) def test_detect_console_encoding_fallback_to_default(monkeypatch, std, locale): diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index ee236a8253b01a..ab44b8b8059eb4 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -514,3 +514,44 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer): df.to_csv(path, compression=to_compression) result = pd.read_csv(path, index_col=0, compression=read_compression) tm.assert_frame_equal(result, df) + + def test_to_csv_compression_dict(self, compression_only): + # GH 26023 + method = compression_only + df = DataFrame({"ABC": [1]}) + filename = "to_csv_compress_as_dict." + filename += "gz" if method == "gzip" else method + with tm.ensure_clean(filename) as path: + df.to_csv(path, compression={"method": method}) + read_df = pd.read_csv(path, index_col=0) + tm.assert_frame_equal(read_df, df) + + def test_to_csv_compression_dict_no_method_raises(self): + # GH 26023 + df = DataFrame({"ABC": [1]}) + compression = {"some_option": True} + msg = "must have key 'method'" + + with tm.ensure_clean("out.zip") as path: + with pytest.raises(ValueError, match=msg): + df.to_csv(path, compression=compression) + + @pytest.mark.parametrize("compression", ["zip", "infer"]) + @pytest.mark.parametrize( + "archive_name", [None, "test_to_csv.csv", "test_to_csv.zip"] + ) + def test_to_csv_zip_arguments(self, compression, archive_name): + # GH 26023 + from zipfile import ZipFile + + df = DataFrame({"ABC": [1]}) + with tm.ensure_clean("to_csv_archive_name.zip") as path: + df.to_csv( + path, compression={"method": compression, "archive_name": archive_name} + ) + zp = ZipFile(path) + expected_arcname = path if archive_name is None else archive_name + expected_arcname = os.path.basename(expected_arcname) + assert len(zp.filelist) == 1 + archived_file = os.path.basename(zp.filelist[0].filename) + assert archived_file == expected_arcname diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 448e869df950dd..52c7b89220f06b 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -713,3 +713,42 @@ def test_to_html_with_col_space_units(unit): for h in hdrs: expected = ''.format(unit=unit) assert expected in h + + +def test_html_repr_min_rows_default(datapath): + # gh-27991 + + # default setting no truncation even if above min_rows + df = pd.DataFrame({"a": range(20)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_no_truncation") + assert result == expected + + # default of max_rows 60 triggers truncation if above + df = pd.DataFrame({"a": range(61)}) + result = df._repr_html_() + expected = expected_html(datapath, "html_repr_min_rows_default_truncated") + assert result == 
expected + + +@pytest.mark.parametrize( + "max_rows,min_rows,expected", + [ + # truncated after first two rows + (10, 4, "html_repr_max_rows_10_min_rows_4"), + # when set to None, follow value of max_rows + (12, None, "html_repr_max_rows_12_min_rows_None"), + # when set value higher as max_rows, use the minimum + (10, 12, "html_repr_max_rows_10_min_rows_12"), + # max_rows of None -> never truncate + (None, 12, "html_repr_max_rows_None_min_rows_12"), + ], +) +def test_html_repr_min_rows(datapath, max_rows, min_rows, expected): + # gh-27991 + + df = pd.DataFrame({"a": range(61)}) + expected = expected_html(datapath, expected) + with option_context("display.max_rows", max_rows, "display.min_rows", min_rows): + result = df._repr_html_() + assert result == expected diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 924b2a19e85046..9ffb54d23e37e3 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -388,8 +388,7 @@ def test_to_latex_special_escape(self): """ assert escaped_result == escaped_expected - def test_to_latex_longtable(self, float_frame): - float_frame.to_latex(longtable=True) + def test_to_latex_longtable(self): df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) withindex_result = df.to_latex(longtable=True) @@ -439,6 +438,141 @@ def test_to_latex_longtable(self, float_frame): with3columns_result = df.to_latex(index=False, longtable=True) assert r"\multicolumn{3}" in with3columns_result + def test_to_latex_caption_label(self): + # GH 25436 + the_caption = "a table in a \\texttt{table/tabular} environment" + the_label = "tab:table_tabular" + + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + + # test when only the caption is provided + result_c = df.to_latex(caption=the_caption) + + expected_c = r"""\begin{table} +\centering +\caption{a table in a \texttt{table/tabular} environment} +\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +\end{table} +""" + assert result_c == expected_c + + # test when only the label is provided + result_l = df.to_latex(label=the_label) + + expected_l = r"""\begin{table} +\centering +\label{tab:table_tabular} +\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +\end{table} +""" + assert result_l == expected_l + + # test when the caption and the label are provided + result_cl = df.to_latex(caption=the_caption, label=the_label) + + expected_cl = r"""\begin{table} +\centering +\caption{a table in a \texttt{table/tabular} environment} +\label{tab:table_tabular} +\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +\end{table} +""" + assert result_cl == expected_cl + + def test_to_latex_longtable_caption_label(self): + # GH 25436 + the_caption = "a table in a \\texttt{longtable} environment" + the_label = "tab:longtable" + + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + + # test when only the caption is provided + result_c = df.to_latex(longtable=True, caption=the_caption) + + expected_c = r"""\begin{longtable}{lrl} +\caption{a table in a \texttt{longtable} environment}\\ +\toprule +{} & a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\end{longtable} +""" + assert result_c == expected_c + + # test when only the label is provided + result_l = 
df.to_latex(longtable=True, label=the_label) + + expected_l = r"""\begin{longtable}{lrl} +\label{tab:longtable}\\ +\toprule +{} & a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\end{longtable} +""" + assert result_l == expected_l + + # test when the caption and the label are provided + result_cl = df.to_latex(longtable=True, caption=the_caption, label=the_label) + + expected_cl = r"""\begin{longtable}{lrl} +\caption{a table in a \texttt{longtable} environment}\label{tab:longtable}\\ +\toprule +{} & a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\end{longtable} +""" + assert result_cl == expected_cl + def test_to_latex_escape_special_chars(self): special_characters = ["&", "%", "$", "#", "_", "{", "}", "~", "^", "\\"] df = DataFrame(data=special_characters) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 9c687f036aa684..9842a706f43d78 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1012,60 +1012,70 @@ def test_convert_dates_infer(self): result = read_json(dumps(data))[["id", infer_word]] assert_frame_equal(result, expected) - def test_date_format_frame(self): + @pytest.mark.parametrize( + "date,date_unit", + [ + ("20130101 20:43:42.123", None), + ("20130101 20:43:42", "s"), + ("20130101 20:43:42.123", "ms"), + ("20130101 20:43:42.123456", "us"), + ("20130101 20:43:42.123456789", "ns"), + ], + ) + def test_date_format_frame(self, date, date_unit): df = self.tsframe.copy() - def test_w_date(date, date_unit=None): - df["date"] = Timestamp(date) - df.iloc[1, df.columns.get_loc("date")] = pd.NaT - df.iloc[5, df.columns.get_loc("date")] = pd.NaT - if date_unit: - json = df.to_json(date_format="iso", date_unit=date_unit) - else: - json = df.to_json(date_format="iso") - result = read_json(json) - expected = df.copy() - expected.index = expected.index.tz_localize("UTC") - expected["date"] = expected["date"].dt.tz_localize("UTC") - assert_frame_equal(result, expected) - - test_w_date("20130101 20:43:42.123") - test_w_date("20130101 20:43:42", date_unit="s") - test_w_date("20130101 20:43:42.123", date_unit="ms") - test_w_date("20130101 20:43:42.123456", date_unit="us") - test_w_date("20130101 20:43:42.123456789", date_unit="ns") + df["date"] = Timestamp(date) + df.iloc[1, df.columns.get_loc("date")] = pd.NaT + df.iloc[5, df.columns.get_loc("date")] = pd.NaT + if date_unit: + json = df.to_json(date_format="iso", date_unit=date_unit) + else: + json = df.to_json(date_format="iso") + result = read_json(json) + expected = df.copy() + # expected.index = expected.index.tz_localize("UTC") + expected["date"] = expected["date"].dt.tz_localize("UTC") + assert_frame_equal(result, expected) + def test_date_format_frame_raises(self): + df = self.tsframe.copy() msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): df.to_json(date_format="iso", date_unit="foo") - def test_date_format_series(self): - def test_w_date(date, date_unit=None): - ts = Series(Timestamp(date), index=self.ts.index) - ts.iloc[1] = pd.NaT - ts.iloc[5] = pd.NaT - if date_unit: - json = ts.to_json(date_format="iso", date_unit=date_unit) - else: - json = ts.to_json(date_format="iso") - result = read_json(json, typ="series") - expected = ts.copy() - expected.index = 
expected.index.tz_localize("UTC") - expected = expected.dt.tz_localize("UTC") - assert_series_equal(result, expected) - - test_w_date("20130101 20:43:42.123") - test_w_date("20130101 20:43:42", date_unit="s") - test_w_date("20130101 20:43:42.123", date_unit="ms") - test_w_date("20130101 20:43:42.123456", date_unit="us") - test_w_date("20130101 20:43:42.123456789", date_unit="ns") + @pytest.mark.parametrize( + "date,date_unit", + [ + ("20130101 20:43:42.123", None), + ("20130101 20:43:42", "s"), + ("20130101 20:43:42.123", "ms"), + ("20130101 20:43:42.123456", "us"), + ("20130101 20:43:42.123456789", "ns"), + ], + ) + def test_date_format_series(self, date, date_unit): + ts = Series(Timestamp(date), index=self.ts.index) + ts.iloc[1] = pd.NaT + ts.iloc[5] = pd.NaT + if date_unit: + json = ts.to_json(date_format="iso", date_unit=date_unit) + else: + json = ts.to_json(date_format="iso") + result = read_json(json, typ="series") + expected = ts.copy() + # expected.index = expected.index.tz_localize("UTC") + expected = expected.dt.tz_localize("UTC") + assert_series_equal(result, expected) + def test_date_format_series_raises(self): ts = Series(Timestamp("20130101 20:43:42.123"), index=self.ts.index) msg = "Invalid value 'foo' for option 'date_unit'" with pytest.raises(ValueError, match=msg): ts.to_json(date_format="iso", date_unit="foo") - def test_date_unit(self): + @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) + def test_date_unit(self, unit): df = self.tsframe.copy() df["date"] = Timestamp("20130101 20:43:42") dl = df.columns.get_loc("date") @@ -1073,16 +1083,15 @@ def test_date_unit(self): df.iloc[2, dl] = Timestamp("21460101 20:43:42") df.iloc[4, dl] = pd.NaT - for unit in ("s", "ms", "us", "ns"): - json = df.to_json(date_format="epoch", date_unit=unit) + json = df.to_json(date_format="epoch", date_unit=unit) - # force date unit - result = read_json(json, date_unit=unit) - assert_frame_equal(result, df) + # force date unit + result = read_json(json, date_unit=unit) + assert_frame_equal(result, df) - # detect date unit - result = read_json(json, date_unit=None) - assert_frame_equal(result, df) + # detect date unit + result = read_json(json, date_unit=None) + assert_frame_equal(result, df) def test_weird_nested_json(self): # this used to core dump the parser @@ -1611,3 +1620,30 @@ def test_read_timezone_information(self): ) expected = Series([88], index=DatetimeIndex(["2019-01-01 11:00:00"], tz="UTC")) assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "date_format,key", [("epoch", 86400000), ("iso", "P1DT0H0M0S")] + ) + def test_timedelta_as_label(self, date_format, key): + df = pd.DataFrame([[1]], columns=[pd.Timedelta("1D")]) + expected = '{{"{key}":{{"0":1}}}}'.format(key=key) + result = df.to_json(date_format=date_format) + + assert result == expected + + @pytest.mark.parametrize( + "orient,expected", + [ + ("index", "{\"('a', 'b')\":{\"('c', 'd')\":1}}"), + ("columns", "{\"('c', 'd')\":{\"('a', 'b')\":1}}"), + # TODO: the below have separate encoding procedures + # They produce JSON but not in a consistent manner + pytest.param("split", "", marks=pytest.mark.skip), + pytest.param("table", "", marks=pytest.mark.skip), + ], + ) + def test_tuple_labels(self, orient, expected): + # GH 20500 + df = pd.DataFrame([[1]], index=[("a", "b")], columns=[("c", "d")]) + result = df.to_json(orient=orient) + assert result == expected diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e5366a8357adbc..756463e9d8d335 100644 --- 
a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -11,6 +11,7 @@ import os import platform from tempfile import TemporaryFile +from urllib.error import URLError import numpy as np import pytest @@ -21,7 +22,6 @@ from pandas import DataFrame, Index, MultiIndex, Series, compat, concat import pandas.util.testing as tm -from pandas.io.common import URLError from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser @@ -1865,6 +1865,23 @@ def test_inf_parsing(all_parsers, na_filter): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("na_filter", [True, False]) +def test_infinity_parsing(all_parsers, na_filter): + parser = all_parsers + data = """\ +,A +a,Infinity +b,-Infinity +c,+Infinity +""" + expected = DataFrame( + {"A": [float("infinity"), float("-infinity"), float("+infinity")]}, + index=["a", "b", "c"], + ) + result = parser.read_csv(StringIO(data), index_col=0, na_filter=na_filter) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("nrows", [0, 1, 2, 3, 4, 5]) def test_raise_on_no_columns(all_parsers, nrows): parser = all_parsers @@ -1898,7 +1915,10 @@ def test_null_byte_char(all_parsers): out = parser.read_csv(StringIO(data), names=names) tm.assert_frame_equal(out, expected) else: - msg = "NULL byte detected" + if compat.PY38: + msg = "line contains NUL" + else: + msg = "NULL byte detected" with pytest.raises(ParserError, match=msg): parser.read_csv(StringIO(data), names=names) diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index d67f2c3b7bd66e..77cac00882771f 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -37,7 +37,6 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, assert_series_equal, set_timezone -from pandas.io import pytables as pytables # noqa:E402 from pandas.io.formats.printing import pprint_thing from pandas.io.pytables import ( ClosedFileError, @@ -46,7 +45,9 @@ Term, read_hdf, ) -from pandas.io.pytables import TableIterator # noqa:E402 + +from pandas.io import pytables as pytables # noqa: E402 isort:skip +from pandas.io.pytables import TableIterator # noqa: E402 isort:skip tables = pytest.importorskip("tables") @@ -5446,3 +5447,16 @@ def test_read_with_where_tz_aware_index(self): store.append(key, expected, format="table", append=True) result = pd.read_hdf(path, key, where="DATE > 20151130") assert_frame_equal(result, expected) + + def test_py2_created_with_datetimez(self, datapath): + # The test HDF5 file was created in Python 2, but could not be read in + # Python 3. 
+ # + # GH26443 + index = [pd.Timestamp("2019-01-01T18:00").tz_localize("America/New_York")] + expected = DataFrame({"data": 123}, index=index) + with ensure_clean_store( + datapath("io", "data", "legacy_hdf", "gh26443.h5"), mode="r" + ) as store: + result = store["key"] + assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 16ca1109f266cc..d68b6a1effaa0a 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,6 +1,7 @@ import contextlib import os import subprocess +import sys import textwrap import warnings @@ -139,7 +140,7 @@ def test_with_missing_lzma(): import pandas """ ) - subprocess.check_output(["python", "-c", code]) + subprocess.check_output([sys.executable, "-c", code]) def test_with_missing_lzma_runtime(): @@ -156,4 +157,4 @@ def test_with_missing_lzma_runtime(): df.to_csv('foo.csv', compression='xz') """ ) - subprocess.check_output(["python", "-c", code]) + subprocess.check_output([sys.executable, "-c", code]) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 87a2405a10dd5c..ee668d6890756f 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -8,7 +8,7 @@ import pandas.util.testing as tm from pandas.util.testing import assert_frame_equal, ensure_clean -from pandas.io.feather_format import read_feather, to_feather # noqa:E402 +from pandas.io.feather_format import read_feather, to_feather # noqa: E402 isort:skip pyarrow = pytest.importorskip("pyarrow") diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 615e2735cd288f..183d217eb09d61 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -4,6 +4,7 @@ import os import re import threading +from urllib.error import URLError import numpy as np from numpy.random import rand @@ -17,7 +18,7 @@ import pandas.util.testing as tm from pandas.util.testing import makeCustomDataframe as mkdf, network -from pandas.io.common import URLError, file_path_to_url +from pandas.io.common import file_path_to_url import pandas.io.html from pandas.io.html import read_html diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index d8465a427eaea5..25727447b4c6fb 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -565,7 +565,6 @@ def _transaction_test(self): class _TestSQLApi(PandasSQLTest): - """ Base class to test the public API. diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py index d126407cfd823e..6511d94aa4c094 100644 --- a/pandas/tests/plotting/test_backend.py +++ b/pandas/tests/plotting/test_backend.py @@ -8,44 +8,38 @@ import pandas +dummy_backend = types.ModuleType("pandas_dummy_backend") +dummy_backend.plot = lambda *args, **kwargs: None -def test_matplotlib_backend_error(): - msg = ( - "matplotlib is required for plotting when the default backend " - '"matplotlib" is selected.' - ) - try: - import matplotlib # noqa - except ImportError: - with pytest.raises(ImportError, match=msg): - pandas.set_option("plotting.backend", "matplotlib") + +@pytest.fixture +def restore_backend(): + """Restore the plotting backend to matplotlib""" + pandas.set_option("plotting.backend", "matplotlib") + yield + pandas.set_option("plotting.backend", "matplotlib") def test_backend_is_not_module(): - msg = ( - '"not_an_existing_module" does not seem to be an installed module. 
' - "A pandas plotting backend must be a module that can be imported" - ) + msg = "Could not find plotting backend 'not_an_existing_module'." with pytest.raises(ValueError, match=msg): pandas.set_option("plotting.backend", "not_an_existing_module") + assert pandas.options.plotting.backend == "matplotlib" -def test_backend_is_correct(monkeypatch): - monkeypatch.setattr( - "pandas.core.config_init.importlib.import_module", lambda name: None - ) - pandas.set_option("plotting.backend", "correct_backend") - assert pandas.get_option("plotting.backend") == "correct_backend" - # Restore backend for other tests (matplotlib can be not installed) - try: - pandas.set_option("plotting.backend", "matplotlib") - except ImportError: - pass +def test_backend_is_correct(monkeypatch, restore_backend): + monkeypatch.setitem(sys.modules, "pandas_dummy_backend", dummy_backend) + + pandas.set_option("plotting.backend", "pandas_dummy_backend") + assert pandas.get_option("plotting.backend") == "pandas_dummy_backend" + assert ( + pandas.plotting._core._get_plot_backend("pandas_dummy_backend") is dummy_backend + ) @td.skip_if_no_mpl -def test_register_entrypoint(): +def test_register_entrypoint(restore_backend): dist = pkg_resources.get_distribution("pandas") if dist.module_path not in pandas.__file__: @@ -74,13 +68,18 @@ def test_register_entrypoint(): assert result is mod -def test_register_import(): - mod = types.ModuleType("my_backend2") - mod.plot = lambda *args, **kwargs: 1 - sys.modules["my_backend2"] = mod +def test_setting_backend_without_plot_raises(): + # GH-28163 + module = types.ModuleType("pandas_plot_backend") + sys.modules["pandas_plot_backend"] = module - result = pandas.plotting._core._get_plot_backend("my_backend2") - assert result is mod + assert pandas.options.plotting.backend == "matplotlib" + with pytest.raises( + ValueError, match="Could not find plotting backend 'pandas_plot_backend'." + ): + pandas.set_option("plotting.backend", "pandas_plot_backend") + + assert pandas.options.plotting.backend == "matplotlib" @td.skip_if_mpl diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 6cb6f818d40fdd..940cfef4058e03 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -21,7 +21,7 @@ def test_import_error_message(): # GH-19810 df = DataFrame({"A": [1, 2]}) - with pytest.raises(ImportError, match="No module named 'matplotlib'"): + with pytest.raises(ImportError, match="matplotlib is required for plotting"): df.plot() diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 6366bf0521fbc2..13f0f14014a314 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -50,7 +50,6 @@ def sort_with_none(request): class TestConcatAppendCommon: - """ Test common dtype coercion rules between concat and append. 
""" diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 5b1c4f92bf3419..5eb69fb2952dcb 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -252,6 +252,7 @@ def _get_overlap_public_nat_methods(klass, as_tuple=False): "day_name", "dst", "floor", + "fromisocalendar", "fromisoformat", "fromordinal", "fromtimestamp", @@ -296,6 +297,8 @@ def test_overlap_public_nat_methods(klass, expected): # "fromisoformat" was introduced in 3.7 if klass is Timestamp and not compat.PY37: expected.remove("fromisoformat") + if klass is Timestamp and not compat.PY38: + expected.remove("fromisocalendar") assert _get_overlap_public_nat_methods(klass) == expected diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py index 58bd03129f2df0..9634c6d8222368 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -3,6 +3,8 @@ import numpy as np import pytest +from pandas.errors import NullFrequencyError + from pandas import Timedelta, Timestamp import pandas.util.testing as tm @@ -66,6 +68,20 @@ def test_delta_preserve_nanos(self): result = val + timedelta(1) assert result.nanosecond == val.nanosecond + def test_rsub_dtscalars(self, tz_naive_fixture): + # In particular, check that datetime64 - Timestamp works GH#28286 + td = Timedelta(1235345642000) + ts = Timestamp.now(tz_naive_fixture) + other = ts + td + + assert other - ts == td + assert other.to_pydatetime() - ts == td + if tz_naive_fixture is None: + assert other.to_datetime64() - ts == td + else: + with pytest.raises(TypeError, match="subtraction must have"): + other.to_datetime64() - ts + def test_timestamp_sub_datetime(self): dt = datetime(2013, 10, 12) ts = Timestamp(datetime(2013, 10, 13)) @@ -151,3 +167,56 @@ def test_timestamp_add_timedelta64_unit(self, other, expected_difference): result = ts + other valdiff = result.value - ts.value assert valdiff == expected_difference + + @pytest.mark.parametrize("ts", [Timestamp.now(), Timestamp.now("utc")]) + @pytest.mark.parametrize( + "other", + [ + 1, + np.int64(1), + np.array([1, 2], dtype=np.int32), + np.array([3, 4], dtype=np.uint64), + ], + ) + def test_add_int_no_freq_raises(self, ts, other): + with pytest.raises(NullFrequencyError, match="without freq"): + ts + other + with pytest.raises(NullFrequencyError, match="without freq"): + other + ts + + with pytest.raises(NullFrequencyError, match="without freq"): + ts - other + with pytest.raises(TypeError): + other - ts + + @pytest.mark.parametrize( + "ts", + [ + Timestamp("1776-07-04", freq="D"), + Timestamp("1776-07-04", tz="UTC", freq="D"), + ], + ) + @pytest.mark.parametrize( + "other", + [ + 1, + np.int64(1), + np.array([1, 2], dtype=np.int32), + np.array([3, 4], dtype=np.uint64), + ], + ) + def test_add_int_with_freq(self, ts, other): + with tm.assert_produces_warning(FutureWarning): + result1 = ts + other + with tm.assert_produces_warning(FutureWarning): + result2 = other + ts + + assert np.all(result1 == result2) + + with tm.assert_produces_warning(FutureWarning): + result = result1 - other + + assert np.all(result == ts) + + with pytest.raises(TypeError): + other - ts diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 0686b397cbd811..0ddf1dfcabb59b 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -191,6 +191,20 @@ def test_to_csv_compression(self, s, encoding, compression): s, pd.read_csv(fh, index_col=0, 
squeeze=True, encoding=encoding) ) + def test_to_csv_interval_index(self): + # GH 28210 + s = Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3)) + + with ensure_clean("__tmp_to_csv_interval_index__.csv") as path: + s.to_csv(path, header=False) + result = self.read_csv(path, index_col=0, squeeze=True) + + # can't roundtrip intervalindex via read_csv so check string repr (GH 23595) + expected = s.copy() + expected.index = expected.index.astype(str) + + assert_series_equal(result, expected) + class TestSeriesIO: def test_to_frame(self, datetime_series): diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 062c07cb6242aa..bf725a04de0589 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -36,22 +36,14 @@ def test_bool_operators_with_nas(self, bool_op): expected[mask] = False assert_series_equal(result, expected) - def test_operators_bitwise(self): + def test_logical_operators_bool_dtype_with_empty(self): # GH#9016: support bitwise op for integer types index = list("bca") s_tft = Series([True, False, True], index=index) s_fff = Series([False, False, False], index=index) - s_tff = Series([True, False, False], index=index) s_empty = Series([]) - # TODO: unused - # s_0101 = Series([0, 1, 0, 1]) - - s_0123 = Series(range(4), dtype="int64") - s_3333 = Series([3] * 4) - s_4444 = Series([4] * 4) - res = s_tft & s_empty expected = s_fff assert_series_equal(res, expected) @@ -60,6 +52,16 @@ def test_operators_bitwise(self): expected = s_tft assert_series_equal(res, expected) + def test_logical_operators_int_dtype_with_int_dtype(self): + # GH#9016: support bitwise op for integer types + + # TODO: unused + # s_0101 = Series([0, 1, 0, 1]) + + s_0123 = Series(range(4), dtype="int64") + s_3333 = Series([3] * 4) + s_4444 = Series([4] * 4) + res = s_0123 & s_3333 expected = Series(range(4), dtype="int64") assert_series_equal(res, expected) @@ -68,76 +70,125 @@ def test_operators_bitwise(self): expected = Series(range(4, 8), dtype="int64") assert_series_equal(res, expected) - s_a0b1c0 = Series([1], list("b")) - - res = s_tft & s_a0b1c0 - expected = s_tff.reindex(list("abc")) + s_1111 = Series([1] * 4, dtype="int8") + res = s_0123 & s_1111 + expected = Series([0, 1, 0, 1], dtype="int64") assert_series_equal(res, expected) - res = s_tft | s_a0b1c0 - expected = s_tft.reindex(list("abc")) + res = s_0123.astype(np.int16) | s_1111.astype(np.int32) + expected = Series([1, 1, 3, 3], dtype="int32") assert_series_equal(res, expected) - n0 = 0 - res = s_tft & n0 - expected = s_fff - assert_series_equal(res, expected) + def test_logical_operators_int_dtype_with_int_scalar(self): + # GH#9016: support bitwise op for integer types + s_0123 = Series(range(4), dtype="int64") - res = s_0123 & n0 + res = s_0123 & 0 expected = Series([0] * 4) assert_series_equal(res, expected) - n1 = 1 - res = s_tft & n1 - expected = s_tft - assert_series_equal(res, expected) - - res = s_0123 & n1 + res = s_0123 & 1 expected = Series([0, 1, 0, 1]) assert_series_equal(res, expected) - s_1111 = Series([1] * 4, dtype="int8") - res = s_0123 & s_1111 - expected = Series([0, 1, 0, 1], dtype="int64") - assert_series_equal(res, expected) - - res = s_0123.astype(np.int16) | s_1111.astype(np.int32) - expected = Series([1, 1, 3, 3], dtype="int32") - assert_series_equal(res, expected) + def test_logical_operators_int_dtype_with_float(self): + # GH#9016: support bitwise op for integer types + s_0123 = Series(range(4), dtype="int64") - with 
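The new to_csv test above records that a Series with an IntervalIndex can be written to CSV (GH 28210) but cannot be round-tripped: read_csv brings the index back as strings (GH 23595). Roughly, with a file name made up for illustration:

import pandas as pd

s = pd.Series(["foo", "bar", "baz"], index=pd.interval_range(0, 3))
s.to_csv("intervals.csv", header=False)      # hypothetical path

result = pd.read_csv("intervals.csv", index_col=0, squeeze=True)
# the intervals come back as their string repr, e.g. "(0, 1]"
assert list(result.index) == [str(iv) for iv in s.index]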
pytest.raises(TypeError): - s_1111 & "a" - with pytest.raises(TypeError): - s_1111 & ["a", "b", "c", "d"] with pytest.raises(TypeError): s_0123 & np.NaN with pytest.raises(TypeError): s_0123 & 3.14 with pytest.raises(TypeError): s_0123 & [0.1, 4, 3.14, 2] + with pytest.raises(TypeError): + s_0123 & np.array([0.1, 4, 3.14, 2]) + with pytest.raises(TypeError): + s_0123 & Series([0.1, 4, -3.14, 2]) - # s_0123 will be all false now because of reindexing like s_tft - exp = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) - assert_series_equal(s_tft & s_0123, exp) - - # s_tft will be all false now because of reindexing like s_0123 - exp = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) - assert_series_equal(s_0123 & s_tft, exp) - - assert_series_equal(s_0123 & False, Series([False] * 4)) - assert_series_equal(s_0123 ^ False, Series([False, True, True, True])) - assert_series_equal(s_0123 & [False], Series([False] * 4)) - assert_series_equal(s_0123 & (False), Series([False] * 4)) - assert_series_equal( - s_0123 & Series([False, np.NaN, False, False]), Series([False] * 4) - ) + def test_logical_operators_int_dtype_with_str(self): + s_1111 = Series([1] * 4, dtype="int8") + + with pytest.raises(TypeError): + s_1111 & "a" + with pytest.raises(TypeError): + s_1111 & ["a", "b", "c", "d"] + + def test_logical_operators_int_dtype_with_bool(self): + # GH#9016: support bitwise op for integer types + s_0123 = Series(range(4), dtype="int64") + + expected = Series([False] * 4) + + result = s_0123 & False + assert_series_equal(result, expected) + + result = s_0123 & [False] + assert_series_equal(result, expected) + + result = s_0123 & (False,) + assert_series_equal(result, expected) - s_ftft = Series([False, True, False, True]) - assert_series_equal(s_0123 & Series([0.1, 4, -3.14, 2]), s_ftft) + result = s_0123 ^ False + expected = Series([False, True, True, True]) + assert_series_equal(result, expected) + + def test_logical_operators_int_dtype_with_object(self): + # GH#9016: support bitwise op for integer types + s_0123 = Series(range(4), dtype="int64") + + result = s_0123 & Series([False, np.NaN, False, False]) + expected = Series([False] * 4) + assert_series_equal(result, expected) s_abNd = Series(["a", "b", np.NaN, "d"]) - res = s_0123 & s_abNd - expected = s_ftft + with pytest.raises(TypeError, match="unsupported.* 'int' and 'str'"): + s_0123 & s_abNd + + def test_logical_operators_bool_dtype_with_int(self): + index = list("bca") + + s_tft = Series([True, False, True], index=index) + s_fff = Series([False, False, False], index=index) + + res = s_tft & 0 + expected = s_fff + assert_series_equal(res, expected) + + res = s_tft & 1 + expected = s_tft + assert_series_equal(res, expected) + + def test_logical_operators_int_dtype_with_bool_dtype_and_reindex(self): + # GH#9016: support bitwise op for integer types + + # with non-matching indexes, logical operators will cast to object + # before operating + index = list("bca") + + s_tft = Series([True, False, True], index=index) + s_tft = Series([True, False, True], index=index) + s_tff = Series([True, False, False], index=index) + + s_0123 = Series(range(4), dtype="int64") + + # s_0123 will be all false now because of reindexing like s_tft + expected = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) + result = s_tft & s_0123 + assert_series_equal(result, expected) + + expected = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"]) + result = s_0123 & s_tft + assert_series_equal(result, expected) + + s_a0b1c0 = Series([1], list("b")) + + res 
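Taken together, the split-out tests above describe the supported integer-dtype combinations for bitwise ops (GH#9016): int with int follows numpy casting rules, while float operands raise. An illustrative sketch:

import numpy as np
import pandas as pd

s = pd.Series(range(4), dtype="int64")       # 0, 1, 2, 3

assert (s & 1).tolist() == [0, 1, 0, 1]
assert (s.astype(np.int16) | pd.Series([1] * 4, dtype=np.int32)).dtype == np.int32

try:
    s & 3.14                                 # float scalars and arrays are rejected
except TypeError:
    pass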
= s_tft & s_a0b1c0 + expected = s_tff.reindex(list("abc")) + assert_series_equal(res, expected) + + res = s_tft | s_a0b1c0 + expected = s_tft.reindex(list("abc")) assert_series_equal(res, expected) def test_scalar_na_logical_ops_corners(self): @@ -523,6 +574,7 @@ def test_comparison_operators_with_nas(self): assert_series_equal(result, expected) + # FIXME: dont leave commented-out # fffffffuuuuuuuuuuuu # result = f(val, s) # expected = f(val, s.dropna()).reindex(s.index) diff --git a/pandas/tests/sparse/frame/test_frame.py b/pandas/tests/sparse/frame/test_frame.py index ddb50e0897a869..e372e2563e682c 100644 --- a/pandas/tests/sparse/frame/test_frame.py +++ b/pandas/tests/sparse/frame/test_frame.py @@ -1487,6 +1487,22 @@ def test_comparison_op_scalar(self): assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), df != 0) + def test_add_series_retains_dtype(self): + # SparseDataFrame._combine_match_columns used to incorrectly cast + # to float + d = {0: [2j, 3j], 1: [0, 1]} + sdf = SparseDataFrame(data=d, default_fill_value=1) + result = sdf + sdf[0] + + df = sdf.to_dense() + expected = df + df[0] + tm.assert_frame_equal(result.to_dense(), expected) + + # Make it explicit to be on the safe side + edata = {0: [4j, 5j], 1: [3j, 1 + 3j]} + expected = DataFrame(edata) + tm.assert_frame_equal(result.to_dense(), expected) + @pytest.mark.filterwarnings("ignore:Sparse:FutureWarning") @pytest.mark.filterwarnings("ignore:DataFrame.to_sparse:FutureWarning") diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 93baafddedeb48..3a24736c57c011 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -145,6 +145,7 @@ def _getitem_tuple(self, tup): # Cython import warning @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") +@pytest.mark.filterwarnings("ignore:RangeIndex.* is deprecated:DeprecationWarning") def test_pyarrow(df): pyarrow = import_module("pyarrow") # noqa diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 47e398dfe3d167..7a8a6d511aa69a 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -29,6 +29,7 @@ def test_namespace(): "NaTType", "iNaT", "is_null_datetimelike", + "NullFrequencyError", "OutOfBoundsDatetime", "Period", "IncompatibleFrequency", diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index b4787bf25e3bb6..70ba85120af3c6 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -334,3 +334,30 @@ def test_readonly_array(self): result = pd.Series(arr).rolling(2).mean() expected = pd.Series([np.nan, 2, np.nan, np.nan, 4]) tm.assert_series_equal(result, expected) + + def test_rolling_datetime(self, axis_frame, tz_naive_fixture): + # GH-28192 + tz = tz_naive_fixture + df = pd.DataFrame( + { + i: [1] * 2 + for i in pd.date_range("2019-8-01", "2019-08-03", freq="D", tz=tz) + } + ) + if axis_frame in [0, "index"]: + result = df.T.rolling("2D", axis=axis_frame).sum().T + else: + result = df.rolling("2D", axis=axis_frame).sum() + expected = pd.DataFrame( + { + **{ + i: [1.0] * 2 + for i in pd.date_range("2019-8-01", periods=1, freq="D", tz=tz) + }, + **{ + i: [2.0] * 2 + for i in pd.date_range("2019-8-02", "2019-8-03", freq="D", tz=tz) + }, + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index a208d5ad2fea99..edf58ba3850a1c 100644 --- a/pandas/tseries/offsets.py +++ 
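The rolling test just above covers GH-28192: a time-based window can now be rolled across datetime column labels with axis=1 and matches the transpose-then-roll result. A reduced sketch, assuming the revision this patch targets:

import pandas as pd

dates = pd.date_range("2019-08-01", "2019-08-03", freq="D")
df = pd.DataFrame({d: [1, 1] for d in dates})

result = df.rolling("2D", axis=1).sum()
# the first column has no prior day to add; later 2-day windows sum to 2
assert result[dates[0]].tolist() == [1.0, 1.0]
assert result[dates[-1]].tolist() == [2.0, 2.0]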
b/pandas/tseries/offsets.py @@ -204,8 +204,7 @@ def __add__(date): normalize : bool, default False Whether to round the result of a DateOffset addition down to the previous midnight. - **kwds - Temporal parameter that add to or replace the offset value. + **kwds : Temporal parameter that add to or replace the offset value. Parameters that **add** to the offset (like Timedelta): @@ -233,16 +232,19 @@ def __add__(date): See Also -------- - dateutil.relativedelta.relativedelta + dateutil.relativedelta.relativedelta : The relativedelta type is designed + to be applied to an existing datetime an can replace specific components of + that datetime, or represents an interval of time. Examples -------- + >>> from pandas.tseries.offsets import DateOffset >>> ts = pd.Timestamp('2017-01-01 09:10:11') >>> ts + DateOffset(months=3) Timestamp('2017-04-01 09:10:11') >>> ts = pd.Timestamp('2017-01-01 09:10:11') - >>> ts + DateOffset(month=3) + >>> ts + DateOffset(months=2) Timestamp('2017-03-01 09:10:11') """ diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 5c7d481ff2586e..8a25e511b5fc4f 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -1,21 +1,35 @@ from functools import wraps import inspect from textwrap import dedent -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Tuple, + Type, + TypeVar, + Union, + cast, +) import warnings from pandas._libs.properties import cache_readonly # noqa +FuncType = Callable[..., Any] +F = TypeVar("F", bound=FuncType) + def deprecate( name: str, - alternative: Callable, + alternative: Callable[..., Any], version: str, alt_name: Optional[str] = None, klass: Optional[Type[Warning]] = None, stacklevel: int = 2, msg: Optional[str] = None, -) -> Callable: +) -> Callable[..., Any]: """ Return a new function that emits a deprecation warning on use. @@ -47,7 +61,7 @@ def deprecate( warning_msg = msg or "{} is deprecated, use {} instead".format(name, alt_name) @wraps(alternative) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs) -> Callable[..., Any]: warnings.warn(warning_msg, klass, stacklevel=stacklevel) return alternative(*args, **kwargs) @@ -90,9 +104,9 @@ def wrapper(*args, **kwargs): def deprecate_kwarg( old_arg_name: str, new_arg_name: Optional[str], - mapping: Optional[Union[Dict, Callable[[Any], Any]]] = None, + mapping: Optional[Union[Dict[Any, Any], Callable[[Any], Any]]] = None, stacklevel: int = 2, -) -> Callable: +) -> Callable[..., Any]: """ Decorator to deprecate a keyword argument of a function. @@ -160,27 +174,27 @@ def deprecate_kwarg( "mapping from old to new argument values " "must be dict or callable!" ) - def _deprecate_kwarg(func): + def _deprecate_kwarg(func: F) -> F: @wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs) -> Callable[..., Any]: old_arg_value = kwargs.pop(old_arg_name, None) - if new_arg_name is None and old_arg_value is not None: - msg = ( - "the '{old_name}' keyword is deprecated and will be " - "removed in a future version. 
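The typing changes above hinge on a TypeVar bound to Callable, so decorators are annotated as returning the same callable type they receive. The same pattern in isolation (the log_calls name is invented for illustration):

from functools import wraps
from typing import Any, Callable, TypeVar, cast

F = TypeVar("F", bound=Callable[..., Any])

def log_calls(func: F) -> F:
    @wraps(func)
    def wrapper(*args, **kwargs):
        print("calling", func.__name__)
        return func(*args, **kwargs)
    # mypy cannot prove wrapper shares func's signature, hence the cast
    return cast(F, wrapper)

@log_calls
def add(a: int, b: int) -> int:
    return a + b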
" - "Please take steps to stop the use of '{old_name}'" - ).format(old_name=old_arg_name) - warnings.warn(msg, FutureWarning, stacklevel=stacklevel) - kwargs[old_arg_name] = old_arg_value - return func(*args, **kwargs) - if old_arg_value is not None: - if mapping is not None: - if hasattr(mapping, "get"): - new_arg_value = mapping.get(old_arg_value, old_arg_value) - else: + if new_arg_name is None: + msg = ( + "the '{old_name}' keyword is deprecated and will be " + "removed in a future version. " + "Please take steps to stop the use of '{old_name}'" + ).format(old_name=old_arg_name) + warnings.warn(msg, FutureWarning, stacklevel=stacklevel) + kwargs[old_arg_name] = old_arg_value + return func(*args, **kwargs) + + elif mapping is not None: + if callable(mapping): new_arg_value = mapping(old_arg_value) + else: + new_arg_value = mapping.get(old_arg_value, old_arg_value) msg = ( "the {old_name}={old_val!r} keyword is deprecated, " "use {new_name}={new_val!r} instead" @@ -198,7 +212,7 @@ def wrapper(*args, **kwargs): ).format(old_name=old_arg_name, new_name=new_arg_name) warnings.warn(msg, FutureWarning, stacklevel=stacklevel) - if kwargs.get(new_arg_name, None) is not None: + if kwargs.get(new_arg_name) is not None: msg = ( "Can only specify '{old_name}' or '{new_name}', " "not both" ).format(old_name=old_arg_name, new_name=new_arg_name) @@ -207,17 +221,17 @@ def wrapper(*args, **kwargs): kwargs[new_arg_name] = new_arg_value return func(*args, **kwargs) - return wrapper + return cast(F, wrapper) return _deprecate_kwarg def rewrite_axis_style_signature( name: str, extra_params: List[Tuple[str, Any]] -) -> Callable: - def decorate(func): +) -> Callable[..., Any]: + def decorate(func: F) -> F: @wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args, **kwargs) -> Callable[..., Any]: return func(*args, **kwargs) kind = inspect.Parameter.POSITIONAL_OR_KEYWORD @@ -234,8 +248,9 @@ def wrapper(*args, **kwargs): sig = inspect.Signature(params) - func.__signature__ = sig - return wrapper + # https://github.com/python/typing/issues/598 + func.__signature__ = sig # type: ignore + return cast(F, wrapper) return decorate @@ -279,18 +294,17 @@ def __init__(self, *args, **kwargs): self.params = args or kwargs - def __call__(self, func: Callable) -> Callable: + def __call__(self, func: F) -> F: func.__doc__ = func.__doc__ and func.__doc__ % self.params return func def update(self, *args, **kwargs) -> None: """ Update self.params with supplied args. - - If called, we assume self.params is a dict. 
""" - self.params.update(*args, **kwargs) + if isinstance(self.params, dict): + self.params.update(*args, **kwargs) class Appender: @@ -320,7 +334,7 @@ def __init__(self, addendum: Optional[str], join: str = "", indents: int = 0): self.addendum = addendum self.join = join - def __call__(self, func: Callable) -> Callable: + def __call__(self, func: F) -> F: func.__doc__ = func.__doc__ if func.__doc__ else "" self.addendum = self.addendum if self.addendum else "" docitems = [func.__doc__, self.addendum] diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 3de4e5d66d5774..0e07b9f5fe9f76 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -25,9 +25,8 @@ def test_foo(): """ from distutils.version import LooseVersion import locale -from typing import Optional +from typing import Callable, Optional -from _pytest.mark.structures import MarkDecorator import pytest from pandas.compat import is_platform_32bit, is_platform_windows @@ -103,7 +102,7 @@ def _skip_if_no_scipy(): ) -def skip_if_installed(package: str,) -> MarkDecorator: +def skip_if_installed(package: str) -> Callable: """ Skip a test if a package is installed. @@ -117,7 +116,7 @@ def skip_if_installed(package: str,) -> MarkDecorator: ) -def skip_if_no(package: str, min_version: Optional[str] = None) -> MarkDecorator: +def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: """ Generic function to help skip tests when required packages are not present on the testing system. diff --git a/pandas/util/testing.py b/pandas/util/testing.py index a8f0d0da52e1f4..c54dab046f57e7 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -4,7 +4,6 @@ from datetime import datetime from functools import wraps import gzip -import http.client import os import re from shutil import rmtree @@ -580,7 +579,8 @@ def assert_index_equal( check_categorical: bool = True, obj: str = "Index", ) -> None: - """Check that left and right Index are equal. + """ + Check that left and right Index are equal. Parameters ---------- @@ -1081,7 +1081,8 @@ def assert_series_equal( check_categorical=True, obj="Series", ): - """Check that left and right Series are equal. + """ + Check that left and right Series are equal. Parameters ---------- @@ -2273,11 +2274,17 @@ def dec(f): # But some tests (test_data yahoo) contact incredibly flakey # servers. -# and conditionally raise on these exception types -_network_error_classes = (IOError, http.client.HTTPException, TimeoutError) +# and conditionally raise on exception types in _get_default_network_errors + + +def _get_default_network_errors(): + # Lazy import for http.client because it imports many things from the stdlib + import http.client + return (IOError, http.client.HTTPException, TimeoutError) -def can_connect(url, error_classes=_network_error_classes): + +def can_connect(url, error_classes=None): """Try to connect to the given url. 
True if succeeds, False if IOError raised @@ -2292,6 +2299,10 @@ def can_connect(url, error_classes=_network_error_classes): Return True if no IOError (unable to connect) or URLError (bad url) was raised """ + + if error_classes is None: + error_classes = _get_default_network_errors() + try: with urlopen(url): pass @@ -2307,7 +2318,7 @@ def network( url="http://www.google.com", raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, check_before_test=False, - error_classes=_network_error_classes, + error_classes=None, skip_errnos=_network_errno_vals, _skip_on_messages=_network_error_messages, ): @@ -2395,6 +2406,9 @@ def network( """ from pytest import skip + if error_classes is None: + error_classes = _get_default_network_errors() + t.network = True @wraps(t) diff --git a/requirements-dev.txt b/requirements-dev.txt index cf11a3ee282584..c0fb9ee331b11a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -54,4 +54,5 @@ xarray xlrd xlsxwriter xlwt +odfpy pyreadstat \ No newline at end of file diff --git a/scripts/find_commits_touching_func.py b/scripts/find_commits_touching_func.py index 1075a257d42705..95a892b822cff7 100755 --- a/scripts/find_commits_touching_func.py +++ b/scripts/find_commits_touching_func.py @@ -10,11 +10,11 @@ Usage:: $ ./find_commits_touching_func.py (see arguments below) """ -import logging -import re -import os import argparse from collections import namedtuple +import logging +import os +import re from dateutil.parser import parse diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 6ae10c2cb07d29..29fe8bf84c12b0 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -16,8 +16,8 @@ import os import re import sys -import yaml +import yaml EXCLUDE = {"python=3"} RENAME = {"pytables": "tables", "pyqt": "pyqt5"} diff --git a/scripts/merge-pr.py b/scripts/merge-pr.py index 95352751a23c6b..300cb149f387fc 100755 --- a/scripts/merge-pr.py +++ b/scripts/merge-pr.py @@ -22,14 +22,15 @@ # usage: ./apache-pr-merge.py (see config env vars below) # # Lightly modified from version of this script in incubator-parquet-format -from subprocess import check_output -from requests.auth import HTTPBasicAuth -import requests import os +from subprocess import check_output import sys import textwrap +import requests +from requests.auth import HTTPBasicAuth + PANDAS_HOME = "." 
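The testing.py hunks above replace the module-level _network_error_classes tuple with a lazy lookup, so importing pandas.util.testing no longer imports http.client eagerly. The pattern, reduced to a self-contained sketch:

from urllib.request import urlopen


def _default_network_errors():
    # deferred: http.client pulls in a sizeable slice of the stdlib
    import http.client

    return (IOError, http.client.HTTPException, TimeoutError)


def can_connect(url, error_classes=None):
    if error_classes is None:
        error_classes = _default_network_errors()
    try:
        with urlopen(url):
            pass
    except error_classes:
        return False
    return True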
PROJECT_NAME = "pandas" print("PANDAS_HOME = " + PANDAS_HOME) diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 35aaf10458f449..85e5bf239cbfa8 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -2,12 +2,13 @@ import random import string import textwrap -import pytest -import numpy as np -import pandas as pd +import numpy as np +import pytest import validate_docstrings +import pandas as pd + validate_one = validate_docstrings.validate_one diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index bf5d861281a36b..401eaf8ff5ed5c 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -13,20 +13,20 @@ $ ./validate_docstrings.py $ ./validate_docstrings.py pandas.DataFrame.head """ -import os -import sys -import json -import re -import glob -import functools -import collections import argparse -import pydoc -import inspect -import importlib +import ast +import collections import doctest +import functools +import glob +import importlib +import inspect +import json +import os +import pydoc +import re +import sys import tempfile -import ast import textwrap import flake8.main.application @@ -41,20 +41,20 @@ # script. Setting here before matplotlib is loaded. # We don't warn for the number of open plots, as none is actually being opened os.environ["MPLBACKEND"] = "Template" -import matplotlib +import matplotlib # noqa: E402 isort:skip matplotlib.rc("figure", max_open_warning=10000) -import numpy +import numpy # noqa: E402 isort:skip BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.join(BASE_PATH)) -import pandas +import pandas # noqa: E402 isort:skip sys.path.insert(1, os.path.join(BASE_PATH, "doc", "sphinxext")) -from numpydoc.docscrape import NumpyDocString -from pandas.io.formats.printing import pprint_thing +from numpydoc.docscrape import NumpyDocString # noqa: E402 isort:skip +from pandas.io.formats.printing import pprint_thing # noqa: E402 isort:skip PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"] diff --git a/setup.cfg b/setup.cfg index 716ff5d9d8853f..43dbac15f5cfe7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -110,68 +110,25 @@ directory = coverage_html_report # To be kept consistent with "Import Formatting" section in contributing.rst [isort] -known_pre_libs=pandas._config -known_pre_core=pandas._libs,pandas.util._*,pandas.compat,pandas.errors -known_dtypes=pandas.core.dtypes -known_post_core=pandas.tseries,pandas.io,pandas.plotting -sections=FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER - -known_first_party=pandas -known_third_party=Cython,numpy,dateutil,matplotlib,python-dateutil,pytz,pyarrow,pytest - -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -line_length=88 -force_sort_within_sections=True -skip_glob=env, -skip= - pandas/__init__.py - pandas/core/api.py, - pandas/io/msgpack/__init__.py - asv_bench/benchmarks/attrs_caching.py, - asv_bench/benchmarks/binary_ops.py, - asv_bench/benchmarks/categoricals.py, - asv_bench/benchmarks/ctors.py, - asv_bench/benchmarks/eval.py, - asv_bench/benchmarks/frame_ctor.py, - asv_bench/benchmarks/frame_methods.py, - asv_bench/benchmarks/gil.py, - asv_bench/benchmarks/groupby.py, - asv_bench/benchmarks/index_object.py, - asv_bench/benchmarks/indexing.py, - asv_bench/benchmarks/inference.py, - asv_bench/benchmarks/io/csv.py, - asv_bench/benchmarks/io/excel.py, - 
asv_bench/benchmarks/io/hdf.py, - asv_bench/benchmarks/io/json.py, - asv_bench/benchmarks/io/msgpack.py, - asv_bench/benchmarks/io/pickle.py, - asv_bench/benchmarks/io/sql.py, - asv_bench/benchmarks/io/stata.py, - asv_bench/benchmarks/join_merge.py, - asv_bench/benchmarks/multiindex_object.py, - asv_bench/benchmarks/panel_ctor.py, - asv_bench/benchmarks/panel_methods.py, - asv_bench/benchmarks/plotting.py, - asv_bench/benchmarks/reindex.py, - asv_bench/benchmarks/replace.py, - asv_bench/benchmarks/reshape.py, - asv_bench/benchmarks/rolling.py, - asv_bench/benchmarks/series_methods.py, - asv_bench/benchmarks/sparse.py, - asv_bench/benchmarks/stat_ops.py, - asv_bench/benchmarks/timeseries.py - asv_bench/benchmarks/pandas_vb_common.py - asv_bench/benchmarks/offset.py - asv_bench/benchmarks/dtypes.py - asv_bench/benchmarks/strings.py - asv_bench/benchmarks/period.py +known_pre_libs = pandas._config +known_pre_core = pandas._libs,pandas.util._*,pandas.compat,pandas.errors +known_dtypes = pandas.core.dtypes +known_post_core = pandas.tseries,pandas.io,pandas.plotting +sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER +known_first_party = pandas +known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml +multi_line_output = 3 +include_trailing_comma = True +force_grid_wrap = 0 +combine_as_imports = True +line_length = 88 +force_sort_within_sections = True +skip_glob = env, +skip = pandas/__init__.py,pandas/core/api.py [mypy] ignore_missing_imports=True no_implicit_optional=True [mypy-pandas.conftest,pandas.tests.*] -ignore_errors=True \ No newline at end of file +ignore_errors=True diff --git a/setup.py b/setup.py index d2c6b18b892cda..05e5f5250e2506 100755 --- a/setup.py +++ b/setup.py @@ -6,16 +6,16 @@ BSD license. Parts are from lxml (https://github.com/lxml/lxml) """ +from distutils.sysconfig import get_config_vars +from distutils.version import LooseVersion import os from os.path import join as pjoin - -import pkg_resources import platform -from distutils.sysconfig import get_config_vars -import sys import shutil -from distutils.version import LooseVersion -from setuptools import setup, Command, find_packages +import sys + +import pkg_resources +from setuptools import Command, find_packages, setup # versioning import versioneer @@ -58,8 +58,8 @@ def is_platform_mac(): # The import of Extension must be after the import of Cython, otherwise # we do not get the appropriately patched class. 
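With the long per-file skip list dropped, the isort settings above now cover the benchmark and script files too; force_sort_within_sections orders each section by module name regardless of import style. An illustrative grouping under that configuration (imports chosen only to show the ordering):

from datetime import datetime
import os

import numpy as np
import pytest

from pandas import DataFrame, Series
import pandas.util.testing as tm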
# See https://cython.readthedocs.io/en/latest/src/reference/compilation.html -from distutils.extension import Extension # noqa:E402 -from distutils.command.build import build # noqa:E402 +from distutils.extension import Extension # noqa: E402 isort:skip +from distutils.command.build import build # noqa: E402 isort:skip try: if not _CYTHON_INSTALLED: @@ -277,6 +277,7 @@ def initialize_options(self): ".pyo", ".pyd", ".c", + ".cpp", ".orig", ): self._clean_me.append(filepath) @@ -300,12 +301,12 @@ def run(self): for clean_me in self._clean_me: try: os.unlink(clean_me) - except Exception: + except OSError: pass for clean_tree in self._clean_trees: try: shutil.rmtree(clean_tree) - except Exception: + except OSError: pass @@ -831,9 +832,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): ] }, entry_points={ - "pandas_plotting_backends": [ - "matplotlib = pandas:plotting._matplotlib", - ], + "pandas_plotting_backends": ["matplotlib = pandas:plotting._matplotlib"] }, **setuptools_kwargs )
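The last hunks narrow the clean command's blanket except Exception to except OSError, so only filesystem failures are swallowed while genuine bugs still surface. In miniature (the clean helper name is invented):

import os
import shutil


def clean(files, trees):
    for path in files:
        try:
            os.unlink(path)
        except OSError:        # missing file, permissions, etc.
            pass
    for tree in trees:
        try:
            shutil.rmtree(tree)
        except OSError:
            pass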