diff --git a/.binstar.yml b/.binstar.yml deleted file mode 100644 index 7b507b4f90049..0000000000000 --- a/.binstar.yml +++ /dev/null @@ -1,28 +0,0 @@ -package: pandas -user: jreback - -install: - - conda config --add channels pandas - -before_script: - - python -V - -platform: - - linux-64 - #- linux-32 - - osx-64 - #- win-32 - - win-64 -engine: - - python=2.7 - - python=3.4 -script: - - conda build conda.recipe --quiet - -iotimeout: 600 - -build_targets: conda - -notifications: - email: - recipients: ['jeff@reback.net'] diff --git a/.devcontainer.json b/.devcontainer.json new file mode 100644 index 0000000000000..315a1ff647012 --- /dev/null +++ b/.devcontainer.json @@ -0,0 +1,28 @@ +// For format details, see https://aka.ms/vscode-remote/devcontainer.json or the definition README at +// https://github.com/microsoft/vscode-dev-containers/tree/master/containers/python-3-miniconda +{ + "name": "pandas", + "context": ".", + "dockerFile": "Dockerfile", + + // Use 'settings' to set *default* container specific settings.json values on container create. + // You can edit these settings after create using File > Preferences > Settings > Remote. + "settings": { + "terminal.integrated.shell.linux": "/bin/bash", + "python.condaPath": "/opt/conda/bin/conda", + "python.pythonPath": "/opt/conda/bin/python", + "python.formatting.provider": "black", + "python.linting.enabled": true, + "python.linting.flake8Enabled": true, + "python.linting.pylintEnabled": false, + "python.linting.mypyEnabled": true, + "python.testing.pytestEnabled": true, + "python.testing.cwd": "pandas/tests" + }, + + // Add the IDs of extensions you want installed when the container is created in the array below. + "extensions": [ + "ms-python.python", + "ms-vscode.cpptools" + ] +} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a36420556ae24..d87fa5203bd52 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,53 +23,53 @@ jobs: - name: Looking for unwanted patterns run: ci/code_checks.sh patterns - if: true + if: always() - name: Setup environment and build pandas run: ci/setup_env.sh - if: true + if: always() - name: Linting run: | source activate pandas-dev ci/code_checks.sh lint - if: true + if: always() - name: Dependencies consistency run: | source activate pandas-dev ci/code_checks.sh dependencies - if: true + if: always() - name: Checks on imported code run: | source activate pandas-dev ci/code_checks.sh code - if: true + if: always() - name: Running doctests run: | source activate pandas-dev ci/code_checks.sh doctests - if: true + if: always() - name: Docstring validation run: | source activate pandas-dev ci/code_checks.sh docstrings - if: true + if: always() - name: Typing validation run: | source activate pandas-dev ci/code_checks.sh typing - if: true + if: always() - name: Testing docstring validation script run: | source activate pandas-dev pytest --capture=no --strict scripts - if: true + if: always() - name: Running benchmarks run: | @@ -87,7 +87,7 @@ jobs: else echo "Benchmarks did not run, no changes detected" fi - if: true + if: always() - name: Publish benchmarks artifact uses: actions/upload-artifact@master @@ -95,3 +95,65 @@ jobs: name: Benchmarks log path: asv_bench/benchmarks.log if: failure() + + web_and_docs: + name: Web and docs + runs-on: ubuntu-latest + steps: + + - name: Setting conda path + run: echo "::set-env name=PATH::${HOME}/miniconda3/bin:${PATH}" + + - name: Checkout + uses: actions/checkout@v1 + + - name: Setup environment and build pandas + run: 
ci/setup_env.sh + + - name: Build website + run: | + source activate pandas-dev + python web/pandas_web.py web/pandas --target-path=web/build + + - name: Build documentation + run: | + source activate pandas-dev + doc/make.py --warnings-are-errors | tee sphinx.log ; exit ${PIPESTATUS[0]} + + # This can be removed when the ipython directive fails when there are errors, + # including the `tee sphinx.log` in te previous step (https://github.com/ipython/ipython/issues/11547) + - name: Check ipython directive errors + run: "! grep -B1 \"^<<<-------------------------------------------------------------------------$\" sphinx.log" + + - name: Merge website and docs + run: | + mkdir -p pandas_web/docs + cp -r web/build/* pandas_web/ + cp -r doc/build/html/* pandas_web/docs/ + if: github.event_name == 'push' + + - name: Install Rclone + run: sudo apt install rclone -y + if: github.event_name == 'push' + + - name: Set up Rclone + run: | + RCLONE_CONFIG_PATH=$HOME/.config/rclone/rclone.conf + mkdir -p `dirname $RCLONE_CONFIG_PATH` + echo "[ovh_cloud_pandas_web]" > $RCLONE_CONFIG_PATH + echo "type = swift" >> $RCLONE_CONFIG_PATH + echo "env_auth = false" >> $RCLONE_CONFIG_PATH + echo "auth_version = 3" >> $RCLONE_CONFIG_PATH + echo "auth = https://auth.cloud.ovh.net/v3/" >> $RCLONE_CONFIG_PATH + echo "endpoint_type = public" >> $RCLONE_CONFIG_PATH + echo "tenant_domain = default" >> $RCLONE_CONFIG_PATH + echo "tenant = 2977553886518025" >> $RCLONE_CONFIG_PATH + echo "domain = default" >> $RCLONE_CONFIG_PATH + echo "user = w4KGs3pmDxpd" >> $RCLONE_CONFIG_PATH + echo "key = ${{ secrets.ovh_object_store_key }}" >> $RCLONE_CONFIG_PATH + echo "region = BHS" >> $RCLONE_CONFIG_PATH + if: github.event_name == 'push' + + - name: Sync web + run: rclone sync pandas_web ovh_cloud_pandas_web:dev + if: github.event_name == 'push' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b34f5dfdd1a83..139b9e31df46c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,8 +11,20 @@ repos: language: python_venv additional_dependencies: [flake8-comprehensions>=3.1.0] - repo: https://github.com/pre-commit/mirrors-isort - rev: v4.3.20 + rev: v4.3.21 hooks: - id: isort language: python_venv exclude: ^pandas/__init__\.py$|^pandas/core/api\.py$ +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.730 + hooks: + - id: mypy + args: + # As long as a some files are excluded from check-untyped-defs + # we have to exclude it from the pre-commit hook as the configuration + # is based on modules but the hook runs on files. 
+ - --no-check-untyped-defs + - --follow-imports + - skip + files: pandas/ diff --git a/.travis.yml b/.travis.yml index 0c7740295b637..2c8533d02ddc1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ language: python -python: 3.5 +python: 3.7 # To turn off cached cython files and compiler cache # set NOCACHE-true @@ -7,10 +7,10 @@ python: 3.5 # travis cache --delete inside the project directory from the travis command line client # The cache directories will be deleted if anything in ci/ changes in a commit cache: - ccache: true - directories: - - $HOME/.cache # cython cache - - $HOME/.ccache # compiler cache + ccache: true + directories: + - $HOME/.cache # cython cache + - $HOME/.ccache # compiler cache env: global: @@ -20,45 +20,40 @@ env: - secure: "EkWLZhbrp/mXJOx38CHjs7BnjXafsqHtwxPQrqWy457VDFWhIY1DMnIR/lOWG+a20Qv52sCsFtiZEmMfUjf0pLGXOqurdxbYBGJ7/ikFLk9yV2rDwiArUlVM9bWFnFxHvdz9zewBH55WurrY4ShZWyV+x2dWjjceWG5VpWeI6sA=" git: - # for cloning - depth: false + # for cloning + depth: false matrix: - fast_finish: true - exclude: - # Exclude the default Python 3.5 build - - python: 3.5 + fast_finish: true - include: + include: - env: - - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network)" + - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network and not clipboard)" - env: - - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network)" + - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network and not clipboard)" - env: - - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" + - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" services: - mysql - postgresql - env: - - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" + # Enabling Deprecations when running tests + # PANDAS_TESTING_MODE="deprecate" causes DeprecationWarning messages to be displayed in the logs + # See pandas/_testing.py for more details. + - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" services: - mysql - postgresql - # In allow_failures - env: - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" SQL="1" services: - mysql - postgresql - allow_failures: - - env: - - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" SQL="1" - before_install: - echo "before_install" # set non-blocking IO on travis @@ -78,7 +73,6 @@ before_install: # This overrides travis and tells it to look nowhere. - export BOTO_CONFIG=/dev/null - install: - echo "install start" - ci/prep_cython_cache.sh @@ -95,5 +89,5 @@ script: after_script: - echo "after_script start" - source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - - ci/print_skipped.py + - ci/print_skipped.py - echo "after_script done" diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000..b8aff5d671dcf --- /dev/null +++ b/Dockerfile @@ -0,0 +1,47 @@ +FROM continuumio/miniconda3 + +# if you forked pandas, you can pass in your own GitHub username to use your fork +# i.e. 
gh_username=myname +ARG gh_username=pandas-dev +ARG pandas_home="/home/pandas" + +# Avoid warnings by switching to noninteractive +ENV DEBIAN_FRONTEND=noninteractive + +# Configure apt and install packages +RUN apt-get update \ + && apt-get -y install --no-install-recommends apt-utils dialog 2>&1 \ + # + # Verify git, process tools, lsb-release (common in install instructions for CLIs) installed + && apt-get -y install git iproute2 procps iproute2 lsb-release \ + # + # Install C compilers (gcc not enough, so just went with build-essential which admittedly might be overkill), + # needed to build pandas C extensions + && apt-get -y install build-essential \ + # + # cleanup + && apt-get autoremove -y \ + && apt-get clean -y \ + && rm -rf /var/lib/apt/lists/* + +# Switch back to dialog for any ad-hoc use of apt-get +ENV DEBIAN_FRONTEND=dialog + +# Clone pandas repo +RUN mkdir "$pandas_home" \ + && git clone "https://github.com/$gh_username/pandas.git" "$pandas_home" \ + && cd "$pandas_home" \ + && git remote add upstream "https://github.com/pandas-dev/pandas.git" \ + && git pull upstream master + +# Because it is surprisingly difficult to activate a conda environment inside a DockerFile +# (from personal experience and per https://github.com/ContinuumIO/docker-images/issues/89), +# we just update the base/root one from the 'environment.yml' file instead of creating a new one. +# +# Set up environment +RUN conda env update -n base -f "$pandas_home/environment.yml" + +# Build C extensions and pandas +RUN cd "$pandas_home" \ + && python setup.py build_ext --inplace -j 4 \ + && python -m pip install -e . diff --git a/LICENSE b/LICENSE index 924de26253bf4..76954a5a339ab 100644 --- a/LICENSE +++ b/LICENSE @@ -1,8 +1,10 @@ BSD 3-Clause License -Copyright (c) 2008-2012, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team +Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team All rights reserved. +Copyright (c) 2011-2020, Open source contributors. + Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/RELEASE.md b/RELEASE.md index efd075dabcba9..7924ffaff561f 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -2,5 +2,5 @@ Release Notes ============= The list of changes to Pandas between each release can be found -[here](http://pandas.pydata.org/pandas-docs/stable/whatsnew.html). For full +[here](https://pandas.pydata.org/pandas-docs/stable/whatsnew/index.html). For full details, see the commit logs at http://github.com/pandas-dev/pandas. 
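Note: the Dockerfile added earlier in this patch finishes by compiling the C extensions in place and installing pandas in development (editable) mode. A minimal, purely illustrative sanity check of that install, assuming it is run inside the container, is to import the freshly built package and print its version information, mirroring the ``pandas.show_versions()`` call the Travis ``after_script`` step already uses:

.. code-block:: python

    # Run inside the container built from the Dockerfile above (illustrative only).
    # Confirms the editable install is importable and the C extensions were built.
    import pandas as pd

    print(pd.__version__)  # development version string of the checked-out source
    pd.show_versions()     # prints pandas and optional dependency versions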
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index c04bbf53a86a6..7886b63e9983e 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -7,7 +7,7 @@ "project": "pandas", // The project's homepage - "project_url": "http://pandas.pydata.org/", + "project_url": "https://pandas.pydata.org/", // The URL of the source code repository for the project being // benchmarked @@ -43,6 +43,7 @@ "matplotlib": [], "sqlalchemy": [], "scipy": [], + "numba": [], "numexpr": [], "pytables": [null, ""], // platform dependent, see excludes below "tables": [null, ""], @@ -122,5 +123,8 @@ ".*": "0409521665" }, "regression_thresholds": { - } + }, + "build_command": + ["python setup.py build -j4", + "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"], } diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 7d97f2c740acb..0f3b3838de1b2 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -5,7 +5,8 @@ from pandas._libs import lib import pandas as pd -from pandas.util import testing as tm + +from .pandas_vb_common import tm for imp in ["pandas.util", "pandas.tools.hashing"]: try: diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 58e0db67d6025..64e067d25a454 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,3 +1,5 @@ +import operator + import numpy as np from pandas import DataFrame, Series, date_range @@ -9,6 +11,36 @@ import pandas.computation.expressions as expr +class IntFrameWithScalar: + params = [ + [np.float64, np.int64], + [2, 3.0, np.int32(4), np.float64(5)], + [ + operator.add, + operator.sub, + operator.mul, + operator.truediv, + operator.floordiv, + operator.pow, + operator.mod, + operator.eq, + operator.ne, + operator.gt, + operator.ge, + operator.lt, + operator.le, + ], + ] + param_names = ["dtype", "scalar", "op"] + + def setup(self, dtype, scalar, op): + arr = np.random.randn(20000, 100) + self.df = DataFrame(arr.astype(dtype)) + + def time_frame_op_with_scalar(self, dtype, scalar, op): + op(self.df, scalar) + + class Ops: params = [[True, False], ["default", 1]] diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 43b1b31a0bfe8..1dcd52ac074a6 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -3,7 +3,8 @@ import numpy as np import pandas as pd -import pandas.util.testing as tm + +from .pandas_vb_common import tm try: from pandas.api.types import union_categoricals diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index a9e45cad22d27..7c43485f5ef45 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,7 +1,8 @@ import numpy as np from pandas import DatetimeIndex, Index, MultiIndex, Series, Timestamp -import pandas.util.testing as tm + +from .pandas_vb_common import tm def no_change(arr): diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index 1deca8fe3aad0..2b24bab85bc57 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,7 +1,8 @@ import numpy as np from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range -import pandas.util.testing as tm + +from .pandas_vb_common import tm try: from pandas.tseries.offsets import Nano, Hour diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 
ae6c07107f4a0..2187668c96ca4 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -4,7 +4,8 @@ import numpy as np from pandas import DataFrame, MultiIndex, NaT, Series, date_range, isnull, period_range -import pandas.util.testing as tm + +from .pandas_vb_common import tm class GetNumericData: diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 860c6cc6192bb..e266d871f5bc6 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -2,7 +2,8 @@ from pandas import DataFrame, Series, date_range, factorize, read_csv from pandas.core.algorithms import take_1d -import pandas.util.testing as tm + +from .pandas_vb_common import tm try: from pandas import ( @@ -24,7 +25,7 @@ except ImportError: from pandas import algos try: - from pandas.util.testing import test_parallel + from pandas._testing import test_parallel have_real_test_parallel = True except ImportError: diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index d51c53e2264f1..28e0dcc5d9b13 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -13,7 +13,8 @@ date_range, period_range, ) -import pandas.util.testing as tm + +from .pandas_vb_common import tm method_blacklist = { "object": { diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index d69799eb70040..103141545504b 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -12,7 +12,8 @@ Series, date_range, ) -import pandas.util.testing as tm + +from .pandas_vb_common import tm class SetOperations: diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index c78c2fa92827e..087fe3916845b 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -17,7 +17,8 @@ option_context, period_range, ) -import pandas.util.testing as tm + +from .pandas_vb_common import tm class NumericSeriesIndexing: @@ -131,6 +132,7 @@ def setup(self): self.col_scalar = columns[10] self.bool_indexer = self.df[self.col_scalar] > 0 self.bool_obj_indexer = self.bool_indexer.astype(object) + self.boolean_indexer = (self.df[self.col_scalar] > 0).astype("boolean") def time_loc(self): self.df.loc[self.idx_scalar, self.col_scalar] @@ -144,6 +146,9 @@ def time_boolean_rows(self): def time_boolean_rows_object(self): self.df[self.bool_obj_indexer] + def time_boolean_rows_boolean(self): + self.df[self.boolean_indexer] + class DataFrameNumericIndexing: def setup(self): diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index e85b3bd2c7687..1a8d5ede52512 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, Series, to_numeric -import pandas.util.testing as tm -from .pandas_vb_common import lib, numeric_dtypes +from .pandas_vb_common import lib, numeric_dtypes, tm class NumericInferOps: diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index b8e8630e663ee..9bcd125f56bbb 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -5,9 +5,8 @@ import numpy as np from pandas import Categorical, DataFrame, date_range, read_csv, to_datetime -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class ToCSV(BaseIO): diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 
75d87140488e3..80af2cff41769 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -6,7 +6,8 @@ from odf.text import P from pandas import DataFrame, ExcelWriter, date_range, read_excel -import pandas.util.testing as tm + +from ..pandas_vb_common import tm def _generate_dataframe(): diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py index 88c1a3dc48ea4..4ca399a293a4b 100644 --- a/asv_bench/benchmarks/io/hdf.py +++ b/asv_bench/benchmarks/io/hdf.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, HDFStore, date_range, read_hdf -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class HDFStoreDataFrame(BaseIO): diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 8f037e94e0095..f478bf2aee0ba 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, concat, date_range, read_json, timedelta_range -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class ReadJSON(BaseIO): @@ -132,6 +131,30 @@ def peakmem_to_json_wide(self, orient, frame): df.to_json(self.fname, orient=orient) +class ToJSONISO(BaseIO): + fname = "__test__.json" + params = [["split", "columns", "index", "values", "records"]] + param_names = ["orient"] + + def setup(self, orient): + N = 10 ** 5 + index = date_range("20000101", periods=N, freq="H") + timedeltas = timedelta_range(start=1, periods=N, freq="s") + datetimes = date_range(start=1, periods=N, freq="s") + self.df = DataFrame( + { + "td_1": timedeltas, + "td_2": timedeltas, + "ts_1": datetimes, + "ts_2": datetimes, + }, + index=index, + ) + + def time_iso_format(self, orient): + self.df.to_json(orient=orient, date_format="iso") + + class ToJSONLines(BaseIO): fname = "__test__.json" diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py index 12620656dd2bf..4ca9a82ae4827 100644 --- a/asv_bench/benchmarks/io/pickle.py +++ b/asv_bench/benchmarks/io/pickle.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, date_range, read_pickle -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class Pickle(BaseIO): diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py index 6cc7f56ae3d65..b71bb832280b9 100644 --- a/asv_bench/benchmarks/io/sql.py +++ b/asv_bench/benchmarks/io/sql.py @@ -4,7 +4,8 @@ from sqlalchemy import create_engine from pandas import DataFrame, date_range, read_sql_query, read_sql_table -import pandas.util.testing as tm + +from ..pandas_vb_common import tm class SQL: diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py index f3125f8598418..9faafa82ff46e 100644 --- a/asv_bench/benchmarks/io/stata.py +++ b/asv_bench/benchmarks/io/stata.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, date_range, read_stata -import pandas.util.testing as tm -from ..pandas_vb_common import BaseIO +from ..pandas_vb_common import BaseIO, tm class Stata(BaseIO): diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 5cf9f6336ba0c..1333b3a0f0560 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -3,7 +3,8 @@ import numpy as np from pandas import DataFrame, MultiIndex, Series, concat, date_range, merge, merge_asof -import 
pandas.util.testing as tm + +from .pandas_vb_common import tm try: from pandas import merge_ordered diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 5a396c9f0deff..0e188c58012fa 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -3,7 +3,8 @@ import numpy as np from pandas import DataFrame, MultiIndex, RangeIndex, date_range -import pandas.util.testing as tm + +from .pandas_vb_common import tm class GetLoc: diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 1faf13329110d..6da2b2270c04a 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -13,6 +13,13 @@ except (ImportError, TypeError, ValueError): pass +# Compatibility import for the testing module +try: + import pandas._testing as tm # noqa +except ImportError: + import pandas.util.testing as tm # noqa + + numeric_dtypes = [ np.int64, np.int32, diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index cd450f801c805..03394e6fe08cb 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,9 +1,8 @@ import numpy as np from pandas import DataFrame, Index, MultiIndex, Series, date_range, period_range -import pandas.util.testing as tm -from .pandas_vb_common import lib +from .pandas_vb_common import lib, tm class Reindex: diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 7a72622fd5fe3..f7e1e395a76bc 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -44,6 +44,27 @@ def time_rolling(self, constructor, window, dtype, function, raw): self.roll.apply(function, raw=raw) +class Engine: + params = ( + ["DataFrame", "Series"], + ["int", "float"], + [np.sum, lambda x: np.sum(x) + 5], + ["cython", "numba"], + ) + param_names = ["constructor", "dtype", "function", "engine"] + + def setup(self, constructor, dtype, function, engine): + N = 10 ** 3 + arr = (100 * np.random.random(N)).astype(dtype) + self.data = getattr(pd, constructor)(arr) + + def time_rolling_apply(self, constructor, dtype, function, engine): + self.data.rolling(10).apply(function, raw=True, engine=engine) + + def time_expanding_apply(self, constructor, dtype, function, engine): + self.data.expanding().apply(function, raw=True, engine=engine) + + class ExpandingMethods: params = ( diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index a3f1d92545c3f..57c625ced8a43 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -3,7 +3,8 @@ import numpy as np from pandas import NaT, Series, date_range -import pandas.util.testing as tm + +from .pandas_vb_common import tm class SeriesConstructor: diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index f30b2482615bd..d7fb2775376c0 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -3,7 +3,8 @@ import numpy as np from pandas import DataFrame, Series -import pandas.util.testing as tm + +from .pandas_vb_common import tm class Methods: diff --git a/asv_bench/benchmarks/tslibs/offsets.py b/asv_bench/benchmarks/tslibs/offsets.py index d6379b922641c..fc1efe63307b2 100644 --- a/asv_bench/benchmarks/tslibs/offsets.py +++ b/asv_bench/benchmarks/tslibs/offsets.py @@ -59,7 +59,7 @@ def setup(self, offset): def time_on_offset(self, offset): for date in self.dates: - 
offset.onOffset(date) + offset.is_on_offset(date) class OffestDatetimeArithmetic: diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index cb0b17e3553a4..c9a2e4eefd19d 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -18,31 +18,39 @@ jobs: py36_minimum_versions: ENV_FILE: ci/deps/azure-36-minimum_versions.yaml CONDA_PY: "36" - PATTERN: "not slow and not network" + PATTERN: "not slow and not network and not clipboard" + py36_locale_slow_old_np: ENV_FILE: ci/deps/azure-36-locale_slow.yaml CONDA_PY: "36" PATTERN: "slow" - LOCALE_OVERRIDE: "zh_CN.UTF-8" + # pandas does not use the language (zh_CN), but should support diferent encodings (utf8) + # we should test with encodings different than utf8, but doesn't seem like Ubuntu supports any + LANG: "zh_CN.utf8" + LC_ALL: "zh_CN.utf8" EXTRA_APT: "language-pack-zh-hans" py36_locale: ENV_FILE: ci/deps/azure-36-locale.yaml CONDA_PY: "36" PATTERN: "not slow and not network" - LOCALE_OVERRIDE: "it_IT.UTF-8" + LANG: "it_IT.utf8" + LC_ALL: "it_IT.utf8" + EXTRA_APT: "language-pack-it xsel" py36_32bit: ENV_FILE: ci/deps/azure-36-32bit.yaml CONDA_PY: "36" - PATTERN: "not slow and not network" + PATTERN: "not slow and not network and not clipboard" BITS32: "yes" py37_locale: ENV_FILE: ci/deps/azure-37-locale.yaml CONDA_PY: "37" PATTERN: "not slow and not network" - LOCALE_OVERRIDE: "zh_CN.UTF-8" + LANG: "zh_CN.utf8" + LC_ALL: "zh_CN.utf8" + EXTRA_APT: "language-pack-zh-hans xsel" py37_np_dev: ENV_FILE: ci/deps/azure-37-numpydev.yaml @@ -54,10 +62,16 @@ jobs: steps: - script: | - if [ "$(uname)" == "Linux" ]; then sudo apt-get install -y libc6-dev-i386 $EXTRA_APT; fi - echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' - echo "Creating Environment" - ci/setup_env.sh + if [ "$(uname)" == "Linux" ]; then + sudo apt-get update + sudo apt-get install -y libc6-dev-i386 $EXTRA_APT + fi + displayName: 'Install extra packages' + + - script: echo '##vso[task.prependpath]$(HOME)/miniconda3/bin' + displayName: 'Set conda path' + + - script: ci/setup_env.sh displayName: 'Setup environment and build pandas' - script: | diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 03529bd6569c6..187a5db99802f 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -34,7 +34,7 @@ jobs: - bash: | source activate pandas-dev conda list - python setup.py build_ext -q -i + python setup.py build_ext -q -i -j 4 python -m pip install --no-build-isolation -e . displayName: 'Build' diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 94eaab0a5b4da..0cc42be42d61e 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -100,6 +100,14 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then cpplint --quiet --extensions=c,h --headers=h --recursive --filter=-readability/casting,-runtime/int,-build/include_subdir pandas/_libs/src/*.h pandas/_libs/src/parser pandas/_libs/ujson pandas/_libs/tslibs/src/datetime pandas/_libs/*.cpp RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of not concatenated strings' ; echo $MSG + if [[ "$GITHUB_ACTIONS" == "true" ]]; then + $BASE_DIR/scripts/validate_string_concatenation.py --format="[error]{source_path}:{line_number}:{msg}" . + else + $BASE_DIR/scripts/validate_string_concatenation.py . 
+ fi + RET=$(($RET + $?)) ; echo $MSG "DONE" + echo "isort --version-number" isort --version-number @@ -122,13 +130,18 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then # Check for imports from collections.abc instead of `from collections import abc` MSG='Check for non-standard imports' ; echo $MSG invgrep -R --include="*.py*" -E "from pandas.core.common import" pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" invgrep -R --include="*.py*" -E "from pandas.core import common" pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" invgrep -R --include="*.py*" -E "from collections.abc import" pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" invgrep -R --include="*.py*" -E "from numpy import nan" pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" # Checks for test suite - # Check for imports from pandas.util.testing instead of `import pandas.util.testing as tm` - invgrep -R --include="*.py*" -E "from pandas.util.testing import" pandas/tests + # Check for imports from pandas._testing instead of `import pandas._testing as tm` + invgrep -R --include="*.py*" -E "from pandas._testing import" pandas/tests + RET=$(($RET + $?)) ; echo $MSG "DONE" invgrep -R --include="*.py*" -E "from pandas.util import testing as tm" pandas/tests RET=$(($RET + $?)) ; echo $MSG "DONE" @@ -195,6 +208,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include="*.py" --include="*.pyx" -E 'class.*:\n\n( )+"""' . RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of {foo!r} instead of {repr(foo)}' ; echo $MSG + invgrep -R --include=*.{py,pyx} '!r}' pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of comment-based annotation syntax' ; echo $MSG invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" @@ -281,8 +298,11 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then -k"-from_arrays -from_breaks -from_intervals -from_tuples -set_closed -to_tuples -interval_range" RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Doctests arrays/string_.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/arrays/string_.py + MSG='Doctests arrays'; echo $MSG + pytest -q --doctest-modules \ + pandas/core/arrays/string_.py \ + pandas/core/arrays/integer.py \ + pandas/core/arrays/boolean.py RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests arrays/boolean.py' ; echo $MSG @@ -294,8 +314,8 @@ fi ### DOCSTRINGS ### if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then - MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA01, SA02, SA03, SA05)' ; echo $MSG - $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA01,SA02,SA03,SA05 + MSG='Validate docstrings (GL03, GL04, GL05, GL06, GL07, GL09, GL10, SS04, SS05, PR03, PR04, PR05, PR10, EX04, RT01, RT04, RT05, SA02, SA03, SA05)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=GL03,GL04,GL05,GL06,GL07,GL09,GL10,SS04,SS05,PR03,PR04,PR05,PR10,EX04,RT01,RT04,RT05,SA02,SA03,SA05 RET=$(($RET + $?)) ; echo $MSG "DONE" fi diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 4f4c4524cb4dd..810554632a507 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -9,6 +9,7 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 + - pytest-asyncio - hypothesis>=3.58.0 - pytest-azurepipelines @@ -26,7 +27,7 @@ dependencies: - openpyxl # lowest supported version of 
pyarrow (putting it here instead of in # azure-36-minimum_versions because it needs numpy >= 1.14) - - pyarrow=0.12 + - pyarrow=0.13 - pytables - python-dateutil - pytz diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 2bb2b00319382..48ac50c001715 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -13,7 +13,7 @@ dependencies: - pytest-azurepipelines # pandas dependencies - - beautifulsoup4==4.6.0 + - beautifulsoup4=4.6.0 - bottleneck=1.2.* - lxml - matplotlib=2.2.2 diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-36-minimum_versions.yaml index 8bf4f70d18aec..de7e011d9c7ca 100644 --- a/ci/deps/azure-36-minimum_versions.yaml +++ b/ci/deps/azure-36-minimum_versions.yaml @@ -17,6 +17,7 @@ dependencies: - beautifulsoup4=4.6.0 - bottleneck=1.2.1 - jinja2=2.8 + - numba=0.46.0 - numexpr=2.6.2 - numpy=1.13.3 - openpyxl=2.5.7 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index a10fa0904a451..111ba6b020bc7 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -8,6 +8,7 @@ dependencies: - cython>=0.29.13 - pytest>=5.0.1 - pytest-xdist>=1.21 + - pytest-asyncio - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml index f393ed84ecf63..3bbbdb4cf32ad 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-36.yaml @@ -22,7 +22,7 @@ dependencies: - numexpr - numpy=1.14 - openpyxl - - pyarrow>=0.12.0 + - pyarrow>=0.13.0 - pytables - python-dateutil==2.6.1 - pytz diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 2bd11c9030325..663c55492e69e 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -17,11 +17,12 @@ dependencies: - bottleneck - fastparquet>=0.3.2 - matplotlib=3.0.2 + - numba - numexpr - numpy=1.15.* - openpyxl - jinja2 - - pyarrow>=0.12.0 + - pyarrow>=0.13.0 - pytables - python-dateutil - pytz diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 928896efd5fc4..62be1075b3337 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -24,6 +24,7 @@ dependencies: - numexpr - numpy=1.14.* - openpyxl + - pyarrow=0.14 - pytables - python-dateutil - pytz diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index c1403f8eb8409..a46001c58d165 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -30,10 +30,8 @@ dependencies: - openpyxl<=3.0.1 # https://github.com/pandas-dev/pandas/pull/30009 openpyxl 3.0.2 broke - pandas-gbq - # https://github.com/pydata/pandas-gbq/issues/271 - - google-cloud-bigquery<=1.11 - psycopg2 - - pyarrow>=0.12.0 + - pyarrow>=0.13.0 - pymysql - pytables - python-snappy diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 6826a9d072ff3..73e2c20b31438 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -20,6 +20,7 @@ dependencies: - pyarrow - pytz - s3fs + - tabulate - pyreadstat - pip - pip: diff --git a/ci/deps/travis-38.yaml b/ci/deps/travis-38.yaml index 828f02596a70e..a627b7edc175f 100644 --- a/ci/deps/travis-38.yaml +++ b/ci/deps/travis-38.yaml @@ -17,3 +17,4 @@ dependencies: - nomkl - pytz - pip + - tabulate==0.8.3 diff --git a/ci/print_skipped.py b/ci/print_skipped.py index 72822fa2d3c7f..60e2f047235e6 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import os import xml.etree.ElementTree as et 
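Note: several of the environment files above gain a ``numba`` entry (for example ``azure-36-minimum_versions.yaml`` pins ``numba=0.46.0``). These pins back the ``engine`` keyword exercised by the new ``Engine`` rolling/expanding benchmark earlier in this diff. A minimal usage sketch, assuming numba is installed (the data and window size here are arbitrary):

.. code-block:: python

    # Sketch of the rolling/expanding "engine" option that the numba pins support.
    import numpy as np
    import pandas as pd

    s = pd.Series(np.random.random(1000))

    # Default Cython-backed apply
    cython_res = s.rolling(10).apply(np.sum, raw=True, engine="cython")

    # Numba-backed apply; raw=True is required and the function must be
    # compilable by numba (plain NumPy operations work fine).
    numba_res = s.rolling(10).apply(lambda x: np.sum(x) + 5, raw=True, engine="numba")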
diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 0b68164e5767e..0cb1f4aabf352 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -5,17 +5,6 @@ # https://github.com/pytest-dev/pytest/issues/1075 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') -if [ -n "$LOCALE_OVERRIDE" ]; then - export LC_ALL="$LOCALE_OVERRIDE" - export LANG="$LOCALE_OVERRIDE" - PANDAS_LOCALE=`python -c 'import pandas; pandas.get_option("display.encoding")'` - if [[ "$LOCALE_OVERRIDE" != "$PANDAS_LOCALE" ]]; then - echo "pandas could not detect the locale. System locale: $LOCALE_OVERRIDE, pandas detected: $PANDAS_LOCALE" - # TODO Not really aborting the tests until https://github.com/pandas-dev/pandas/issues/23923 is fixed - # exit 1 - fi -fi - if [[ "not network" == *"$PATTERN"* ]]; then export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; fi @@ -25,14 +14,14 @@ if [ "$COVERAGE" ]; then COVERAGE="-s --cov=pandas --cov-report=xml:$COVERAGE_FNAME" fi -PYTEST_CMD="pytest -m \"$PATTERN\" -n auto --dist=loadfile -s --strict --durations=10 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas" - -# Travis does not have have an X server -if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then - DISPLAY=DISPLAY=:99.0 - PYTEST_CMD="xvfb-run -e /dev/stdout $PYTEST_CMD" +# If no X server is found, we use xvfb to emulate it +if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then + export DISPLAY=":0" + XVFB="xvfb-run " fi +PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n auto --dist=loadfile -s --strict --durations=10 --junitxml=test-data.xml $TEST_ARGS $COVERAGE pandas" + echo $PYTEST_CMD sh -c "$PYTEST_CMD" diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 08ba83ae94451..e5bee09fe2f79 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -1,15 +1,15 @@ #!/bin/bash -e # edit the locale file if needed -if [ -n "$LOCALE_OVERRIDE" ]; then +if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then echo "Adding locale to the first line of pandas/__init__.py" rm -f pandas/__init__.pyc - SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" + SEDC="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LC_ALL')\n" sed -i "$SEDC" pandas/__init__.py + echo "[head -4 pandas/__init__.py]" head -4 pandas/__init__.py echo - sudo locale-gen "$LOCALE_OVERRIDE" fi MINICONDA_DIR="$HOME/miniconda3" @@ -114,6 +114,11 @@ echo "remove postgres if has been installed with conda" echo "we use the one from the CI" conda remove postgresql -y --force || true +echo +echo "remove qt" +echo "causes problems with the clipboard, we use xsel for that" +conda remove qt -y --force || true + echo echo "conda list pandas" conda list pandas @@ -121,7 +126,7 @@ conda list pandas # Make sure any error below is reported as such echo "[Build extensions]" -python setup.py build_ext -q -i +python setup.py build_ext -q -i -j2 # XXX: Some of our environments end up with old versions of pip (10.x) # Adding a new enough version of pip to the requirements explodes the diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index f92090fecccf3..47f63c11d0567 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -36,5 +36,5 @@ test: about: - home: http://pandas.pydata.org + home: https://pandas.pydata.org license: BSD diff --git a/doc/make.py b/doc/make.py index cf73f44b5dd02..024a748cd28ca 100755 --- a/doc/make.py +++ b/doc/make.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Python script for building documentation. 
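Note: the ``sed`` command that ``ci/setup_env.sh`` now runs when ``LC_ALL`` is exported injects a locale override at line 3 of ``pandas/__init__.py``, so the locale is applied before the rest of the package is imported. A sketch of what the injected lines look like for one of the locale jobs (``it_IT.utf8`` is used here as an example; the Chinese-locale jobs use ``zh_CN.utf8``):

.. code-block:: python

    # Lines injected near the top of pandas/__init__.py by ci/setup_env.sh.
    # The locale value comes from the job's LC_ALL variable; it_IT.utf8 is an example.
    import locale

    locale.setlocale(locale.LC_ALL, "it_IT.utf8")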
diff --git a/doc/source/conf.py b/doc/source/conf.py index 481c03ab8f388..7f24d02a496e1 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -10,6 +10,7 @@ # All configuration values have a default; values that are commented out # serve to show the default. +from datetime import datetime import importlib import inspect import logging @@ -137,7 +138,7 @@ # General information about the project. project = "pandas" -copyright = "2008-2014, the pandas development team" +copyright = f"2008-{datetime.now().year}, the pandas development team" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst new file mode 100644 index 0000000000000..a295038b5a0bd --- /dev/null +++ b/doc/source/development/code_style.rst @@ -0,0 +1,155 @@ +.. _code_style: + +{{ header }} + +======================= +pandas code style guide +======================= + +.. contents:: Table of contents: + :local: + +Patterns +======== + +foo.__class__ +------------- + +*pandas* uses 'type(foo)' instead of 'foo.__class__' as it makes the code more +readable. + +For example: + +**Good:** + +.. code-block:: python + + foo = "bar" + type(foo) + +**Bad:** + +.. code-block:: python + + foo = "bar" + foo.__class__ + + +String formatting +================= + +Concatenated strings +-------------------- + +f-strings +~~~~~~~~~ + +*pandas* uses f-string formatting instead of '%' and '.format()' string formatters. + +The convention when using f-strings on a string that is concatenated over several lines +is to prefix only the lines containing the values that need to be interpreted. + +For example: + +**Good:** + +.. code-block:: python + + foo = "old_function" + bar = "new_function" + + my_warning_message = ( + f"Warning, {foo} is deprecated, " + "please use the new and way better " + f"{bar}" + ) + +**Bad:** + +.. code-block:: python + + foo = "old_function" + bar = "new_function" + + my_warning_message = ( + f"Warning, {foo} is deprecated, " + f"please use the new and way better " + f"{bar}" + ) + +White spaces +~~~~~~~~~~~~ + +Put the white space only at the end of the previous line, so that +there is no whitespace at the beginning of the concatenated string. + +For example: + +**Good:** + +.. code-block:: python + + example_string = ( + "Some long concatenated string, " + "with good placement of the " + "whitespaces" + ) + +**Bad:** + +.. code-block:: python + + example_string = ( + "Some long concatenated string," + " with bad placement of the" + " whitespaces" + ) + +Representation function (aka 'repr()') +-------------------------------------- + +*pandas* uses 'repr()' instead of '%r' and '!r'. + +The use of 'repr()' should only happen when the value is not an obvious string. + +For example: + +**Good:** + +.. code-block:: python + + value = str + f"Unknown received value, got: {repr(value)}" + +**Good:** + +.. code-block:: python + + value = str + f"Unknown received type, got: '{type(value).__name__}'" + + +Imports (aim for absolute) +========================== + +In Python 3, absolute imports are recommended. With an absolute import, something +like ``import string`` will import the string module rather than ``string.py`` +in the same directory. As much as possible, you should try to write out +absolute imports that show the whole import chain from toplevel pandas. + +Explicit relative imports are also supported in Python 3.
But it is not +recommended to use it. Implicit relative imports should never be used +and is removed in Python 3. + +For example: + +:: + + # preferred + import pandas.core.common as com + + # not preferred + from .common import test_base + + # wrong + from common import test_base diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index d7b3e159f8ce7..b650b2a2cf1fe 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -146,6 +146,17 @@ requires a C compiler and Python environment. If you're making documentation changes, you can skip to :ref:`contributing.documentation` but you won't be able to build the documentation locally before pushing your changes. +Using a Docker Container +~~~~~~~~~~~~~~~~~~~~~~~~ + +Instead of manually setting up a development environment, you can use Docker to +automatically create the environment with just several commands. Pandas provides a `DockerFile` +in the root directory to build a Docker image with a full pandas development environment. + +Even easier, you can use the DockerFile to launch a remote session with Visual Studio Code, +a popular free IDE, using the `.devcontainer.json` file. +See https://code.visualstudio.com/docs/remote/containers for details. + .. _contributing.dev_c: Installing a C compiler @@ -354,9 +365,9 @@ About the *pandas* documentation -------------------------------- The documentation is written in **reStructuredText**, which is almost like writing -in plain English, and built using `Sphinx `__. The +in plain English, and built using `Sphinx `__. The Sphinx Documentation has an excellent `introduction to reST -`__. Review the Sphinx docs to perform more +`__. Review the Sphinx docs to perform more complex changes to the documentation as well. Some other important things to know about the docs: @@ -434,7 +445,7 @@ The utility script ``scripts/validate_docstrings.py`` can be used to get a csv summary of the API documentation. And also validate common errors in the docstring of a specific class, function or method. The summary also compares the list of methods documented in ``doc/source/api.rst`` (which is used to generate -the `API Reference `_ page) +the `API Reference `_ page) and the actual public methods. This will identify methods documented in ``doc/source/api.rst`` that are not actually class methods, and existing methods that are not documented in ``doc/source/api.rst``. @@ -569,8 +580,7 @@ do not make sudden changes to the code that could have the potential to break a lot of user code as a result, that is, we need it to be as *backwards compatible* as possible to avoid mass breakages. -Additional standards are outlined on the `code style wiki -page `_. +Additional standards are outlined on the `pandas code style guide `_ Optional dependencies --------------------- @@ -636,6 +646,8 @@ many errors as possible, but it may not correct *all* of them. Thus, it is recommended that you run ``cpplint`` to double check and make any other style fixes manually. +.. _contributing.code-formatting: + Python (PEP8 / black) ~~~~~~~~~~~~~~~~~~~~~ @@ -657,19 +669,8 @@ apply ``black`` as you edit files. You should use a ``black`` version >= 19.10b0 as previous versions are not compatible with the pandas codebase. -Optionally, you may wish to setup `pre-commit hooks `_ -to automatically run ``black`` and ``flake8`` when you make a git commit. 
This -can be done by installing ``pre-commit``:: - - pip install pre-commit - -and then running:: - - pre-commit install - -from the root of the pandas repository. Now ``black`` and ``flake8`` will be run -each time you commit changes. You can skip these checks with -``git commit --no-verify``. +If you wish to run these checks automatically, we encourage you to use +:ref:`pre-commits ` instead. One caveat about ``git diff upstream/master -u -- "*.py" | flake8 --diff``: this command will catch any stylistic errors in your changes specifically, but @@ -677,7 +678,7 @@ be beware it may not catch all of them. For example, if you delete the only usage of an imported function, it is stylistically incorrect to import an unused function. However, style-checking the diff will not catch this because the actual import is not part of the diff. Thus, for completeness, you should -run this command, though it will take longer:: +run this command, though it may take longer:: git diff upstream/master --name-only -- "*.py" | xargs -r flake8 @@ -695,6 +696,8 @@ behaviour as follows:: This will get all the files being changed by the PR (and ending with ``.py``), and run ``flake8`` on them, one after the other. +Note that these commands can be run analogously with ``black``. + .. _contributing.import-formatting: Import formatting @@ -717,7 +720,6 @@ A summary of our current import sections ( in order ): Imports are alphabetically sorted within these sections. - As part of :ref:`Continuous Integration ` checks we run:: isort --recursive --check-only pandas @@ -741,8 +743,37 @@ to automatically format imports correctly. This will modify your local copy of t The `--recursive` flag can be passed to sort all files in a directory. +Alternatively, you can run a command similar to what was suggested for ``black`` and ``flake8`` :ref:`right above `:: + + git diff upstream/master --name-only -- "*.py" | xargs -r isort + +Where similar caveats apply if you are on OSX or Windows. + You can then verify the changes look ok, then git :ref:`commit ` and :ref:`push `. +.. _contributing.pre-commit: + +Pre-Commit +~~~~~~~~~~ + +You can run many of these styling checks manually as we have described above. However, +we encourage you to use `pre-commit hooks `_ instead +to automatically run ``black``, ``flake8``, ``isort`` when you make a git commit. This +can be done by installing ``pre-commit``:: + + pip install pre-commit + +and then running:: + + pre-commit install + +from the root of the pandas repository. Now all of the styling checks will be +run each time you commit changes without your needing to run each one manually. +In addition, using this pre-commit hook will also allow you to more easily +remain up-to-date with our code checks as they change. + +Note that if needed, you can skip these checks with ``git commit --no-verify``. + Backwards compatibility ~~~~~~~~~~~~~~~~~~~~~~~ @@ -957,7 +988,7 @@ inspiration. If your test requires working with files or network connectivity, there is more information on the `testing page `_ of the wiki. -The ``pandas.util.testing`` module has many special ``assert`` functions that +The ``pandas._testing`` module has many special ``assert`` functions that make it easier to make statements about whether Series or DataFrame objects are equivalent. The easiest way to verify that your code is correct is to explicitly construct the result you expect, then compare the actual result to @@ -1143,7 +1174,7 @@ If your change involves checking that a warning is actually emitted, use .. 
code-block:: python - import pandas.util.testing as tm + import pandas._testing as tm df = pd.DataFrame() @@ -1364,6 +1395,7 @@ some common prefixes along with general guidelines for when to use them: * TST: Additions/updates to tests * BLD: Updates to the build process/scripts * PERF: Performance improvement +* TYP: Type annotations * CLN: Code cleanup The following defines how a commit message should be structured. Please reference the @@ -1504,3 +1536,19 @@ The branch will still exist on GitHub, so to delete it there do:: git push origin --delete shiny-new-feature .. _Gitter: https://gitter.im/pydata/pandas + + +Tips for a successful Pull Request +================================== + +If you have made it to the `Review your code`_ phase, one of the core contributors may +take a look. Please note however that a handful of people are responsible for reviewing +all of the contributions, which can often lead to bottlenecks. + +To improve the chances of your pull request being reviewed, you should: + +- **Reference an open issue** for non-trivial changes to clarify the PR's purpose +- **Ensure you have appropriate tests**. These should be the first part of any PR +- **Keep your pull requests as simple as possible**. Larger PRs take longer to review +- **Ensure that CI is in a green state**. Reviewers may not even look otherwise +- **Keep** `Updating your pull request`_, either by request or every few days diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 34bc5f44eb0c0..cb32f0e1ee475 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -22,39 +22,39 @@ Next example gives an idea on how a docstring looks like: .. code-block:: python def add(num1, num2): - """ - Add up two integer numbers. - - This function simply wraps the `+` operator, and does not - do anything interesting, except for illustrating what is - the docstring of a very simple function. - - Parameters - ---------- - num1 : int - First number to add - num2 : int - Second number to add - - Returns - ------- - int - The sum of `num1` and `num2` - - See Also - -------- - subtract : Subtract one integer from another - - Examples - -------- - >>> add(2, 2) - 4 - >>> add(25, 0) - 25 - >>> add(10, -10) - 0 - """ - return num1 + num2 + """ + Add up two integer numbers. + + This function simply wraps the `+` operator, and does not + do anything interesting, except for illustrating what is + the docstring of a very simple function. + + Parameters + ---------- + num1 : int + First number to add + num2 : int + Second number to add + + Returns + ------- + int + The sum of `num1` and `num2` + + See Also + -------- + subtract : Subtract one integer from another + + Examples + -------- + >>> add(2, 2) + 4 + >>> add(25, 0) + 25 + >>> add(10, -10) + 0 + """ + return num1 + num2 Some standards exist about docstrings, so they are easier to read, and they can be exported to other formats such as html or pdf. @@ -399,7 +399,7 @@ DataFrame: * DataFrame * pandas.Index * pandas.Categorical -* pandas.SparseArray +* pandas.arrays.SparseArray If the exact type is not relevant, but must be compatible with a numpy array, array-like can be specified. 
If Any type that can be iterated is diff --git a/doc/source/development/index.rst b/doc/source/development/index.rst index 757b197c717e6..f8a6bb6deb52d 100644 --- a/doc/source/development/index.rst +++ b/doc/source/development/index.rst @@ -13,6 +13,7 @@ Development :maxdepth: 2 contributing + code_style maintaining internals extending diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst index 00598830e2fe9..fafe63d80249c 100644 --- a/doc/source/development/roadmap.rst +++ b/doc/source/development/roadmap.rst @@ -129,20 +129,6 @@ Some specific goals include * Improve the overall organization of the documentation and specific subsections of the documentation to make navigation and finding content easier. -Package docstring validation ----------------------------- - -To improve the quality and consistency of pandas docstrings, we've developed -tooling to check docstrings in a variety of ways. -https://github.com/pandas-dev/pandas/blob/master/scripts/validate_docstrings.py -contains the checks. - -Like many other projects, pandas uses the -`numpydoc `__ style for writing -docstrings. With the collaboration of the numpydoc maintainers, we'd like to -move the checks to a package other than pandas so that other projects can easily -use them as well. - Performance monitoring ---------------------- diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 48c722bc16a86..90f839897ce4b 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -41,6 +41,16 @@ Pyjanitor provides a clean API for cleaning data, using method chaining. Engarde is a lightweight library used to explicitly state assumptions about your datasets and check that they're *actually* true. +`pandas-path `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since Python 3.4, `pathlib `_ has been +included in the Python standard library. Path objects provide a simple +and delightful way to interact with the file system. The pandas-path package enables the +Path API for pandas through a custom accessor ``.path``. Getting just the filenames from +a series of full file paths is as simple as ``my_files.path.name``. Other convenient operations like +joining paths, replacing file extensions, and checking if files exist are also available. + .. _ecosystem.stats: Statistics and machine learning @@ -112,16 +122,14 @@ also goes beyond matplotlib and pandas with the option to perform statistical estimation while plotting, aggregating across observations and visualizing the fit of statistical models to emphasize patterns in a dataset. -`yhat/ggpy `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +`plotnine `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Hadley Wickham's `ggplot2 `__ is a foundational exploratory visualization package for the R language. Based on `"The Grammar of Graphics" `__ it provides a powerful, declarative and extremely general way to generate bespoke plots of any kind of data. -It's really quite incredible. Various implementations to other languages are available, -but a faithful implementation for Python users has long been missing. Although still young -(as of Jan-2014), the `yhat/ggpy `__ project has been -progressing quickly in that direction. +Various implementations to other languages are available. +A good implementation for Python users is `has2k1/plotnine `__. `IPython Vega `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -244,8 +252,8 @@ Pandas DataFrames with timeseries indexes. 
`pydatastream `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PyDatastream is a Python interface to the -`Thomson Dataworks Enterprise (DWE/Datastream) `__ -SOAP API to return indexed Pandas DataFrames with financial data. +`Refinitiv Datastream (DWS) `__ +REST API to return indexed Pandas DataFrames with financial data. This package requires valid credentials for this API (non free). `pandaSDMX `__ @@ -327,6 +335,21 @@ PyTables, h5py, and pymongo to move data between non pandas formats. Its graph based approach is also extensible by end users for custom formats that may be too specific for the core of odo. +`Pandarallel `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pandarallel provides a simple way to parallelize your pandas operations on all your CPUs by changing only one line of code. +If also displays progress bars. + +.. code:: python + + from pandarallel import pandarallel + + pandarallel.initialize(progress_bar=True) + + # df.apply(func) + df.parallel_apply(func) + `Ray `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -371,13 +394,16 @@ A directory of projects providing :ref:`extension accessors `. This is for users to discover new accessors and for library authors to coordinate on the namespace. -============== ========== ========================= -Library Accessor Classes -============== ========== ========================= -`cyberpandas`_ ``ip`` ``Series`` -`pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` -============== ========== ========================= +=============== ========== ========================= =============================================================== +Library Accessor Classes Description +=============== ========== ========================= =============================================================== +`cyberpandas`_ ``ip`` ``Series`` Provides common operations for working with IP addresses. +`pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library. +`pandas_path`_ ``path`` ``Index``, ``Series`` Provides `pathlib.Path`_ functions for Series. +=============== ========== ========================= =============================================================== .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest .. _pdvega: https://altair-viz.github.io/pdvega/ - +.. _Altair: https://altair-viz.github.io/ +.. _pandas_path: https://github.com/drivendataorg/pandas-path/ +.. _pathlib.Path: https://docs.python.org/3/library/pathlib.html \ No newline at end of file diff --git a/doc/source/getting_started/10min.rst b/doc/source/getting_started/10min.rst index 66e500131b316..3055a22129b91 100644 --- a/doc/source/getting_started/10min.rst +++ b/doc/source/getting_started/10min.rst @@ -697,8 +697,9 @@ Plotting See the :ref:`Plotting ` docs. +We use the standard convention for referencing the matplotlib API: + .. ipython:: python - :suppress: import matplotlib.pyplot as plt plt.close('all') diff --git a/doc/source/getting_started/basics.rst b/doc/source/getting_started/basics.rst index d489d35dc1226..4fef5efbd1551 100644 --- a/doc/source/getting_started/basics.rst +++ b/doc/source/getting_started/basics.rst @@ -1937,21 +1937,36 @@ See :ref:`extending.extension-types` for how to write your own extension that works with pandas. See :ref:`ecosystem.extensions` for a list of third-party libraries that have implemented an extension. -The following table lists all of pandas extension types. 
See the respective +The following table lists all of pandas extension types. For methods requiring ``dtype`` +arguments, strings can be specified as indicated. See the respective documentation sections for more on each type. -=================== ========================= ================== ============================= ============================= -Kind of Data Data Type Scalar Array Documentation -=================== ========================= ================== ============================= ============================= -tz-aware datetime :class:`DatetimeTZDtype` :class:`Timestamp` :class:`arrays.DatetimeArray` :ref:`timeseries.timezone` -Categorical :class:`CategoricalDtype` (none) :class:`Categorical` :ref:`categorical` -period (time spans) :class:`PeriodDtype` :class:`Period` :class:`arrays.PeriodArray` :ref:`timeseries.periods` -sparse :class:`SparseDtype` (none) :class:`arrays.SparseArray` :ref:`sparse` -intervals :class:`IntervalDtype` :class:`Interval` :class:`arrays.IntervalArray` :ref:`advanced.intervalindex` -nullable integer :class:`Int64Dtype`, ... (none) :class:`arrays.IntegerArray` :ref:`integer_na` -Strings :class:`StringDtype` :class:`str` :class:`arrays.StringArray` :ref:`text` -Boolean (with NA) :class:`BooleanDtype` :class:`bool` :class:`arrays.BooleanArray` :ref:`api.arrays.bool` -=================== ========================= ================== ============================= ============================= ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| Kind of Data | Data Type | Scalar | Array | String Aliases | Documentation | ++===================+===========================+====================+===============================+=========================================+===============================+ +| tz-aware datetime | :class:`DatetimeTZDtype` | :class:`Timestamp` | :class:`arrays.DatetimeArray` | ``'datetime64[ns, ]'`` | :ref:`timeseries.timezone` | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| Categorical | :class:`CategoricalDtype` | (none) | :class:`Categorical` | ``'category'`` | :ref:`categorical` | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| period | :class:`PeriodDtype` | :class:`Period` | :class:`arrays.PeriodArray` | ``'period[]'``, | :ref:`timeseries.periods` | +| (time spans) | | | | ``'Period[]'`` | | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| sparse | :class:`SparseDtype` | (none) | :class:`arrays.SparseArray` | ``'Sparse'``, ``'Sparse[int]'``, | :ref:`sparse` | +| | | | | ``'Sparse[float]'`` | | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| intervals | :class:`IntervalDtype` | :class:`Interval` | :class:`arrays.IntervalArray` | ``'interval'``, ``'Interval'``, | :ref:`advanced.intervalindex` | +| | | | | ``'Interval[]'``, | | +| | | | | ``'Interval[datetime64[ns, ]]'``, | | +| | | | | ``'Interval[timedelta64[]]'`` | | 
++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| nullable integer + :class:`Int64Dtype`, ... | (none) | :class:`arrays.IntegerArray` | ``'Int8'``, ``'Int16'``, ``'Int32'``, | :ref:`integer_na` | +| | | | | ``'Int64'``, ``'UInt8'``, ``'UInt16'``, | | +| | | | | ``'UInt32'``, ``'UInt64'`` | | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| Strings | :class:`StringDtype` | :class:`str` | :class:`arrays.StringArray` | ``'string'`` | :ref:`text` | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ +| Boolean (with NA) | :class:`BooleanDtype` | :class:`bool` | :class:`arrays.BooleanArray` | ``'boolean'`` | :ref:`api.arrays.bool` | ++-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ Pandas has two ways to store strings. diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index 69bb700c97b15..4e284fe7b5968 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -629,7 +629,7 @@ for more details and examples. .. ipython:: python - tips_summed = tips.groupby(['sex', 'smoker'])['total_bill', 'tip'].sum() + tips_summed = tips.groupby(['sex', 'smoker'])[['total_bill', 'tip']].sum() tips_summed.head() diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index db687386329bb..fec6bae1e0330 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -617,7 +617,7 @@ for more details and examples. .. ipython:: python - tips_summed = tips.groupby(['sex', 'smoker'])['total_bill', 'tip'].sum() + tips_summed = tips.groupby(['sex', 'smoker'])[['total_bill', 'tip']].sum() tips_summed.head() diff --git a/doc/source/getting_started/dsintro.rst b/doc/source/getting_started/dsintro.rst index a07fcbd8b67c4..81a2f0ae7d162 100644 --- a/doc/source/getting_started/dsintro.rst +++ b/doc/source/getting_started/dsintro.rst @@ -136,7 +136,7 @@ Like a NumPy array, a pandas Series has a :attr:`~Series.dtype`. This is often a NumPy dtype. However, pandas and 3rd-party libraries extend NumPy's type system in a few places, in which case the dtype would -be a :class:`~pandas.api.extensions.ExtensionDtype`. Some examples within +be an :class:`~pandas.api.extensions.ExtensionDtype`. Some examples within pandas are :ref:`categorical` and :ref:`integer_na`. See :ref:`basics.dtypes` for more. @@ -676,11 +676,11 @@ similar to an ndarray: # only show the first 5 rows df[:5].T +.. _dsintro.numpy_interop: + DataFrame interoperability with NumPy functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _dsintro.numpy_interop: - Elementwise NumPy ufuncs (log, exp, sqrt, ...) and various other NumPy functions can be used with no issues on Series and DataFrame, assuming the data within are numeric: @@ -741,7 +741,7 @@ implementation takes precedence and a Series is returned. 
np.maximum(ser, idx) NumPy ufuncs are safe to apply to :class:`Series` backed by non-ndarray arrays, -for example :class:`SparseArray` (see :ref:`sparse.calculation`). If possible, +for example :class:`arrays.SparseArray` (see :ref:`sparse.calculation`). If possible, the ufunc is applied without converting the underlying data to an ndarray. Console display diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 62a39fb5176f9..b3fd443e662a9 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -234,7 +234,8 @@ Optional dependencies ~~~~~~~~~~~~~~~~~~~~~ Pandas has many optional dependencies that are only used for specific methods. -For example, :func:`pandas.read_hdf` requires the ``pytables`` package. If the +For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while +:meth:`DataFrame.to_markdown` requires the ``tabulate`` package. If the optional dependency is not installed, pandas will raise an ``ImportError`` when the method requiring that dependency is called. @@ -255,6 +256,7 @@ gcsfs 0.2.2 Google Cloud Storage access html5lib HTML parser for read_html (see :ref:`note `) lxml 3.8.0 HTML parser for read_html (see :ref:`note `) matplotlib 2.2.2 Visualization +numba 0.46.0 Alternative execution engine for rolling operations openpyxl 2.5.7 Reading / writing for xlsx files pandas-gbq 0.8.0 Google Big Query access psycopg2 PostgreSQL engine for sqlalchemy @@ -264,6 +266,7 @@ pyreadstat SPSS files (.sav) reading pytables 3.4.2 HDF5 reading / writing qtpy Clipboard I/O s3fs 0.3.0 Amazon S3 access +tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_) xarray 0.8.2 pandas-like API for N-dimensional data xclip Clipboard I/O on linux xlrd 1.1.0 Excel reading @@ -301,3 +304,4 @@ top-level :func:`~pandas.read_html` function: .. _html5lib: https://github.com/html5lib/html5lib-python .. _BeautifulSoup4: http://www.crummy.com/software/BeautifulSoup .. _lxml: http://lxml.de +.. _tabulate: https://github.com/astanin/python-tabulate diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index 212f3636d0a98..1ed0e8f635b58 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -15,7 +15,7 @@ pandas' own :ref:`10 Minutes to pandas<10min>`. More complex recipes are in the :ref:`Cookbook`. -A handy pandas `cheat sheet `_. +A handy pandas `cheat sheet `_. Community guides ================ diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index 9cea68530fbe7..4ced92cbda81a 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -39,7 +39,7 @@ See the :ref:`overview` for more detail about what's in the library. :hidden: {% endif %} {% if not single_doc %} - What's New in 1.0.0 + What's New in 1.1.0 getting_started/index user_guide/index {% endif -%} @@ -51,7 +51,7 @@ See the :ref:`overview` for more detail about what's in the library. whatsnew/index {% endif %} -* :doc:`whatsnew/v1.0.0` +* :doc:`whatsnew/v1.1.0` * :doc:`getting_started/index` * :doc:`getting_started/install` @@ -109,6 +109,7 @@ See the :ref:`overview` for more detail about what's in the library. 
* :doc:`development/index` * :doc:`development/contributing` + * :doc:`development/code_style` * :doc:`development/internals` * :doc:`development/extending` * :doc:`development/developer` diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index cf14d28772f4c..c71350ecd73b3 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -12,7 +12,8 @@ For most data types, pandas uses NumPy arrays as the concrete objects contained with a :class:`Index`, :class:`Series`, or :class:`DataFrame`. -For some data types, pandas extends NumPy's type system. +For some data types, pandas extends NumPy's type system. String aliases for these types +can be found at :ref:`basics.dtypes`. =================== ========================= ================== ============================= Kind of Data Pandas Data Type Scalar Array @@ -443,13 +444,13 @@ Sparse data ----------- Data where a single value is repeated many times (e.g. ``0`` or ``NaN``) may -be stored efficiently as a :class:`SparseArray`. +be stored efficiently as a :class:`arrays.SparseArray`. .. autosummary:: :toctree: api/ :template: autosummary/class_without_autosummary.rst - SparseArray + arrays.SparseArray .. autosummary:: :toctree: api/ diff --git a/doc/source/reference/extensions.rst b/doc/source/reference/extensions.rst index 4b1a99da7cd4c..c072237850d82 100644 --- a/doc/source/reference/extensions.rst +++ b/doc/source/reference/extensions.rst @@ -59,3 +59,16 @@ objects. api.extensions.ExtensionArray.nbytes api.extensions.ExtensionArray.ndim api.extensions.ExtensionArray.shape + +Additionally, we have some utility methods for ensuring your object +behaves correctly. + +.. autosummary:: + :toctree: api/ + + api.indexers.check_bool_array_indexer + + +The sentinel ``pandas.api.extensions.no_default`` is used as the default +value in some methods. Use an ``is`` comparison to check if the user +provides a non-default value. diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 815f3f9c19d49..01aa6c60e3b2f 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -273,6 +273,8 @@ Metadata :attr:`DataFrame.attrs` is a dictionary for storing global metadata for this DataFrame. +.. warning:: ``DataFrame.attrs`` is considered experimental and may change without warning. + .. autosummary:: :toctree: api/ @@ -361,4 +363,5 @@ Serialization / IO / conversion DataFrame.to_records DataFrame.to_string DataFrame.to_clipboard + DataFrame.to_markdown DataFrame.style diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index 0961acc43f301..0d9e0b0f4c668 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -18,6 +18,8 @@ Working with options set_option option_context +.. _api.general.testing: + Testing functions ----------------- .. 
autosummary:: @@ -26,6 +28,7 @@ Testing functions testing.assert_frame_equal testing.assert_series_equal testing.assert_index_equal + testing.assert_extension_array_equal Exceptions and warnings ----------------------- diff --git a/doc/source/reference/offset_frequency.rst b/doc/source/reference/offset_frequency.rst index 4a58055f1c955..fc1c6d6bd6d47 100644 --- a/doc/source/reference/offset_frequency.rst +++ b/doc/source/reference/offset_frequency.rst @@ -35,6 +35,8 @@ Methods DateOffset.copy DateOffset.isAnchored DateOffset.onOffset + DateOffset.is_anchored + DateOffset.is_on_offset BusinessDay ----------- @@ -65,6 +67,8 @@ Methods BusinessDay.copy BusinessDay.isAnchored BusinessDay.onOffset + BusinessDay.is_anchored + BusinessDay.is_on_offset BusinessHour ------------ @@ -94,6 +98,8 @@ Methods BusinessHour.copy BusinessHour.isAnchored BusinessHour.onOffset + BusinessHour.is_anchored + BusinessHour.is_on_offset CustomBusinessDay ----------------- @@ -123,6 +129,8 @@ Methods CustomBusinessDay.copy CustomBusinessDay.isAnchored CustomBusinessDay.onOffset + CustomBusinessDay.is_anchored + CustomBusinessDay.is_on_offset CustomBusinessHour ------------------ @@ -152,6 +160,8 @@ Methods CustomBusinessHour.copy CustomBusinessHour.isAnchored CustomBusinessHour.onOffset + CustomBusinessHour.is_anchored + CustomBusinessHour.is_on_offset MonthOffset ----------- @@ -182,6 +192,8 @@ Methods MonthOffset.copy MonthOffset.isAnchored MonthOffset.onOffset + MonthOffset.is_anchored + MonthOffset.is_on_offset MonthEnd -------- @@ -212,6 +224,8 @@ Methods MonthEnd.copy MonthEnd.isAnchored MonthEnd.onOffset + MonthEnd.is_anchored + MonthEnd.is_on_offset MonthBegin ---------- @@ -242,6 +256,8 @@ Methods MonthBegin.copy MonthBegin.isAnchored MonthBegin.onOffset + MonthBegin.is_anchored + MonthBegin.is_on_offset BusinessMonthEnd ---------------- @@ -272,6 +288,8 @@ Methods BusinessMonthEnd.copy BusinessMonthEnd.isAnchored BusinessMonthEnd.onOffset + BusinessMonthEnd.is_anchored + BusinessMonthEnd.is_on_offset BusinessMonthBegin ------------------ @@ -302,6 +320,8 @@ Methods BusinessMonthBegin.copy BusinessMonthBegin.isAnchored BusinessMonthBegin.onOffset + BusinessMonthBegin.is_anchored + BusinessMonthBegin.is_on_offset CustomBusinessMonthEnd ---------------------- @@ -332,6 +352,8 @@ Methods CustomBusinessMonthEnd.copy CustomBusinessMonthEnd.isAnchored CustomBusinessMonthEnd.onOffset + CustomBusinessMonthEnd.is_anchored + CustomBusinessMonthEnd.is_on_offset CustomBusinessMonthBegin ------------------------ @@ -362,6 +384,8 @@ Methods CustomBusinessMonthBegin.copy CustomBusinessMonthBegin.isAnchored CustomBusinessMonthBegin.onOffset + CustomBusinessMonthBegin.is_anchored + CustomBusinessMonthBegin.is_on_offset SemiMonthOffset --------------- @@ -392,6 +416,8 @@ Methods SemiMonthOffset.copy SemiMonthOffset.isAnchored SemiMonthOffset.onOffset + SemiMonthOffset.is_anchored + SemiMonthOffset.is_on_offset SemiMonthEnd ------------ @@ -422,6 +448,8 @@ Methods SemiMonthEnd.copy SemiMonthEnd.isAnchored SemiMonthEnd.onOffset + SemiMonthEnd.is_anchored + SemiMonthEnd.is_on_offset SemiMonthBegin -------------- @@ -452,6 +480,8 @@ Methods SemiMonthBegin.copy SemiMonthBegin.isAnchored SemiMonthBegin.onOffset + SemiMonthBegin.is_anchored + SemiMonthBegin.is_on_offset Week ---- @@ -482,6 +512,8 @@ Methods Week.copy Week.isAnchored Week.onOffset + Week.is_anchored + Week.is_on_offset WeekOfMonth ----------- @@ -511,6 +543,8 @@ Methods WeekOfMonth.copy WeekOfMonth.isAnchored WeekOfMonth.onOffset + WeekOfMonth.is_anchored + 
WeekOfMonth.is_on_offset LastWeekOfMonth --------------- @@ -540,6 +574,8 @@ Methods LastWeekOfMonth.copy LastWeekOfMonth.isAnchored LastWeekOfMonth.onOffset + LastWeekOfMonth.is_anchored + LastWeekOfMonth.is_on_offset QuarterOffset ------------- @@ -570,6 +606,8 @@ Methods QuarterOffset.copy QuarterOffset.isAnchored QuarterOffset.onOffset + QuarterOffset.is_anchored + QuarterOffset.is_on_offset BQuarterEnd ----------- @@ -600,6 +638,8 @@ Methods BQuarterEnd.copy BQuarterEnd.isAnchored BQuarterEnd.onOffset + BQuarterEnd.is_anchored + BQuarterEnd.is_on_offset BQuarterBegin ------------- @@ -630,6 +670,8 @@ Methods BQuarterBegin.copy BQuarterBegin.isAnchored BQuarterBegin.onOffset + BQuarterBegin.is_anchored + BQuarterBegin.is_on_offset QuarterEnd ---------- @@ -660,6 +702,8 @@ Methods QuarterEnd.copy QuarterEnd.isAnchored QuarterEnd.onOffset + QuarterEnd.is_anchored + QuarterEnd.is_on_offset QuarterBegin ------------ @@ -690,6 +734,8 @@ Methods QuarterBegin.copy QuarterBegin.isAnchored QuarterBegin.onOffset + QuarterBegin.is_anchored + QuarterBegin.is_on_offset YearOffset ---------- @@ -720,6 +766,8 @@ Methods YearOffset.copy YearOffset.isAnchored YearOffset.onOffset + YearOffset.is_anchored + YearOffset.is_on_offset BYearEnd -------- @@ -750,6 +798,8 @@ Methods BYearEnd.copy BYearEnd.isAnchored BYearEnd.onOffset + BYearEnd.is_anchored + BYearEnd.is_on_offset BYearBegin ---------- @@ -780,6 +830,8 @@ Methods BYearBegin.copy BYearBegin.isAnchored BYearBegin.onOffset + BYearBegin.is_anchored + BYearBegin.is_on_offset YearEnd ------- @@ -810,6 +862,8 @@ Methods YearEnd.copy YearEnd.isAnchored YearEnd.onOffset + YearEnd.is_anchored + YearEnd.is_on_offset YearBegin --------- @@ -840,6 +894,8 @@ Methods YearBegin.copy YearBegin.isAnchored YearBegin.onOffset + YearBegin.is_anchored + YearBegin.is_on_offset FY5253 ------ @@ -871,6 +927,8 @@ Methods FY5253.get_year_end FY5253.isAnchored FY5253.onOffset + FY5253.is_anchored + FY5253.is_on_offset FY5253Quarter ------------- @@ -901,6 +959,8 @@ Methods FY5253Quarter.get_weeks FY5253Quarter.isAnchored FY5253Quarter.onOffset + FY5253Quarter.is_anchored + FY5253Quarter.is_on_offset FY5253Quarter.year_has_extra_week Easter @@ -931,6 +991,8 @@ Methods Easter.copy Easter.isAnchored Easter.onOffset + Easter.is_anchored + Easter.is_on_offset Tick ---- @@ -960,6 +1022,8 @@ Methods Tick.copy Tick.isAnchored Tick.onOffset + Tick.is_anchored + Tick.is_on_offset Day --- @@ -989,6 +1053,8 @@ Methods Day.copy Day.isAnchored Day.onOffset + Day.is_anchored + Day.is_on_offset Hour ---- @@ -1018,6 +1084,8 @@ Methods Hour.copy Hour.isAnchored Hour.onOffset + Hour.is_anchored + Hour.is_on_offset Minute ------ @@ -1047,6 +1115,8 @@ Methods Minute.copy Minute.isAnchored Minute.onOffset + Minute.is_anchored + Minute.is_on_offset Second ------ @@ -1076,6 +1146,8 @@ Methods Second.copy Second.isAnchored Second.onOffset + Second.is_anchored + Second.is_on_offset Milli ----- @@ -1105,6 +1177,8 @@ Methods Milli.copy Milli.isAnchored Milli.onOffset + Milli.is_anchored + Milli.is_on_offset Micro ----- @@ -1134,6 +1208,8 @@ Methods Micro.copy Micro.isAnchored Micro.onOffset + Micro.is_anchored + Micro.is_on_offset Nano ---- @@ -1163,6 +1239,8 @@ Methods Nano.copy Nano.isAnchored Nano.onOffset + Nano.is_anchored + Nano.is_on_offset BDay ---- @@ -1195,6 +1273,8 @@ Methods BDay.copy BDay.isAnchored BDay.onOffset + BDay.is_anchored + BDay.is_on_offset BDay.rollback BDay.rollforward @@ -1228,6 +1308,8 @@ Methods BMonthEnd.copy BMonthEnd.isAnchored BMonthEnd.onOffset + 
BMonthEnd.is_anchored + BMonthEnd.is_on_offset BMonthEnd.rollback BMonthEnd.rollforward @@ -1261,6 +1343,8 @@ Methods BMonthBegin.copy BMonthBegin.isAnchored BMonthBegin.onOffset + BMonthBegin.is_anchored + BMonthBegin.is_on_offset BMonthBegin.rollback BMonthBegin.rollforward @@ -1298,6 +1382,8 @@ Methods CBMonthEnd.copy CBMonthEnd.isAnchored CBMonthEnd.onOffset + CBMonthEnd.is_anchored + CBMonthEnd.is_on_offset CBMonthEnd.rollback CBMonthEnd.rollforward @@ -1335,6 +1421,8 @@ Methods CBMonthBegin.copy CBMonthBegin.isAnchored CBMonthBegin.onOffset + CBMonthBegin.is_anchored + CBMonthBegin.is_on_offset CBMonthBegin.rollback CBMonthBegin.rollforward @@ -1369,6 +1457,8 @@ Methods CDay.copy CDay.isAnchored CDay.onOffset + CDay.is_anchored + CDay.is_on_offset CDay.rollback CDay.rollforward diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 6e1ee303135d8..4ad6a7b014532 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -525,6 +525,8 @@ Metadata :attr:`Series.attrs` is a dictionary for storing global metadata for this Series. +.. warning:: ``Series.attrs`` is considered experimental and may change without warning. + .. autosummary:: :toctree: api/ @@ -578,3 +580,4 @@ Serialization / IO / conversion Series.to_string Series.to_clipboard Series.to_latex + Series.to_markdown diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 31bb71064d735..d6f5c0c758b60 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -565,19 +565,15 @@ When working with an ``Index`` object directly, rather than via a ``DataFrame``, mi2 = mi.rename("new name", level=0) mi2 -.. warning:: - Prior to pandas 1.0.0, you could also set the names of a ``MultiIndex`` - by updating the name of a level. +You cannot set the names of the MultiIndex via a level. - .. code-block:: none +.. ipython:: python + :okexcept: - >>> mi.levels[0].name = 'name via level' - >>> mi.names[0] # only works for older panads - 'name via level' + mi.levels[0].name = "name via level" - As of pandas 1.0, this will *silently* fail to update the names - of the MultiIndex. Use :meth:`Index.set_names` instead. +Use :meth:`Index.set_names` instead. Sorting a ``MultiIndex`` ------------------------ diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index e0f676d3072fc..5276bc6142206 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -14,6 +14,29 @@ Nullable Boolean Data Type .. versionadded:: 1.0.0 + +.. _boolean.indexing: + +Indexing with NA values +----------------------- + +pandas does not allow indexing with NA values. Attempting to do so +will raise a ``ValueError``. + +.. ipython:: python + :okexcept: + + s = pd.Series([1, 2, 3]) + mask = pd.array([True, False, pd.NA], dtype="boolean") + s[mask] + +The missing values will need to be explicitly filled with True or False prior +to using the array as a mask. + +.. ipython:: python + + s[mask.fillna(False)] + .. _boolean.kleene: Kleene Logical Operations diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 627a83b7359bb..aeb32db639ffb 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -321,6 +321,11 @@ We provide a number of common statistical functions: :meth:`~Rolling.cov`, Unbiased covariance (binary) :meth:`~Rolling.corr`, Correlation (binary) +.. 
_stats.rolling_apply: + +Rolling Apply +~~~~~~~~~~~~~ + The :meth:`~Rolling.apply` function takes an extra ``func`` argument and performs generic rolling computations. The ``func`` argument should be a single function that produces a single value from an ndarray input. Suppose we wanted to @@ -334,6 +339,49 @@ compute the mean absolute deviation on a rolling basis: @savefig rolling_apply_ex.png s.rolling(window=60).apply(mad, raw=True).plot(style='k') +.. versionadded:: 1.0 + +Additionally, :meth:`~Rolling.apply` can leverage `Numba `__ +if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying +``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``). +Numba will be applied in potentially two routines: + +1. If ``func`` is a standard Python function, the engine will `JIT `__ +the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. + +2. The engine will JIT the for loop where the apply function is applied to each window. + +The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the +`numba.jit decorator `__. +These keyword arguments will be applied to *both* the passed function (if a standard Python function) +and the apply for loop over each window. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported, +and their default values are set to ``False``, ``True`` and ``False`` respectively. + +.. note:: + + In terms of performance, **the first time a function is run using the Numba engine will be slow** + as Numba will have some function compilation overhead. However, ``rolling`` objects will cache + the function and subsequent calls will be fast. In general, the Numba engine is performant with + a larger amount of data points (e.g. 1+ million). + +.. code-block:: ipython + + In [1]: data = pd.Series(range(1_000_000)) + + In [2]: roll = data.rolling(10) + + In [3]: def f(x): + ...: return np.sum(x) + 5 + # Run the first time, compilation time will affect performance + In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225 + 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) + # Function is cached and performance will improve + In [5]: %timeit roll.apply(f, engine='numba', raw=True) + 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [6]: %timeit roll.apply(f, engine='cython', raw=True) + 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + .. _stats.rolling_window: Rolling windows diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 37637bbdb38e6..f581d183b9413 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -406,10 +406,10 @@ Levels ****** `Prepending a level to a multiindex -`__ +`__ `Flatten Hierarchical columns -`__ +`__ .. _cookbook.missing_data: @@ -430,13 +430,13 @@ Fill forward a reversed timeseries df.reindex(df.index[::-1]).ffill() `cumsum reset at NaN values -`__ +`__ Replace ******* `Using replace with backrefs -`__ +`__ .. _cookbook.grouping: @@ -446,7 +446,7 @@ Grouping The :ref:`grouping ` docs. 
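Tying back to the Numba-accelerated ``rolling().apply`` described earlier, a brief sketch of passing ``engine_kwargs`` explicitly (assuming numba is installed as an optional dependency):

.. code-block:: python

    import numpy as np
    import pandas as pd

    data = pd.Series(range(1_000))

    def f(x):
        return np.sum(x) + 5

    # raw=True is required for the numba engine; these kwargs mirror the
    # documented defaults (nopython=True, nogil=False, parallel=False)
    data.rolling(10).apply(
        f,
        raw=True,
        engine="numba",
        engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
    )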
`Basic grouping with apply -`__ +`__ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to all the columns @@ -462,7 +462,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()]) `Using get_group -`__ +`__ .. ipython:: python @@ -470,7 +470,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to gb.get_group('cat') `Apply to different items in a group -`__ +`__ .. ipython:: python @@ -486,7 +486,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to expected_df `Expanding apply -`__ +`__ .. ipython:: python @@ -502,7 +502,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to `Replacing some values with mean of the rest of a group -`__ +`__ .. ipython:: python @@ -516,7 +516,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to gb.transform(replace) `Sort groups by aggregated data -`__ +`__ .. ipython:: python @@ -533,7 +533,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to sorted_df `Create multiple aggregated columns -`__ +`__ .. ipython:: python @@ -550,7 +550,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to ts `Create a value counts column and reassign back to the DataFrame -`__ +`__ .. ipython:: python @@ -561,7 +561,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df `Shift groups of the values in a column based on the index -`__ +`__ .. ipython:: python @@ -575,7 +575,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df `Select row with maximum value from each group -`__ +`__ .. ipython:: python @@ -587,7 +587,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df_count `Grouping like Python's itertools.groupby -`__ +`__ .. ipython:: python @@ -599,19 +599,19 @@ Expanding data ************** `Alignment and to-date -`__ +`__ `Rolling Computation window based on values instead of counts -`__ +`__ `Rolling Mean by Time Interval -`__ +`__ Splitting ********* `Splitting a frame -`__ +`__ Create a list of dataframes, split using a delineation based on logic included in rows. @@ -635,7 +635,7 @@ Pivot The :ref:`Pivot ` docs. `Partial sums and subtotals -`__ +`__ .. ipython:: python @@ -649,7 +649,7 @@ The :ref:`Pivot ` docs. table.stack('City') `Frequency table like plyr in R -`__ +`__ .. ipython:: python @@ -675,7 +675,7 @@ The :ref:`Pivot ` docs. 'Grade': lambda x: sum(x) / len(x)}) `Plot pandas DataFrame with year over year data -`__ +`__ To create year and month cross tabulation: @@ -691,7 +691,7 @@ Apply ***** `Rolling apply to organize - Turning embedded lists into a MultiIndex frame -`__ +`__ .. 
ipython:: python @@ -707,7 +707,7 @@ Apply df_orgz `Rolling apply with a DataFrame returning a Series -`__ +`__ Rolling Apply to multiple columns where function calculates a Series before a Scalar from the Series is returned @@ -727,7 +727,7 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc s `Rolling apply with a DataFrame returning a Scalar -`__ +`__ Rolling Apply to multiple columns where function returns a Scalar (Volume Weighted Average Price) @@ -753,26 +753,26 @@ Timeseries ---------- `Between times -`__ +`__ `Using indexer between time -`__ +`__ `Constructing a datetime range that excludes weekends and includes only certain times -`__ +`__ `Vectorized Lookup -`__ +`__ `Aggregation and plotting time series `__ Turn a matrix with hours in columns and days in rows into a continuous row sequence in the form of a time series. `How to rearrange a Python pandas DataFrame? -`__ +`__ `Dealing with duplicates when reindexing a timeseries to a specified frequency -`__ +`__ Calculate the first day of the month for each entry in a DatetimeIndex @@ -795,7 +795,7 @@ The :ref:`Resample ` docs. `__ `Valid frequency arguments to Grouper -`__ +`__ `Grouping using a MultiIndex `__ @@ -804,15 +804,15 @@ The :ref:`Resample ` docs. `__ `Resampling with custom periods -`__ +`__ `Resample intraday frame without adding new days -`__ +`__ `Resample minute data -`__ +`__ -`Resample with groupby `__ +`Resample with groupby `__ .. _cookbook.merge: @@ -822,7 +822,7 @@ Merge The :ref:`Concat ` docs. The :ref:`Join ` docs. `Append two dataframes with overlapping index (emulate R rbind) -`__ +`__ .. ipython:: python @@ -855,16 +855,16 @@ Depending on df construction, ``ignore_index`` may be needed suffixes=('_L', '_R')) `How to set the index and join -`__ +`__ `KDB like asof join -`__ +`__ `Join with a criteria based on the values -`__ +`__ `Using searchsorted to merge based on values inside a range -`__ +`__ .. _cookbook.plotting: @@ -874,31 +874,31 @@ Plotting The :ref:`Plotting ` docs. `Make Matplotlib look like R -`__ +`__ `Setting x-axis major and minor labels -`__ +`__ `Plotting multiple charts in an ipython notebook -`__ +`__ `Creating a multi-line plot -`__ +`__ `Plotting a heatmap -`__ +`__ `Annotate a time-series plot -`__ +`__ `Annotate a time-series plot #2 -`__ +`__ `Generate Embedded plots in excel files using Pandas, Vincent and xlsxwriter `__ `Boxplot for each quartile of a stratifying variable -`__ +`__ .. ipython:: python @@ -918,7 +918,7 @@ Data In/Out ----------- `Performance comparison of SQL vs HDF5 -`__ +`__ .. _cookbook.csv: @@ -930,25 +930,25 @@ The :ref:`CSV ` docs `read_csv in action `__ `appending to a csv -`__ +`__ `Reading a csv chunk-by-chunk -`__ +`__ `Reading only certain rows of a csv chunk-by-chunk -`__ +`__ `Reading the first few lines of a frame -`__ +`__ Reading a file that is compressed but not by ``gzip/bz2`` (the native compressed formats which ``read_csv`` understands). This example shows a ``WinZipped`` file, but is a general application of opening the file within a context manager and using that handle to read. `See here -`__ +`__ `Inferring dtypes from a file -`__ +`__ `Dealing with bad lines `__ @@ -960,7 +960,7 @@ using that handle to read. `__ `Write a multi-row index CSV without writing duplicates -`__ +`__ .. _cookbook.csv.multiple_files: @@ -1069,7 +1069,7 @@ SQL The :ref:`SQL ` docs `Reading from databases with SQL -`__ +`__ .. 
_cookbook.excel: @@ -1079,7 +1079,7 @@ Excel The :ref:`Excel ` docs `Reading from a filelike handle -`__ +`__ `Modifying formatting in XlsxWriter output `__ @@ -1090,7 +1090,7 @@ HTML **** `Reading HTML tables from a server that cannot handle the default request -header `__ +header `__ .. _cookbook.hdf: @@ -1100,54 +1100,54 @@ HDFStore The :ref:`HDFStores ` docs `Simple queries with a Timestamp Index -`__ +`__ `Managing heterogeneous data using a linked multiple table hierarchy `__ `Merging on-disk tables with millions of rows -`__ +`__ `Avoiding inconsistencies when writing to a store from multiple processes/threads -`__ +`__ De-duplicating a large store by chunks, essentially a recursive reduction operation. Shows a function for taking in data from csv file and creating a store by chunks, with date parsing as well. `See here -`__ +`__ `Creating a store chunk-by-chunk from a csv file -`__ +`__ `Appending to a store, while creating a unique index -`__ +`__ `Large Data work flows -`__ +`__ `Reading in a sequence of files, then providing a global unique index to a store while appending -`__ +`__ `Groupby on a HDFStore with low group density -`__ +`__ `Groupby on a HDFStore with high group density -`__ +`__ `Hierarchical queries on a HDFStore -`__ +`__ `Counting with a HDFStore -`__ +`__ `Troubleshoot HDFStore exceptions -`__ +`__ `Setting min_itemsize with strings -`__ +`__ `Using ptrepack to create a completely-sorted-index on a store -`__ +`__ Storing Attributes to a group node @@ -1305,7 +1305,7 @@ The :ref:`Timedeltas ` docs. datetime.timedelta(minutes=5) + s `Adding and subtracting deltas and dates -`__ +`__ .. ipython:: python @@ -1322,7 +1322,7 @@ The :ref:`Timedeltas ` docs. df.dtypes `Another example -`__ +`__ Values can be set to NaT using np.nan, similar to datetime diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 0229331127441..a8cdf4a61073d 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -668,7 +668,7 @@ Current behavior KeyError in the future, you can use .reindex() as an alternative. See the documentation here: - http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike + https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike Out[4]: 1 2.0 diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index 77568f3bcb244..a45d7a4fa1547 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -15,6 +15,10 @@ Nullable integer data type IntegerArray is currently experimental. Its API or implementation may change without warning. +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as the missing value rather + than :attr:`numpy.nan`. In :ref:`missing_data`, we saw that pandas primarily uses ``NaN`` to represent missing data. Because ``NaN`` is a float, this forces an array of integers with @@ -23,6 +27,9 @@ much. But if your integer column is, say, an identifier, casting to float can be problematic. Some integers cannot even be represented as floating point numbers. +Construction +------------ + Pandas can represent integer data with possibly missing values using :class:`arrays.IntegerArray`. This is an :ref:`extension types ` implemented within pandas. @@ -39,6 +46,12 @@ NumPy's ``'int64'`` dtype: pd.array([1, 2, np.nan], dtype="Int64") +All NA-like values are replaced with :attr:`pandas.NA`. + +.. 
ipython:: python + + pd.array([1, 2, np.nan, None, pd.NA], dtype="Int64") + This array can be stored in a :class:`DataFrame` or :class:`Series` like any NumPy array. @@ -78,6 +91,9 @@ with the dtype. In the future, we may provide an option for :class:`Series` to infer a nullable-integer dtype. +Operations +---------- + Operations involving an integer array will behave similar to NumPy arrays. Missing values will be propagated, and the data will be coerced to another dtype if needed. @@ -123,3 +139,15 @@ Reduction and groupby operations such as 'sum' work as well. df.sum() df.groupby('B').A.sum() + +Scalar NA Value +--------------- + +:class:`arrays.IntegerArray` uses :attr:`pandas.NA` as its scalar +missing value. Slicing a single element that's missing will return +:attr:`pandas.NA` + +.. ipython:: python + + a = pd.array([1, None], dtype="Int64") + a[1] diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index c32b009948fda..e776da016d5d7 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1153,7 +1153,7 @@ To completely override the default values that are recognized as missing, specif .. _io.navaluesconst: The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', -'n/a', 'NA', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. +'n/a', 'NA', '', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``. Let us consider some examples: @@ -1519,7 +1519,7 @@ rows will skip the intervening rows. .. ipython:: python - from pandas.util.testing import makeCustomDataframe as mkdf + from pandas._testing import makeCustomDataframe as mkdf df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv('mi.csv') print(open('mi.csv').read()) @@ -2066,6 +2066,8 @@ The Numpy parameter +++++++++++++++++++ .. note:: + This param has been deprecated as of version 1.0.0 and will raise a ``FutureWarning``. + This supports numeric data only. Index and columns labels may be non-numeric, e.g. strings, dates etc. If ``numpy=True`` is passed to ``read_json`` an attempt will be made to sniff @@ -2088,6 +2090,7 @@ data: %timeit pd.read_json(jsonfloats) .. ipython:: python + :okwarning: %timeit pd.read_json(jsonfloats, numpy=True) @@ -2102,6 +2105,7 @@ The speedup is less noticeable for smaller datasets: %timeit pd.read_json(jsonfloats) .. ipython:: python + :okwarning: %timeit pd.read_json(jsonfloats, numpy=True) @@ -2629,7 +2633,7 @@ that contain URLs. url_df = pd.DataFrame({ 'name': ['Python', 'Pandas'], - 'url': ['https://www.python.org/', 'http://pandas.pydata.org']}) + 'url': ['https://www.python.org/', 'https://pandas.pydata.org']}) print(url_df.to_html(render_links=True)) .. ipython:: python @@ -3877,6 +3881,8 @@ specified in the format: ``()``, where float may be signed (and fra store.append('dftd', dftd, data_columns=True) store.select('dftd', "C<'-3.5D'") +.. _io.query_multi: + Query MultiIndex ++++++++++++++++ @@ -4214,46 +4220,49 @@ Compression all kinds of stores, not just tables. Two parameters are used to control compression: ``complevel`` and ``complib``. -``complevel`` specifies if and how hard data is to be compressed. - ``complevel=0`` and ``complevel=None`` disables - compression and ``0`_: The default compression library. A classic in terms of compression, achieves good compression rates but is somewhat slow. - - `lzo `_: Fast compression and decompression. - - `bzip2 `_: Good compression rates. - - `blosc `_: Fast compression and decompression. 
- - Support for alternative blosc compressors: - - - `blosc:blosclz `_ This is the - default compressor for ``blosc`` - - `blosc:lz4 - `_: - A compact, very popular and fast compressor. - - `blosc:lz4hc - `_: - A tweaked version of LZ4, produces better - compression ratios at the expense of speed. - - `blosc:snappy `_: - A popular compressor used in many places. - - `blosc:zlib `_: A classic; - somewhat slower than the previous ones, but - achieving better compression ratios. - - `blosc:zstd `_: An - extremely well balanced codec; it provides the best - compression ratios among the others above, and at - reasonably fast speed. - - If ``complib`` is defined as something other than the - listed libraries a ``ValueError`` exception is issued. +* ``complevel`` specifies if and how hard data is to be compressed. + ``complevel=0`` and ``complevel=None`` disables compression and + ``0`_: The default compression library. + A classic in terms of compression, achieves good compression + rates but is somewhat slow. + - `lzo `_: Fast + compression and decompression. + - `bzip2 `_: Good compression rates. + - `blosc `_: Fast compression and + decompression. + + Support for alternative blosc compressors: + + - `blosc:blosclz `_ This is the + default compressor for ``blosc`` + - `blosc:lz4 + `_: + A compact, very popular and fast compressor. + - `blosc:lz4hc + `_: + A tweaked version of LZ4, produces better + compression ratios at the expense of speed. + - `blosc:snappy `_: + A popular compressor used in many places. + - `blosc:zlib `_: A classic; + somewhat slower than the previous ones, but + achieving better compression ratios. + - `blosc:zstd `_: An + extremely well balanced codec; it provides the best + compression ratios among the others above, and at + reasonably fast speed. + + If ``complib`` is defined as something other than the listed libraries a + ``ValueError`` exception is issued. .. note:: @@ -4646,10 +4655,10 @@ Several caveats. * Index level names, if specified, must be strings. * In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. * The ``pyarrow`` engine preserves the ``ordered`` flag of categorical dtypes with string types. ``fastparquet`` does not preserve the ``ordered`` flag. -* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message - on an attempt at serialization. +* Non supported types include ``Interval`` and actual Python object types. These will raise a helpful error message + on an attempt at serialization. ``Period`` type is supported with pyarrow >= 0.16.0. * The ``pyarrow`` engine preserves extension data types such as the nullable integer and string data - type (requiring pyarrow >= 1.0.0, and requiring the extension type to implement the needed protocols, + type (requiring pyarrow >= 0.16.0, and requiring the extension type to implement the needed protocols, see the :ref:`extension types documentation `). You can specify an ``engine`` to direct the serialization. This can be one of ``pyarrow``, or ``fastparquet``, or ``auto``. diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 1bfe196cb2f89..0f55980b3d015 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -791,7 +791,7 @@ the nullable :doc:`integer `, boolean and :ref:`dedicated string ` data types as the missing value indicator. 
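As a small illustration (a sketch; requires pandas 1.0 or later), the same ``pd.NA`` scalar appears for both nullable integer and string data:

.. ipython:: python

    pd.Series([1, 2, None], dtype="Int64")
    pd.Series(["a", None], dtype="string")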
The goal of ``pd.NA`` is provide a "missing" indicator that can be used -consistently accross data types (instead of ``np.nan``, ``None`` or ``pd.NaT`` +consistently across data types (instead of ``np.nan``, ``None`` or ``pd.NaT`` depending on the data type). For example, when having missing values in a Series with the nullable integer @@ -825,14 +825,10 @@ For example, ``pd.NA`` propagates in arithmetic operations, similarly to There are a few special cases when the result is known, even when one of the operands is ``NA``. +.. ipython:: python -================ ====== -Operation Result -================ ====== -``pd.NA ** 0`` 0 -``1 ** pd.NA`` 1 -``-1 ** pd.NA`` -1 -================ ====== + pd.NA ** 0 + 1 ** pd.NA In equality and comparison operations, ``pd.NA`` also propagates. This deviates from the behaviour of ``np.nan``, where comparisons with ``np.nan`` always @@ -920,3 +916,29 @@ filling missing values beforehand. A similar situation occurs when using Series or DataFrame objects in ``if`` statements, see :ref:`gotchas.truth`. + +NumPy ufuncs +------------ + +:attr:`pandas.NA` implements NumPy's ``__array_ufunc__`` protocol. Most ufuncs +work with ``NA``, and generally return ``NA``: + +.. ipython:: python + + np.log(pd.NA) + np.add(pd.NA, 1) + +.. warning:: + + Currently, ufuncs involving an ndarray and ``NA`` will return an + object-dtype filled with NA values. + + .. ipython:: python + + a = np.array([1, 2, 3]) + np.greater(a, pd.NA) + + The return type here may change to return a different array type + in the future. + +See :ref:`dsintro.numpy_interop` for more on ufuncs. diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 8583a9312b690..b28354cd8b5f2 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -14,7 +14,7 @@ Reshaping by pivoting DataFrame objects .. ipython:: python :suppress: - import pandas.util.testing as tm + import pandas._testing as tm tm.N = 3 def unpivot(frame): @@ -38,7 +38,7 @@ For the curious here is how the above ``DataFrame`` was created: .. code-block:: python - import pandas.util.testing as tm + import pandas._testing as tm tm.N = 3 diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index 0611c6334937f..43bb4966ec5bf 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -26,7 +26,7 @@ Assuming you want or need the expressiveness and power of pandas, let's carry on .. ipython:: python :suppress: - from pandas.util.testing import _make_timeseries + from pandas._testing import _make_timeseries # Make a random in-memory dataset ts = _make_timeseries(freq="30S", seed=0) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index c258a8840b714..8588fac4a18d0 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -15,7 +15,7 @@ can be chosen, including 0) is omitted. The compressed values are not actually s arr = np.random.randn(10) arr[2:-2] = np.nan - ts = pd.Series(pd.SparseArray(arr)) + ts = pd.Series(pd.arrays.SparseArray(arr)) ts Notice the dtype, ``Sparse[float64, nan]``. The ``nan`` means that elements in the @@ -51,7 +51,7 @@ identical to their dense counterparts. SparseArray ----------- -:class:`SparseArray` is a :class:`~pandas.api.extensions.ExtensionArray` +:class:`arrays.SparseArray` is a :class:`~pandas.api.extensions.ExtensionArray` for storing an array of sparse values (see :ref:`basics.dtypes` for more on extension arrays). 
It is a 1-dimensional ndarray-like object storing only values distinct from the ``fill_value``: @@ -61,7 +61,7 @@ only values distinct from the ``fill_value``: arr = np.random.randn(10) arr[2:5] = np.nan arr[7:8] = np.nan - sparr = pd.SparseArray(arr) + sparr = pd.arrays.SparseArray(arr) sparr A sparse array can be converted to a regular (dense) ndarray with :meth:`numpy.asarray` @@ -144,7 +144,7 @@ to ``SparseArray`` and get a ``SparseArray`` as a result. .. ipython:: python - arr = pd.SparseArray([1., np.nan, np.nan, -2., np.nan]) + arr = pd.arrays.SparseArray([1., np.nan, np.nan, -2., np.nan]) np.abs(arr) @@ -153,7 +153,7 @@ the correct dense result. .. ipython:: python - arr = pd.SparseArray([1., -1, -1, -2., -1], fill_value=-1) + arr = pd.arrays.SparseArray([1., -1, -1, -2., -1], fill_value=-1) np.abs(arr) np.abs(arr).to_dense() @@ -194,7 +194,7 @@ From an array-like, use the regular :class:`Series` or .. ipython:: python # New way - pd.DataFrame({"A": pd.SparseArray([0, 1])}) + pd.DataFrame({"A": pd.arrays.SparseArray([0, 1])}) From a SciPy sparse matrix, use :meth:`DataFrame.sparse.from_spmatrix`, @@ -256,10 +256,10 @@ Instead, you'll need to ensure that the values being assigned are sparse .. ipython:: python - df = pd.DataFrame({"A": pd.SparseArray([0, 1])}) + df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1])}) df['B'] = [0, 0] # remains dense df['B'].dtype - df['B'] = pd.SparseArray([0, 0]) + df['B'] = pd.arrays.SparseArray([0, 0]) df['B'].dtype The ``SparseDataFrame.default_kind`` and ``SparseDataFrame.default_fill_value`` attributes diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 633827eb79f46..02550eab86913 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -1063,7 +1063,7 @@ "- Provide an API that is pleasing to use interactively and is \"good enough\" for many tasks\n", "- Provide the foundations for dedicated libraries to build on\n", "\n", - "If you build a great library on top of this, let us know and we'll [link](http://pandas.pydata.org/pandas-docs/stable/ecosystem.html) to it.\n", + "If you build a great library on top of this, let us know and we'll [link](https://pandas.pydata.org/pandas-docs/stable/ecosystem.html) to it.\n", "\n", "### Subclassing\n", "\n", diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 53c7a7437d55f..88c86ac212f11 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -87,8 +87,9 @@ l. For ``StringDtype``, :ref:`string accessor methods` .. ipython:: python - s.astype(object).str.count("a") - s.astype(object).dropna().str.count("a") + s2 = pd.Series(["a", None, "b"], dtype="object") + s2.str.count("a") + s2.dropna().str.count("a") When NA values are present, the output dtype is float64. Similarly for methods returning boolean values. @@ -101,10 +102,10 @@ l. For ``StringDtype``, :ref:`string accessor methods` 2. Some string methods, like :meth:`Series.str.decode` are not available on ``StringArray`` because ``StringArray`` only holds strings, not bytes. -3. In comparision operations, :class:`arrays.StringArray` and ``Series`` backed +3. In comparison operations, :class:`arrays.StringArray` and ``Series`` backed by a ``StringArray`` will return an object with :class:`BooleanDtype`, rather than a ``bool`` dtype object. 
Missing values in a ``StringArray`` - will propagate in comparision operations, rather than always comparing + will propagate in comparison operations, rather than always comparing unequal like :attr:`numpy.nan`. Everything else that follows in the rest of this document applies equally to diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 05c7f72882088..bc463d0ab22d8 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -10,6 +10,14 @@ This is the list of changes to pandas between each release. For full details, see the commit logs at http://github.com/pandas-dev/pandas. For install and upgrade instructions, see :ref:`install`. +Version 1.1 +----------- + +.. toctree:: + :maxdepth: 2 + + v1.1.0 + Version 1.0 ----------- diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index 86ff338536f80..823e177f3e05e 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -236,7 +236,7 @@ I/O enhancements .. ipython:: python - from pandas.util.testing import makeCustomDataframe as mkdf + from pandas._testing import makeCustomDataframe as mkdf df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) df.to_csv('mi.csv') print(open('mi.csv').read()) diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 6242c40d44bf8..4f9ab761334e7 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -224,7 +224,7 @@ Enhancements .. code-block:: ipython - In [28]: import pandas.util.testing as tm + In [28]: import pandas._testing as tm In [29]: panel = tm.makePanel(5) diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index b328e549e8899..95e354e425143 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -852,7 +852,7 @@ Other notable API changes: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead - See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy + See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy - ``merge``, ``DataFrame.merge``, and ``ordered_merge`` now return the same type as the ``left`` argument (:issue:`7737`). diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index b58eabaed6127..292351c709940 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -172,7 +172,7 @@ Other enhancements: 4 True True True True - Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`). -- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__. +- Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__. - ``Timedelta`` arithmetic returns ``NotImplemented`` in unknown cases, allowing extensions by custom classes (:issue:`8813`). - ``Timedelta`` now supports arithmetic with ``numpy.ndarray`` objects of the appropriate dtype (numpy 1.8 or newer only) (:issue:`8884`). - Added ``Timedelta.to_timedelta64()`` method to the public API (:issue:`8884`). 
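A one-line illustration of that last method (a sketch assuming a recent pandas):

.. code-block:: python

    import pandas as pd

    pd.Timedelta(days=1).to_timedelta64()  # numpy.timedelta64(86400000000000,'ns')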
diff --git a/doc/source/whatsnew/v0.16.0.rst b/doc/source/whatsnew/v0.16.0.rst index fc638e35ed88b..855d0b8695bb1 100644 --- a/doc/source/whatsnew/v0.16.0.rst +++ b/doc/source/whatsnew/v0.16.0.rst @@ -528,7 +528,7 @@ Deprecations `seaborn `_ for similar but more refined functionality (:issue:`3445`). The documentation includes some examples how to convert your existing code - from ``rplot`` to seaborn `here `__. + from ``rplot`` to seaborn `here `__. - The ``pandas.sandbox.qtpandas`` interface is deprecated and will be removed in a future version. We refer users to the external package `pandas-qt `_. (:issue:`9615`) diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index a7174c6325f86..d3f96d4185d65 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -1279,7 +1279,7 @@ Bug Fixes - Removed ``millisecond`` property of ``DatetimeIndex``. This would always raise a ``ValueError`` (:issue:`12019`). - Bug in ``Series`` constructor with read-only data (:issue:`11502`) -- Removed ``pandas.util.testing.choice()``. Should use ``np.random.choice()``, instead. (:issue:`12386`) +- Removed ``pandas._testing.choice()``. Should use ``np.random.choice()``, instead. (:issue:`12386`) - Bug in ``.loc`` setitem indexer preventing the use of a TZ-aware DatetimeIndex (:issue:`12050`) - Bug in ``.style`` indexes and MultiIndexes not appearing (:issue:`11655`) - Bug in ``to_msgpack`` and ``from_msgpack`` which did not correctly serialize or deserialize ``NaT`` (:issue:`12307`). diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 6f6446c3f74e1..6eb509a258430 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -1225,6 +1225,7 @@ Previously, sparse data were ``float64`` dtype by default, even if all inputs we As of v0.19.0, sparse data keeps the input dtype, and uses more appropriate ``fill_value`` defaults (``0`` for ``int64`` dtype, ``False`` for ``bool`` dtype). .. ipython:: python + :okwarning: pd.SparseArray([1, 2, 0, 0], dtype=np.int64) pd.SparseArray([True, False, False, False]) diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index e7dc6150ffcb1..ceb1c7f27231b 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -1360,7 +1360,7 @@ provides a :meth:`~Panel.to_xarray` method to automate this conversion (:issue:` .. code-block:: ipython - In [133]: import pandas.util.testing as tm + In [133]: import pandas._testing as tm In [134]: p = tm.makePanel() diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index f33943e423b25..71969c4de6b02 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -470,7 +470,7 @@ Current behavior KeyError in the future, you can use .reindex() as an alternative. See the documentation here: - http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike + https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike Out[4]: 1 2.0 @@ -927,7 +927,7 @@ Other API changes - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). - Compression defaults in HDF stores now follow pytables standards. 
Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) - ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) -- Removed the ``@slow`` decorator from ``pandas.util.testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`) +- Removed the ``@slow`` decorator from ``pandas._testing``, which caused issues for some downstream packages' test suites. Use ``@pytest.mark.slow`` instead, which achieves the same thing (:issue:`16850`) - Moved definition of ``MergeError`` to the ``pandas.errors`` module. - The signature of :func:`Series.set_axis` and :func:`DataFrame.set_axis` has been changed from ``set_axis(axis, labels)`` to ``set_axis(labels, axis=0)``, for consistency with the rest of the API. The old signature is deprecated and will show a ``FutureWarning`` (:issue:`14636`) - :func:`Series.argmin` and :func:`Series.argmax` will now raise a ``TypeError`` when used with ``object`` dtypes, instead of a ``ValueError`` (:issue:`13595`) diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index f4c283ea742f7..b9e1b5060d1da 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -648,7 +648,7 @@ provides a :meth:`~Panel.to_xarray` method to automate this conversion (:issue:` .. code-block:: ipython - In [75]: import pandas.util.testing as tm + In [75]: import pandas._testing as tm In [76]: p = tm.makePanel() diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index b6b91983b8267..b18d022349001 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -354,6 +354,7 @@ When passed DataFrames whose values are sparse, :func:`concat` will now return a :class:`Series` or :class:`DataFrame` with sparse values, rather than a :class:`SparseDataFrame` (:issue:`25702`). .. ipython:: python + :okwarning: df = pd.DataFrame({"A": pd.SparseArray([0, 1])}) @@ -910,6 +911,7 @@ by a ``Series`` or ``DataFrame`` with sparse values. **New way** .. ipython:: python + :okwarning: df = pd.DataFrame({"A": pd.SparseArray([0, 0, 1, 2])}) df.dtypes diff --git a/doc/source/whatsnew/v0.25.3.rst b/doc/source/whatsnew/v0.25.3.rst index f73a3f956f42e..f7f54198a0f82 100644 --- a/doc/source/whatsnew/v0.25.3.rst +++ b/doc/source/whatsnew/v0.25.3.rst @@ -19,4 +19,4 @@ Groupby/resample/rolling Contributors ~~~~~~~~~~~~ -.. contributors:: v0.25.2..HEAD +.. contributors:: v0.25.2..v0.25.3 diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst old mode 100644 new mode 100755 index faca744a8f92c..3bd86bb02155f --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1,40 +1,29 @@ -.. _whatsnew_1000: +.. _whatsnew_100: What's new in 1.0.0 (??) ------------------------ -.. warning:: - - Starting with the 1.x series of releases, pandas only supports Python 3.6.1 and higher. +These are the changes in pandas 1.0.0. See :ref:`release` for a full changelog +including other versions of pandas. -New Deprecation Policy -~~~~~~~~~~~~~~~~~~~~~~ +.. note:: -Starting with Pandas 1.0.0, pandas will adopt a version of `SemVer`_. + The pandas 1.0 release removed a lot of functionality that was deprecated + in previous releases (see :ref:`below ` + for an overview). 
It is recommended to first upgrade to pandas 0.25 and to + ensure your code is working without warnings, before upgrading to pandas + 1.0. -Historically, pandas has used a "rolling" deprecation policy, with occasional -outright breaking API changes. Where possible, we would deprecate the behavior -we'd like to change, giving an option to adopt the new behavior (via a keyword -or an alternative method), and issuing a warning for users of the old behavior. -Sometimes, a deprecation was not possible, and we would make an outright API -breaking change. -We'll continue to *introduce* deprecations in major and minor releases (e.g. -1.0.0, 1.1.0, ...). Those deprecations will be *enforced* in the next major -release. +New Deprecation Policy +~~~~~~~~~~~~~~~~~~~~~~ -Note that *behavior changes* and *API breaking changes* are not identical. API -breaking changes will only be released in major versions. If we consider a -behavior to be a bug, and fixing that bug induces a behavior change, we'll -release that change in a minor release. This is a sometimes difficult judgment -call that we'll do our best on. +Starting with Pandas 1.0.0, pandas will adopt a variant of `SemVer`_ to +version releases. Briefly, -This doesn't mean that pandas' pace of development will slow down. In the `2019 -Pandas User Survey`_, about 95% of the respondents said they considered pandas -"stable enough". This indicates there's an appetite for new features, even if it -comes at the cost of break API. The difference is that now API breaking changes -will be accompanied with a bump in the major version number (e.g. pandas 1.5.1 --> 2.0.0). +* Deprecations will be introduced in minor releases (e.g. 1.1.0, 1.2.0, 2.1.0, ...) +* Deprecations will be enforced in major releases (e.g. 1.0.0, 2.0.0, 3.0.0, ...) +* API-breaking changes will be made only in major releases (except for experimental features) See :ref:`policies.version` for more. @@ -43,20 +32,63 @@ See :ref:`policies.version` for more. {{ header }} -These are the changes in pandas 1.0.0. See :ref:`release` for a full changelog -including other versions of pandas. - +.. --------------------------------------------------------------------------- Enhancements ~~~~~~~~~~~~ +.. _whatsnew_100.NA: + +Experimental ``NA`` scalar to denote missing values +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A new ``pd.NA`` value (singleton) is introduced to represent scalar missing +values. Up to now, pandas used several values to represent missing data: ``np.nan`` is used for this for float data, ``np.nan`` or +``None`` for object-dtype data and ``pd.NaT`` for datetime-like data. The +goal of ``pd.NA`` is to provide a "missing" indicator that can be used +consistently across data types. ``pd.NA`` is currently used by the nullable integer and boolean +data types and the new string data type (:issue:`28095`). + +.. warning:: + + Experimental: the behaviour of ``pd.NA`` can still change without warning. + +For example, creating a Series using the nullable integer dtype: + +.. ipython:: python + + s = pd.Series([1, 2, None], dtype="Int64") + s + s[2] + +Compared to ``np.nan``, ``pd.NA`` behaves differently in certain operations. +In addition to arithmetic operations, ``pd.NA`` also propagates as "missing" +or "unknown" in comparison operations: + +.. ipython:: python + + np.nan > 1 + pd.NA > 1 + +For logical operations, ``pd.NA`` follows the rules of the +`three-valued logic `__ (or +*Kleene logic*). For example: + +.. 
ipython:: python + + pd.NA | True + +For more, see :ref:`NA section ` in the user guide on missing +data. + + .. _whatsnew_100.string: Dedicated string data type ^^^^^^^^^^^^^^^^^^^^^^^^^^ We've added :class:`StringDtype`, an extension type dedicated to string data. -Previously, strings were typically stored in object-dtype NumPy arrays. +Previously, strings were typically stored in object-dtype NumPy arrays. (:issue:`29975`) .. warning:: @@ -102,59 +134,15 @@ String accessor methods returning integers will return a value with :class:`Int6 We recommend explicitly using the ``string`` data type when working with strings. See :ref:`text.types` for more. -.. _whatsnew_100.NA: - -Experimental ``NA`` scalar to denote missing values -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A new ``pd.NA`` value (singleton) is introduced to represent scalar missing -values. Up to now, ``np.nan`` is used for this for float data, ``np.nan`` or -``None`` for object-dtype data and ``pd.NaT`` for datetime-like data. The -goal of ``pd.NA`` is provide a "missing" indicator that can be used -consistently accross data types. For now, the nullable integer and boolean -data types and the new string data type make use of ``pd.NA`` (:issue:`28095`). - -.. warning:: - - Experimental: the behaviour of ``pd.NA`` can still change without warning. - -For example, creating a Series using the nullable integer dtype: - -.. ipython:: python - - s = pd.Series([1, 2, None], dtype="Int64") - s - s[2] - -Compared to ``np.nan``, ``pd.NA`` behaves differently in certain operations. -In addition to arithmetic operations, ``pd.NA`` also propagates as "missing" -or "unknown" in comparison operations: - -.. ipython:: python - - np.nan > 1 - pd.NA > 1 - -For logical operations, ``pd.NA`` follows the rules of the -`three-valued logic `__ (or -*Kleene logic*). For example: - -.. ipython:: python - - pd.NA | True - -For more, see :ref:`NA section ` in the user guide on missing -data. - .. _whatsnew_100.boolean: Boolean data type with missing values support ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ We've added :class:`BooleanDtype` / :class:`~arrays.BooleanArray`, an extension -type dedicated to boolean data that can hold missing values. With the default -``'bool`` data type based on a numpy bool array, the column can only hold -True or False values and not missing values. This new :class:`BooleanDtype` +type dedicated to boolean data that can hold missing values. The default +``bool`` data type based on a bool-dtype NumPy array, the column can only hold +``True`` or ``False``, and not missing values. This new :class:`~arrays.BooleanArray` can store missing values as well by keeping track of this in a separate mask. (:issue:`29555`, :issue:`30095`) @@ -169,7 +157,18 @@ You can use the alias ``"boolean"`` as well. s = pd.Series([True, False, None], dtype="boolean") s -.. _whatsnew_1000.custom_window: +.. _whatsnew_100.numba_rolling_apply: + +Using Numba in ``rolling.apply`` and ``expanding.apply`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added an ``engine`` keyword to :meth:`~core.window.rolling.Rolling.apply` and :meth:`~core.window.expanding.Expanding.apply` +that allows the user to execute the routine using `Numba `__ instead of Cython. +Using the Numba engine can yield significant performance gains if the apply function can operate on numpy arrays and +the data set is larger (1 million rows or greater). For more details, see +:ref:`rolling apply documentation ` (:issue:`28987`, :issue:`30936`) + +.. 
_whatsnew_100.custom_window: Defining custom windows for rolling operations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -180,12 +179,25 @@ method on a :func:`pandas.api.indexers.BaseIndexer` subclass that will generate indices used for each window during the rolling aggregation. For more details and example usage, see the :ref:`custom window rolling documentation ` -.. _whatsnew_1000.enhancements.other: +.. _whatsnew_100.to_markdown: + +Converting to Markdown +^^^^^^^^^^^^^^^^^^^^^^ + +We've added :meth:`~DataFrame.to_markdown` for creating a markdown table (:issue:`11052`) + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}, index=['a', 'a', 'b']) + print(df.to_markdown()) + +.. _whatsnew_100.enhancements.other: Other enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`DataFrame.to_string` added the ``max_colwidth`` parameter to control when wide columns are truncated (:issue:`9784`) +- Added the ``na_value`` argument to :meth:`Series.to_numpy`, :meth:`Index.to_numpy` and :meth:`DataFrame.to_numpy` to control the value used for missing data (:issue:`30322`) - :meth:`MultiIndex.from_product` infers level names from inputs if not explicitly provided (:issue:`27292`) - :meth:`DataFrame.to_latex` now accepts ``caption`` and ``label`` arguments (:issue:`25436`) - The :ref:`integer dtype ` with support for missing values and the @@ -201,12 +213,21 @@ Other enhancements - Added ``encoding`` argument to :func:`DataFrame.to_html` for non-ascii text (:issue:`28663`) - :meth:`Styler.background_gradient` now accepts ``vmin`` and ``vmax`` arguments (:issue:`12145`) - :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`) -- Roundtripping DataFrames with nullable integer or string data types to parquet +- Roundtripping DataFrames with nullable integer, string and period data types to parquet (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine - now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`). + now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`). - The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`) -- :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue: `30270`) +- :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`) +- :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue:`30270`) - DataFrame constructor preserve `ExtensionArray` dtype with `ExtensionArray` (:issue:`11363`) +- :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`) +- :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` have gained ``ignore_index`` keyword to reset index (:issue:`30114`) +- :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`) +- Added new writer for exporting Stata dta files in versions 118 and 119, ``StataWriterUTF8``. These files formats support exporting strings containing Unicode characters. 
Format 119 supports data sets with more than 32,767 variables (:issue:`23573`, :issue:`30959`) +- :meth:`Series.map` now accepts ``collections.abc.Mapping`` subclasses as a mapper (:issue:`29733`) +- Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`) +- :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`) +- :meth:`DataFrame.to_pickle` and :func:`read_pickle` now accept URL (:issue:`30163`) Build Changes @@ -217,12 +238,14 @@ cythonized files in the source distribution uploaded to PyPI (:issue:`28341`, :i a built distribution (wheel) or via conda, this shouldn't have any effect on you. If you're building pandas from source, you should no longer need to install Cython into your build environment before calling ``pip install pandas``. -.. _whatsnew_1000.api_breaking: +.. --------------------------------------------------------------------------- + +.. _whatsnew_100.api_breaking: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _whatsnew_1000.api_breaking.MultiIndex._names: +.. _whatsnew_100.api_breaking.MultiIndex._names: Avoid using names from ``MultiIndex.levels`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -240,10 +263,10 @@ For backwards compatibility, you can still *access* the names via the levels. mi.levels[0].name However, it is no longer possible to *update* the names of the ``MultiIndex`` -via the name of the level. The following will **silently** fail to update the -name of the ``MultiIndex`` +via the level. .. ipython:: python + :okexcept: mi.levels[0].name = "new name" mi.names @@ -270,52 +293,107 @@ New repr for :class:`~pandas.arrays.IntervalArray` closed='right', dtype='interval[int64]') - *pandas 1.0.0* .. ipython:: python pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) +``DataFrame.rename`` now only accepts one positional argument +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`) +- :meth:`DataFrame.rename` would previously accept positional arguments that would lead + to ambiguous or undefined behavior. From pandas 1.0, only the very first argument, which + maps labels to their new names along the default axis, is allowed to be passed by position + (:issue:`29136`). -- :meth:`SeriesGroupBy.count` -- :meth:`SeriesGroupBy.size` -- :meth:`SeriesGroupBy.nunique` -- :meth:`SeriesGroupBy.nth` +*pandas 0.25.x* + +.. code-block:: ipython + + In [1]: df = pd.DataFrame([[1]]) + In [2]: df.rename({0: 1}, {0: 2}) + FutureWarning: ...Use named arguments to resolve ambiguity... + Out[2]: + 2 + 1 1 + +*pandas 1.0.0* .. ipython:: python + :okexcept: - df = pd.DataFrame({ - "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), - "cat_2": pd.Categorical(list("AB") * 2, categories=list("ABC")), - "value": [0.1] * 4, - }) - df + df.rename({0: 1}, {0: 2}) +Note that errors will now be raised when conflicting or potentially ambiguous arguments are provided. *pandas 0.25.x* .. 
code-block:: ipython - In [2]: df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() + In [1]: df.rename({0: 1}, index={0: 2}) + Out[1]: + 0 + 1 1 + + In [2]: df.rename(mapper={0: 1}, index={0: 2}) Out[2]: - cat_1 cat_2 - A A 1 - B 1 - B A 1 - B 1 - Name: value, dtype: int64 + 0 + 2 1 + +*pandas 1.0.0* + +.. ipython:: python + :okexcept: + df.rename({0: 1}, index={0: 2}) + df.rename(mapper={0: 1}, index={0: 2}) + +You can still change the axis along which the first positional argument is applied by +supplying the ``axis`` keyword argument. + +.. ipython:: python + + df.rename({0: 1}) + df.rename({0: 1}, axis=1) + +If you would like to update both the index and column labels, be sure to use the respective +keywords. + +.. ipython:: python + + df.rename(index={0: 1}, columns={0: 2}) + +Extended verbose info output for :class:`~pandas.DataFrame` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- :meth:`DataFrame.info` now shows line numbers for the columns summary (:issue:`17304`) + +*pandas 0.25.x* + +.. code-block:: python + + >>> df = pd.DataFrame({"int_col": [1, 2, 3], + ... "text_col": ["a", "b", "c"], + ... "float_col": [0.0, 0.1, 0.2]}) + >>> df.info(verbose=True) + + RangeIndex: 3 entries, 0 to 2 + Data columns (total 3 columns): + int_col 3 non-null int64 + text_col 3 non-null object + float_col 3 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 152.0+ bytes *pandas 1.0.0* .. ipython:: python - df.groupby(["cat_1", "cat_2"], observed=False)["value"].count() + df = pd.DataFrame({"int_col": [1, 2, 3], + "text_col": ["a", "b", "c"], + "float_col": [0.0, 0.1, 0.2]}) + df.info(verbose=True) :meth:`pandas.array` inference changes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -350,6 +428,130 @@ The following methods now also correctly output values for unobserved categories As a reminder, you can specify the ``dtype`` to disable all inference. +:class:`arrays.IntegerArray` now uses :attr:`pandas.NA` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`arrays.IntegerArray` now uses :attr:`pandas.NA` rather than +:attr:`numpy.nan` as its missing value marker (:issue:`29964`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a + + [1, 2, NaN] + Length: 3, dtype: Int64 + + >>> a[2] + nan + +*pandas 1.0.0* + +.. ipython:: python + + a = pd.array([1, 2, None], dtype="Int64") + a + a[2] + +This has a few API-breaking consequences. + +**Converting to a NumPy ndarray** + +When converting to a NumPy array missing values will be ``pd.NA``, which cannot +be converted to a float. So calling ``np.asarray(integer_array, dtype="float")`` +will now raise. + +*pandas 0.25.x* + +.. code-block:: python + + >>> np.asarray(a, dtype="float") + array([ 1., 2., nan]) + +*pandas 1.0.0* + +.. ipython:: python + :okexcept: + + np.asarray(a, dtype="float") + +Use :meth:`arrays.IntegerArray.to_numpy` with an explicit ``na_value`` instead. + +.. ipython:: python + + a.to_numpy(dtype="float", na_value=np.nan) + +**Reductions can return ``pd.NA``** + +When performing a reduction such as a sum with ``skipna=False``, the result +will now be ``pd.NA`` instead of ``np.nan`` in presence of missing values +(:issue:`30958`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> pd.Series(a).sum(skipna=False) + nan + +*pandas 1.0.0* + +.. 
ipython:: python + + pd.Series(a).sum(skipna=False) + +**value_counts returns a nullable integer dtype** + +:meth:`Series.value_counts` with a nullable integer dtype now returns a nullable +integer dtype for the values. + +*pandas 0.25.x* + +.. code-block:: python + + >>> pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + dtype('int64') + +*pandas 1.0.0* + +.. ipython:: python + + pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + +See :ref:`missing_data.NA` for more on the differences between :attr:`pandas.NA` +and :attr:`numpy.nan`. + +:class:`arrays.IntegerArray` comparisons return :class:`arrays.BooleanArray` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Comparison operations on a :class:`arrays.IntegerArray` now returns a +:class:`arrays.BooleanArray` rather than a NumPy array (:issue:`29964`). + +*pandas 0.25.x* + +.. code-block:: python + + >>> a = pd.array([1, 2, None], dtype="Int64") + >>> a + + [1, 2, NaN] + Length: 3, dtype: Int64 + + >>> a > 1 + array([False, True, False]) + +*pandas 1.0.0* + +.. ipython:: python + + a = pd.array([1, 2, None], dtype="Int64") + a > 1 + +Note that missing values now propagate, rather than always comparing unequal +like :attr:`numpy.nan`. See :ref:`missing_data.NA` for more. + By default :meth:`Categorical.min` now returns the minimum instead of np.nan ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -387,7 +589,14 @@ consistent with the behaviour of :class:`DataFrame` and :class:`Index`. DeprecationWarning: The default dtype for empty Series will be 'object' instead of 'float64' in a future version. Specify a dtype explicitly to silence this warning. Series([], dtype: float64) -.. _whatsnew_1000.api_breaking.deps: +.. _whatsnew_100.api_breaking.python: + +Increased minimum version for Python +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas 1.0.0 supports Python 3.6.1 and higher (:issue:`29212`). + +.. _whatsnew_100.api_breaking.deps: Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -428,9 +637,11 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | matplotlib | 2.2.2 | | +-----------------+-----------------+---------+ +| numba | 0.46.0 | X | ++-----------------+-----------------+---------+ | openpyxl | 2.5.7 | X | +-----------------+-----------------+---------+ -| pyarrow | 0.12.0 | X | +| pyarrow | 0.13.0 | X | +-----------------+-----------------+---------+ | pymysql | 0.7.1 | | +-----------------+-----------------+---------+ @@ -453,14 +664,13 @@ Optional libraries below the lowest tested version may still work, but are not c See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. - -.. _whatsnew_1000.api.other: +.. 
_whatsnew_100.api.other: Other API changes ^^^^^^^^^^^^^^^^^ - Bumped the minimum supported version of ``s3fs`` from 0.0.8 to 0.3.0 (:issue:`28616`) -- :class:`pandas.core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`) +- :class:`core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`) - :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`) - :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`) - In order to improve tab-completion, Pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``). @@ -469,22 +679,25 @@ Other API changes - Changed the default configuration value for ``options.matplotlib.register_converters`` from ``True`` to ``"auto"`` (:issue:`18720`). Now, pandas custom formatters will only be applied to plots created by pandas, through :meth:`~DataFrame.plot`. Previously, pandas' formatters would be applied to all plots created *after* a :meth:`~DataFrame.plot`. - See :ref:`units registration ` for more. + See :ref:`units registration ` for more. - :meth:`Series.dropna` has dropped its ``**kwargs`` argument in favor of a single ``how`` parameter. Supplying anything else than ``how`` to ``**kwargs`` raised a ``TypeError`` previously (:issue:`29388`) - When testing pandas, the new minimum required version of pytest is 5.0.1 (:issue:`29664`) - :meth:`Series.str.__iter__` was deprecated and will be removed in future releases (:issue:`28277`). +- Added ```` to the list of default NA values for :meth:`read_csv` (:issue:`30821`) -.. _whatsnew_1000.api.documentation: +.. _whatsnew_100.api.documentation: Documentation Improvements ^^^^^^^^^^^^^^^^^^^^^^^^^^ - Added new section on :ref:`scale` (:issue:`28315`). -- Added sub-section Query MultiIndex in IO tools user guide (:issue:`28791`) +- Added sub-section on :ref:`io.query_multi` for HDF5 datasets (:issue:`28791`). -.. _whatsnew_1000.deprecations: +.. --------------------------------------------------------------------------- + +.. _whatsnew_100.deprecations: Deprecations ~~~~~~~~~~~~ @@ -495,31 +708,70 @@ Deprecations is equivalent to ``arr[idx.get_loc(idx_val)] = val``, which should be used instead (:issue:`28621`). 
- :func:`is_extension_type` is deprecated, :func:`is_extension_array_dtype` should be used instead (:issue:`29457`) - :func:`eval` keyword argument "truediv" is deprecated and will be removed in a future version (:issue:`29812`) -- :meth:`Categorical.take_nd` is deprecated, use :meth:`Categorical.take` instead (:issue:`27745`) +- :meth:`DateOffset.isAnchored` and :meth:`DatetOffset.onOffset` are deprecated and will be removed in a future version, use :meth:`DateOffset.is_anchored` and :meth:`DateOffset.is_on_offset` instead (:issue:`30340`) +- ``pandas.tseries.frequencies.get_offset`` is deprecated and will be removed in a future version, use ``pandas.tseries.frequencies.to_offset`` instead (:issue:`4205`) +- :meth:`Categorical.take_nd` and :meth:`CategoricalIndex.take_nd` are deprecated, use :meth:`Categorical.take` and :meth:`CategoricalIndex.take` instead (:issue:`27745`) - The parameter ``numeric_only`` of :meth:`Categorical.min` and :meth:`Categorical.max` is deprecated and replaced with ``skipna`` (:issue:`25303`) - The parameter ``label`` in :func:`lreshape` has been deprecated and will be removed in a future version (:issue:`29742`) - ``pandas.core.index`` has been deprecated and will be removed in a future version, the public classes are available in the top-level namespace (:issue:`19711`) - :func:`pandas.json_normalize` is now exposed in the top-level namespace. Usage of ``json_normalize`` as ``pandas.io.json.json_normalize`` is now deprecated and it is recommended to use ``json_normalize`` as :func:`pandas.json_normalize` instead (:issue:`27586`). +- The ``numpy`` argument of :meth:`pandas.read_json` is deprecated (:issue:`28512`). - :meth:`DataFrame.to_stata`, :meth:`DataFrame.to_feather`, and :meth:`DataFrame.to_parquet` argument "fname" is deprecated, use "path" instead (:issue:`23574`) +- The deprecated internal attributes ``_start``, ``_stop`` and ``_step`` of :class:`RangeIndex` now raise a ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`26581`) +- The ``pandas.util.testing`` module has been deprecated. Use the public API in ``pandas.testing`` documented at :ref:`api.general.testing` (:issue:`16232`). +- ``pandas.SparseArray`` has been deprecated. Use ``pandas.arrays.SparseArray`` (:class:`arrays.SparseArray`) instead. (:issue:`30642`) +- The parameter ``is_copy`` of :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`) +- Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`) +- The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`) +- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`) + +**Selecting Columns from a Grouped DataFrame** +When selecting columns from a :class:`DataFrameGroupBy` object, passing individual keys (or a tuple of keys) inside single brackets is deprecated, +a list of items should be used instead. (:issue:`23566`) For example: -.. _whatsnew_1000.prior_deprecations: +.. 
code-block:: ipython + + df = pd.DataFrame({ + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": np.random.randn(8), + "C": np.random.randn(8), + }) + g = df.groupby('A') + # single key, returns SeriesGroupBy + g['B'] -Removed SparseSeries and SparseDataFrame -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # tuple of single key, returns SeriesGroupBy + g[('B',)] + + # tuple of multiple keys, returns DataFrameGroupBy, raises FutureWarning + g[('B', 'C')] + + # multiple keys passed directly, returns DataFrameGroupBy, raises FutureWarning + # (implicitly converts the passed strings into a single tuple) + g['B', 'C'] + + # proper way, returns DataFrameGroupBy + g[['B', 'C']] + +.. --------------------------------------------------------------------------- + +.. _whatsnew_100.prior_deprecations: + +Removal of prior version deprecations/changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Removed SparseSeries and SparseDataFrame** ``SparseSeries``, ``SparseDataFrame`` and the ``DataFrame.to_sparse`` method have been removed (:issue:`28425`). We recommend using a ``Series`` or ``DataFrame`` with sparse values instead. See :ref:`sparse.migration` for help with migrating existing code. -Removal of prior version deprecations/changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. _whatsnew_1000.matplotlib_units: +.. _whatsnew_100.matplotlib_units: **Matplotlib unit registration** @@ -540,121 +792,125 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. **Other removals** -- Removed the previously deprecated "index" keyword from :func:`read_stata`, :class:`StataReader`, and :meth:`StataReader.read`, use "index_col" instead (:issue:`17328`) -- Removed the previously deprecated :meth:`StataReader.data` method, use :meth:`StataReader.read` instead (:issue:`9493`) -- Removed the previously deprecated :func:`pandas.plotting._matplotlib.tsplot`, use :meth:`Series.plot` instead (:issue:`19980`) -- :func:`pandas.tseries.converter.register` has been moved to :func:`pandas.plotting.register_matplotlib_converters` (:issue:`18307`) +- Removed the previously deprecated keyword "index" from :func:`read_stata`, :class:`StataReader`, and :meth:`StataReader.read`, use "index_col" instead (:issue:`17328`) +- Removed ``StataReader.data`` method, use :meth:`StataReader.read` instead (:issue:`9493`) +- Removed ``pandas.plotting._matplotlib.tsplot``, use :meth:`Series.plot` instead (:issue:`19980`) +- ``pandas.tseries.converter.register`` has been moved to :func:`pandas.plotting.register_matplotlib_converters` (:issue:`18307`) - :meth:`Series.plot` no longer accepts positional arguments, pass keyword arguments instead (:issue:`30003`) - :meth:`DataFrame.hist` and :meth:`Series.hist` no longer allows ``figsize="default"``, specify figure size by passinig a tuple instead (:issue:`30003`) - Floordiv of integer-dtyped array by :class:`Timedelta` now raises ``TypeError`` (:issue:`21036`) - :class:`TimedeltaIndex` and :class:`DatetimeIndex` no longer accept non-nanosecond dtype strings like "timedelta64" or "datetime64", use "timedelta64[ns]" and "datetime64[ns]" instead (:issue:`24806`) -- :func:`pandas.api.types.infer_dtype` argument ``skipna`` defaults to ``True`` instead of ``False`` (:issue:`24050`) -- Removed the previously deprecated :attr:`Series.ix` and :attr:`DataFrame.ix` (:issue:`26438`) -- Removed the previously deprecated :meth:`Index.summary` (:issue:`18217`) -- Removed the previously deprecated "fastpath" keyword from the :class:`Index` constructor 
(:issue:`23110`) -- Removed the previously deprecated :meth:`Series.get_value`, :meth:`Series.set_value`, :meth:`DataFrame.get_value`, :meth:`DataFrame.set_value` (:issue:`17739`) -- Removed the previously deprecated :meth:`Series.compound` and :meth:`DataFrame.compound` (:issue:`26405`) -- Changed the the default value of `inplace` in :meth:`DataFrame.set_index` and :meth:`Series.set_axis`. It now defaults to ``False`` (:issue:`27600`) -- Removed the previously deprecated :attr:`Series.cat.categorical`, :attr:`Series.cat.index`, :attr:`Series.cat.name` (:issue:`24751`) -- :func:`to_datetime` and :func:`to_timedelta` no longer accept "box" argument, always returns :class:`DatetimeIndex`, :class:`TimedeltaIndex`, :class:`Index`, :class:`Series`, or :class:`DataFrame` (:issue:`24486`) +- Changed the default "skipna" argument in :func:`pandas.api.types.infer_dtype` from ``False`` to ``True`` (:issue:`24050`) +- Removed ``Series.ix`` and ``DataFrame.ix`` (:issue:`26438`) +- Removed ``Index.summary`` (:issue:`18217`) +- Removed the previously deprecated keyword "fastpath" from the :class:`Index` constructor (:issue:`23110`) +- Removed ``Series.get_value``, ``Series.set_value``, ``DataFrame.get_value``, ``DataFrame.set_value`` (:issue:`17739`) +- Removed ``Series.compound`` and ``DataFrame.compound`` (:issue:`26405`) +- Changed the default "inplace" argument in :meth:`DataFrame.set_index` and :meth:`Series.set_axis` from ``None`` to ``False`` (:issue:`27600`) +- Removed ``Series.cat.categorical``, ``Series.cat.index``, ``Series.cat.name`` (:issue:`24751`) +- Removed the previously deprecated keyword "box" from :func:`to_datetime` and :func:`to_timedelta`; in addition these now always returns :class:`DatetimeIndex`, :class:`TimedeltaIndex`, :class:`Index`, :class:`Series`, or :class:`DataFrame` (:issue:`24486`) - :func:`to_timedelta`, :class:`Timedelta`, and :class:`TimedeltaIndex` no longer allow "M", "y", or "Y" for the "unit" argument (:issue:`23264`) -- Removed the previously deprecated ``time_rule`` keyword from (non-public) :func:`offsets.generate_range`, which has been moved to :func:`core.arrays._ranges.generate_range` (:issue:`24157`) +- Removed the previously deprecated keyword "time_rule" from (non-public) ``offsets.generate_range``, which has been moved to :func:`core.arrays._ranges.generate_range` (:issue:`24157`) - :meth:`DataFrame.loc` or :meth:`Series.loc` with listlike indexers and missing labels will no longer reindex (:issue:`17295`) - :meth:`DataFrame.to_excel` and :meth:`Series.to_excel` with non-existent columns will no longer reindex (:issue:`17295`) -- :func:`concat` parameter "join_axes" has been removed, use ``reindex_like`` on the result instead (:issue:`22318`) -- Removed the previously deprecated "by" keyword from :meth:`DataFrame.sort_index`, use :meth:`DataFrame.sort_values` instead (:issue:`10726`) -- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`18529`) +- Removed the previously deprecated keyword "join_axes" from :func:`concat`; use ``reindex_like`` on the result instead (:issue:`22318`) +- Removed the previously deprecated keyword "by" from :meth:`DataFrame.sort_index`, use :meth:`DataFrame.sort_values` instead (:issue:`10726`) +- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`core.groupby.DataFrameGroupBy.aggregate`, 
:meth:`core.groupby.SeriesGroupBy.aggregate`, :meth:`core.window.rolling.Rolling.aggregate` (:issue:`18529`) - Passing ``datetime64`` data to :class:`TimedeltaIndex` or ``timedelta64`` data to ``DatetimeIndex`` now raises ``TypeError`` (:issue:`23539`, :issue:`23937`) - Passing ``int64`` values to :class:`DatetimeIndex` and a timezone now interprets the values as nanosecond timestamps in UTC, not wall times in the given timezone (:issue:`24559`) - A tuple passed to :meth:`DataFrame.groupby` is now exclusively treated as a single key (:issue:`18314`) -- Removed the previously deprecated :meth:`Index.contains`, use ``key in index`` instead (:issue:`30103`) +- Removed ``Index.contains``, use ``key in index`` instead (:issue:`30103`) - Addition and subtraction of ``int`` or integer-arrays is no longer allowed in :class:`Timestamp`, :class:`DatetimeIndex`, :class:`TimedeltaIndex`, use ``obj + n * obj.freq`` instead of ``obj + n`` (:issue:`22535`) -- Removed :meth:`Series.from_array` (:issue:`18258`) -- Removed :meth:`DataFrame.from_items` (:issue:`18458`) -- Removed :meth:`DataFrame.as_matrix`, :meth:`Series.as_matrix` (:issue:`18458`) -- Removed :meth:`Series.asobject` (:issue:`18477`) -- Removed :meth:`DataFrame.as_blocks`, :meth:`Series.as_blocks`, `DataFrame.blocks`, :meth:`Series.blocks` (:issue:`17656`) +- Removed ``Series.ptp`` (:issue:`21614`) +- Removed ``Series.from_array`` (:issue:`18258`) +- Removed ``DataFrame.from_items`` (:issue:`18458`) +- Removed ``DataFrame.as_matrix``, ``Series.as_matrix`` (:issue:`18458`) +- Removed ``Series.asobject`` (:issue:`18477`) +- Removed ``DataFrame.as_blocks``, ``Series.as_blocks``, ``DataFrame.blocks``, ``Series.blocks`` (:issue:`17656`) - :meth:`pandas.Series.str.cat` now defaults to aligning ``others``, using ``join='left'`` (:issue:`27611`) - :meth:`pandas.Series.str.cat` does not accept list-likes *within* list-likes anymore (:issue:`27611`) - :meth:`Series.where` with ``Categorical`` dtype (or :meth:`DataFrame.where` with ``Categorical`` column) no longer allows setting new categories (:issue:`24114`) -- :class:`DatetimeIndex`, :class:`TimedeltaIndex`, and :class:`PeriodIndex` constructors no longer allow ``start``, ``end``, and ``periods`` keywords, use :func:`date_range`, :func:`timedelta_range`, and :func:`period_range` instead (:issue:`23919`) -- :class:`DatetimeIndex` and :class:`TimedeltaIndex` constructors no longer have a ``verify_integrity`` keyword argument (:issue:`23919`) -- ``pandas.core.internals.blocks.make_block`` no longer accepts the "fastpath" keyword(:issue:`19265`) -- :meth:`Block.make_block_same_class` no longer accepts the "dtype" keyword(:issue:`19434`) -- Removed the previously deprecated :meth:`ExtensionArray._formatting_values`. Use :attr:`ExtensionArray._formatter` instead. 
(:issue:`23601`) -- Removed the previously deprecated :meth:`MultiIndex.to_hierarchical` (:issue:`21613`) -- Removed the previously deprecated :attr:`MultiIndex.labels`, use :attr:`MultiIndex.codes` instead (:issue:`23752`) -- Removed the previously deprecated "labels" keyword from the :class:`MultiIndex` constructor, use "codes" instead (:issue:`23752`) -- Removed the previously deprecated :meth:`MultiIndex.set_labels`, use :meth:`MultiIndex.set_codes` instead (:issue:`23752`) -- Removed the previously deprecated "labels" keyword from :meth:`MultiIndex.set_codes`, :meth:`MultiIndex.copy`, :meth:`MultiIndex.drop`, use "codes" instead (:issue:`23752`) +- Removed the previously deprecated keywords "start", "end", and "periods" from the :class:`DatetimeIndex`, :class:`TimedeltaIndex`, and :class:`PeriodIndex` constructors; use :func:`date_range`, :func:`timedelta_range`, and :func:`period_range` instead (:issue:`23919`) +- Removed the previously deprecated keyword "verify_integrity" from the :class:`DatetimeIndex` and :class:`TimedeltaIndex` constructors (:issue:`23919`) +- Removed the previously deprecated keyword "fastpath" from ``pandas.core.internals.blocks.make_block`` (:issue:`19265`) +- Removed the previously deprecated keyword "dtype" from :meth:`Block.make_block_same_class` (:issue:`19434`) +- Removed ``ExtensionArray._formatting_values``. Use :attr:`ExtensionArray._formatter` instead. (:issue:`23601`) +- Removed ``MultiIndex.to_hierarchical`` (:issue:`21613`) +- Removed ``MultiIndex.labels``, use :attr:`MultiIndex.codes` instead (:issue:`23752`) +- Removed the previously deprecated keyword "labels" from the :class:`MultiIndex` constructor, use "codes" instead (:issue:`23752`) +- Removed ``MultiIndex.set_labels``, use :meth:`MultiIndex.set_codes` instead (:issue:`23752`) +- Removed the previously deprecated keyword "labels" from :meth:`MultiIndex.set_codes`, :meth:`MultiIndex.copy`, :meth:`MultiIndex.drop`, use "codes" instead (:issue:`23752`) - Removed support for legacy HDF5 formats (:issue:`29787`) - Passing a dtype alias (e.g. 
'datetime64[ns, UTC]') to :class:`DatetimeTZDtype` is no longer allowed, use :meth:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`) -- :func:`read_excel` removed support for "skip_footer" argument, use "skipfooter" instead (:issue:`18836`) +- Removed the previously deprecated keyword "skip_footer" from :func:`read_excel`; use "skipfooter" instead (:issue:`18836`) - :func:`read_excel` no longer allows an integer value for the parameter ``usecols``, instead pass a list of integers from 0 to ``usecols`` inclusive (:issue:`23635`) -- :meth:`DataFrame.to_records` no longer supports the argument "convert_datetime64" (:issue:`18902`) -- Removed the previously deprecated ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) -- Changed the default value for the "keep_tz" argument in :meth:`DatetimeIndex.to_series` to ``True`` (:issue:`23739`) -- Removed the previously deprecated :func:`api.types.is_period` and :func:`api.types.is_datetimetz` (:issue:`23917`) +- Removed the previously deprecated keyword "convert_datetime64" from :meth:`DataFrame.to_records` (:issue:`18902`) +- Removed ``IntervalIndex.from_intervals`` in favor of the :class:`IntervalIndex` constructor (:issue:`19263`) +- Changed the default "keep_tz" argument in :meth:`DatetimeIndex.to_series` from ``None`` to ``True`` (:issue:`23739`) +- Removed ``api.types.is_period`` and ``api.types.is_datetimetz`` (:issue:`23917`) - Ability to read pickles containing :class:`Categorical` instances created with pre-0.16 version of pandas has been removed (:issue:`27538`) -- Removed previously deprecated :func:`pandas.tseries.plotting.tsplot` (:issue:`18627`) -- Removed the previously deprecated ``reduce`` and ``broadcast`` arguments from :meth:`DataFrame.apply` (:issue:`18577`) -- Removed the previously deprecated ``assert_raises_regex`` function in ``pandas.util.testing`` (:issue:`29174`) +- Removed ``pandas.tseries.plotting.tsplot`` (:issue:`18627`) +- Removed the previously deprecated keywords "reduce" and "broadcast" from :meth:`DataFrame.apply` (:issue:`18577`) +- Removed the previously deprecated ``assert_raises_regex`` function in ``pandas._testing`` (:issue:`29174`) - Removed the previously deprecated ``FrozenNDArray`` class in ``pandas.core.indexes.frozen`` (:issue:`29335`) -- Removed previously deprecated "nthreads" argument from :func:`read_feather`, use "use_threads" instead (:issue:`23053`) -- Removed :meth:`Index.is_lexsorted_for_tuple` (:issue:`29305`) -- Removed support for nexted renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`DataFrameGroupBy.aggregate`, :meth:`SeriesGroupBy.aggregate`, :meth:`Rolling.aggregate` (:issue:`29608`) -- Removed the previously deprecated :meth:`Series.valid`; use :meth:`Series.dropna` instead (:issue:`18800`) -- Removed the previously properties :attr:`DataFrame.is_copy`, :attr:`Series.is_copy` (:issue:`18812`) -- Removed the previously deprecated :meth:`DataFrame.get_ftype_counts`, :meth:`Series.get_ftype_counts` (:issue:`18243`) -- Removed the previously deprecated :meth:`DataFrame.ftypes`, :meth:`Series.ftypes`, :meth:`Series.ftype` (:issue:`26744`) -- Removed the previously deprecated :meth:`Index.get_duplicates`, use ``idx[idx.duplicated()].unique()`` instead (:issue:`20239`) -- Removed the previously deprecated :meth:`Series.clip_upper`, :meth:`Series.clip_lower`, :meth:`DataFrame.clip_upper`, :meth:`DataFrame.clip_lower` (:issue:`24203`) +- Removed the previously deprecated keyword "nthreads" from 
:func:`read_feather`, use "use_threads" instead (:issue:`23053`) +- Removed ``Index.is_lexsorted_for_tuple`` (:issue:`29305`) +- Removed support for nested renaming in :meth:`DataFrame.aggregate`, :meth:`Series.aggregate`, :meth:`core.groupby.DataFrameGroupBy.aggregate`, :meth:`core.groupby.SeriesGroupBy.aggregate`, :meth:`core.window.rolling.Rolling.aggregate` (:issue:`29608`) +- Removed ``Series.valid``; use :meth:`Series.dropna` instead (:issue:`18800`) +- Removed ``DataFrame.is_copy``, ``Series.is_copy`` (:issue:`18812`) +- Removed ``DataFrame.get_ftype_counts``, ``Series.get_ftype_counts`` (:issue:`18243`) +- Removed ``DataFrame.ftypes``, ``Series.ftypes``, ``Series.ftype`` (:issue:`26744`) +- Removed ``Index.get_duplicates``, use ``idx[idx.duplicated()].unique()`` instead (:issue:`20239`) +- Removed ``Series.clip_upper``, ``Series.clip_lower``, ``DataFrame.clip_upper``, ``DataFrame.clip_lower`` (:issue:`24203`) - Removed the ability to alter :attr:`DatetimeIndex.freq`, :attr:`TimedeltaIndex.freq`, or :attr:`PeriodIndex.freq` (:issue:`20772`) -- Removed the previously deprecated :attr:`DatetimeIndex.offset` (:issue:`20730`) -- Removed the previously deprecated :meth:`DatetimeIndex.asobject`, :meth:`TimedeltaIndex.asobject`, :meth:`PeriodIndex.asobject`, use ``astype(object)`` instead (:issue:`29801`) -- Removed previously deprecated "order" argument from :func:`factorize` (:issue:`19751`) -- :func:`read_stata` and :meth:`DataFrame.to_stata` no longer supports the "encoding" argument (:issue:`21400`) -- In :func:`concat` the default value for ``sort`` has been changed from ``None`` to ``False`` (:issue:`20613`) -- Removed previously deprecated "raise_conflict" argument from :meth:`DataFrame.update`, use "errors" instead (:issue:`23585`) -- Removed previously deprecated keyword "n" from :meth:`DatetimeIndex.shift`, :meth:`TimedeltaIndex.shift`, :meth:`PeriodIndex.shift`, use "periods" instead (:issue:`22458`) -- Removed previously deprecated keywords ``how``, ``fill_method``, and ``limit`` from :meth:`DataFrame.resample` (:issue:`30139`) +- Removed ``DatetimeIndex.offset`` (:issue:`20730`) +- Removed ``DatetimeIndex.asobject``, ``TimedeltaIndex.asobject``, ``PeriodIndex.asobject``, use ``astype(object)`` instead (:issue:`29801`) +- Removed the previously deprecated keyword "order" from :func:`factorize` (:issue:`19751`) +- Removed the previously deprecated keyword "encoding" from :func:`read_stata` and :meth:`DataFrame.to_stata` (:issue:`21400`) +- Changed the default "sort" argument in :func:`concat` from ``None`` to ``False`` (:issue:`20613`) +- Removed the previously deprecated keyword "raise_conflict" from :meth:`DataFrame.update`, use "errors" instead (:issue:`23585`) +- Removed the previously deprecated keyword "n" from :meth:`DatetimeIndex.shift`, :meth:`TimedeltaIndex.shift`, :meth:`PeriodIndex.shift`, use "periods" instead (:issue:`22458`) +- Removed the previously deprecated keywords "how", "fill_method", and "limit" from :meth:`DataFrame.resample` (:issue:`30139`) - Passing an integer to :meth:`Series.fillna` or :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype now raises ``TypeError`` (:issue:`24694`) - Passing multiple axes to :meth:`DataFrame.dropna` is no longer supported (:issue:`20995`) -- Removed previously deprecated :meth:`Series.nonzero`, use `to_numpy().nonzero()` instead (:issue:`24048`) +- Removed ``Series.nonzero``, use ``to_numpy().nonzero()`` instead (:issue:`24048`) - Passing floating dtype ``codes`` to :meth:`Categorical.from_codes` is no longer 
supported, pass ``codes.astype(np.int64)`` instead (:issue:`21775`) -- :meth:`Series.str.partition` and :meth:`Series.str.rpartition` no longer accept "pat" keyword, use "sep" instead (:issue:`23767`) -- Removed the previously deprecated :meth:`Series.put` (:issue:`27106`) -- Removed the previously deprecated :attr:`Series.real`, :attr:`Series.imag` (:issue:`27106`) -- Removed the previously deprecated :meth:`Series.to_dense`, :meth:`DataFrame.to_dense` (:issue:`26684`) -- Removed the previously deprecated :meth:`Index.dtype_str`, use ``str(index.dtype)`` instead (:issue:`27106`) +- Removed the previously deprecated keyword "pat" from :meth:`Series.str.partition` and :meth:`Series.str.rpartition`, use "sep" instead (:issue:`23767`) +- Removed ``Series.put`` (:issue:`27106`) +- Removed ``Series.real``, ``Series.imag`` (:issue:`27106`) +- Removed ``Series.to_dense``, ``DataFrame.to_dense`` (:issue:`26684`) +- Removed ``Index.dtype_str``, use ``str(index.dtype)`` instead (:issue:`27106`) - :meth:`Categorical.ravel` returns a :class:`Categorical` instead of a ``ndarray`` (:issue:`27199`) - The 'outer' method on Numpy ufuncs, e.g. ``np.subtract.outer`` operating on :class:`Series` objects is no longer supported, and will raise ``NotImplementedError`` (:issue:`27198`) -- Removed previously deprecated :meth:`Series.get_dtype_counts` and :meth:`DataFrame.get_dtype_counts` (:issue:`27145`) -- Changed the default ``fill_value`` in :meth:`Categorical.take` from ``True`` to ``False`` (:issue:`20841`) -- Changed the default value for the `raw` argument in :func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `, -- :func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` to ``False`` (:issue:`20584`) +- Removed ``Series.get_dtype_counts`` and ``DataFrame.get_dtype_counts`` (:issue:`27145`) +- Changed the default "fill_value" argument in :meth:`Categorical.take` from ``True`` to ``False`` (:issue:`20841`) +- Changed the default value for the `raw` argument in :func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `, :func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` from ``None`` to ``False`` (:issue:`20584`) - Removed deprecated behavior of :meth:`Series.argmin` and :meth:`Series.argmax`, use :meth:`Series.idxmin` and :meth:`Series.idxmax` for the old behavior (:issue:`16955`) - Passing a tz-aware ``datetime.datetime`` or :class:`Timestamp` into the :class:`Timestamp` constructor with the ``tz`` argument now raises a ``ValueError`` (:issue:`23621`) -- Removed the previously deprecated :attr:`Series.base`, :attr:`Index.base`, :attr:`Categorical.base`, :attr:`Series.flags`, :attr:`Index.flags`, :attr:`PeriodArray.flags`, :attr:`Series.strides`, :attr:`Index.strides`, :attr:`Series.itemsize`, :attr:`Index.itemsize`, :attr:`Series.data`, :attr:`Index.data` (:issue:`20721`) +- Removed ``Series.base``, ``Index.base``, ``Categorical.base``, ``Series.flags``, ``Index.flags``, ``PeriodArray.flags``, ``Series.strides``, ``Index.strides``, ``Series.itemsize``, ``Index.itemsize``, ``Series.data``, ``Index.data`` (:issue:`20721`) - Changed :meth:`Timedelta.resolution` to match the behavior of the standard library ``datetime.timedelta.resolution``, for the old behavior, use :meth:`Timedelta.resolution_string` (:issue:`26839`) -- Removed previously deprecated :attr:`Timestamp.weekday_name`, :attr:`DatetimeIndex.weekday_name`, and :attr:`Series.dt.weekday_name` (:issue:`18164`) -- Removed previously deprecated ``errors`` argument in 
:meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` (:issue:`22644`) -- Changed the default value for ``ordered`` in :class:`CategoricalDtype` from ``None`` to ``False`` (:issue:`26336`) +- Removed ``Timestamp.weekday_name``, ``DatetimeIndex.weekday_name``, and ``Series.dt.weekday_name`` (:issue:`18164`) +- Removed the previously deprecated keyword "errors" in :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` (:issue:`22644`) +- Changed the default "ordered" argument in :class:`CategoricalDtype` from ``None`` to ``False`` (:issue:`26336`) - :meth:`Series.set_axis` and :meth:`DataFrame.set_axis` now require "labels" as the first argument and "axis" as an optional named parameter (:issue:`30089`) -- Removed the previously deprecated :func:`to_msgpack`, :func:`read_msgpack`, :meth:`DataFrame.to_msgpack`, :meth:`Series.to_msgpack` (:issue:`27103`) -- +- Removed ``to_msgpack``, ``read_msgpack``, ``DataFrame.to_msgpack``, ``Series.to_msgpack`` (:issue:`27103`) +- Removed ``Series.compress`` (:issue:`21930`) - Removed the previously deprecated keyword "fill_value" from :meth:`Categorical.fillna`, use "value" instead (:issue:`19269`) - Removed the previously deprecated keyword "data" from :func:`andrews_curves`, use "frame" instead (:issue:`6956`) - Removed the previously deprecated keyword "data" from :func:`parallel_coordinates`, use "frame" instead (:issue:`6956`) - Removed the previously deprecated keyword "colors" from :func:`parallel_coordinates`, use "color" instead (:issue:`6956`) - Removed the previously deprecated keywords "verbose" and "private_key" from :func:`read_gbq` (:issue:`30200`) +- Calling ``np.array`` and ``np.asarray`` on tz-aware :class:`Series` and :class:`DatetimeIndex` will now return an object array of tz-aware :class:`Timestamp` (:issue:`24596`) - -.. _whatsnew_1000.performance: +.. --------------------------------------------------------------------------- + +.. _whatsnew_100.performance: Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement in :class:`DataFrame` arithmetic and comparison operations with scalars (:issue:`24990`, :issue:`29853`) - Performance improvement in indexing with a non-unique :class:`IntervalIndex` (:issue:`27489`) - Performance improvement in :attr:`MultiIndex.is_monotonic` (:issue:`27495`) - Performance improvement in :func:`cut` when ``bins`` is an :class:`IntervalIndex` (:issue:`27668`) @@ -669,7 +925,9 @@ Performance improvements - Performance improvement in :meth:`Index.equals` and :meth:`MultiIndex.equals` (:issue:`29134`) - Performance improvement in :func:`~pandas.api.types.infer_dtype` when ``skipna`` is ``True`` (:issue:`28814`) -.. _whatsnew_1000.bug_fixes: +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_100.bug_fixes: Bug fixes ~~~~~~~~~ @@ -691,6 +949,11 @@ Categorical :class:`Categorical` with duplicate entries, the accessor was skipping duplicates (:issue:`27952`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` that would give incorrect results on categorical data (:issue:`26988`) - Bug where calling :meth:`Categorical.min` or :meth:`Categorical.max` on an empty Categorical would raise a numpy exception (:issue:`30227`) +- The following methods now also correctly output values for unobserved categories when called through ``groupby(..., observed=False)`` (:issue:`17605`) + * :meth:`core.groupby.SeriesGroupBy.count` + * :meth:`core.groupby.SeriesGroupBy.size` + * :meth:`core.groupby.SeriesGroupBy.nunique` + * :meth:`core.groupby.SeriesGroupBy.nth` Datetimelike @@ -699,22 +962,31 @@ Datetimelike - Bug in :meth:`Series.dt` property lookups when the underlying data is read-only (:issue:`27529`) - Bug in ``HDFStore.__getitem__`` incorrectly reading tz attribute created in Python 2 (:issue:`26443`) - Bug in :func:`to_datetime` where passing arrays of malformed ``str`` with errors="coerce" could incorrectly lead to raising ``ValueError`` (:issue:`28299`) -- Bug in :meth:`pandas.core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`) +- Bug in :meth:`core.groupby.SeriesGroupBy.nunique` where ``NaT`` values were interfering with the count of unique values (:issue:`27951`) - Bug in :class:`Timestamp` subtraction when subtracting a :class:`Timestamp` from a ``np.datetime64`` object incorrectly raising ``TypeError`` (:issue:`28286`) - Addition and subtraction of integer or integer-dtype arrays with :class:`Timestamp` will now raise ``NullFrequencyError`` instead of ``ValueError`` (:issue:`28268`) - Bug in :class:`Series` and :class:`DataFrame` with integer dtype failing to raise ``TypeError`` when adding or subtracting a ``np.datetime64`` object (:issue:`28080`) +- Bug in :meth:`Series.astype`, :meth:`Index.astype`, and :meth:`DataFrame.astype` failing to handle ``NaT`` when casting to an integer dtype (:issue:`28492`) - Bug in :class:`Week` with ``weekday`` incorrectly raising ``AttributeError`` instead of ``TypeError`` when adding or subtracting an invalid type (:issue:`28530`) - Bug in :class:`DataFrame` arithmetic operations when operating with a :class:`Series` with dtype `'timedelta64[ns]'` (:issue:`28049`) -- Bug in :func:`pandas.core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) +- Bug in :func:`core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) - Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`) - Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`) - Bug in :meth:`DatetimeIndex.strftime` and :meth:`Series.dt.strftime` where ``NaT`` was converted to the string ``'NaT'`` instead of ``np.nan`` (:issue:`29578`) +- Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`) - Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`) - Bug in 
:func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`) - Bug in :func:`pandas.to_datetime` failing for `deques` when using ``cache=True`` (the default) (:issue:`29403`) - Bug in :meth:`Series.item` with ``datetime64`` or ``timedelta64`` dtype, :meth:`DatetimeIndex.item`, and :meth:`TimedeltaIndex.item` returning an integer instead of a :class:`Timestamp` or :class:`Timedelta` (:issue:`30175`) - Bug in :class:`DatetimeIndex` addition when adding a non-optimized :class:`DateOffset` incorrectly dropping timezone information (:issue:`30336`) +- Bug in :meth:`DataFrame.drop` where attempting to drop non-existent values from a DatetimeIndex would yield a confusing error message (:issue:`30399`) - Bug in :meth:`DataFrame.append` would remove the timezone-awareness of new data (:issue:`30238`) +- Bug in :meth:`Series.cummin` and :meth:`Series.cummax` with timezone-aware dtype incorrectly dropping its timezone (:issue:`15553`) +- Bug in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` where inplace addition and subtraction did not actually operate inplace (:issue:`24115`) +- Bug in :func:`pandas.to_datetime` when called with ``Series`` storing ``IntegerArray`` raising ``TypeError`` instead of returning ``Series`` (:issue:`30050`) +- Bug in :func:`date_range` with custom business hours as ``freq`` and given number of ``periods`` (:issue:`30593`) +- Bug in :class:`PeriodIndex` comparisons with incorrectly casting integers to :class:`Period` objects, inconsistent with the :class:`Period` comparison behavior (:issue:`30722`) +- Bug in :meth:`DatetimeIndex.insert` raising a ``ValueError`` instead of a ``TypeError`` when trying to insert a timezone-aware :class:`Timestamp` into a timezone-naive :class:`DatetimeIndex`, or vice-versa (:issue:`30806`) Timedelta ^^^^^^^^^ @@ -743,6 +1015,8 @@ Numeric - Bug in :class:`NumericIndex` construction that caused :class:`UInt64Index` to be casted to :class:`Float64Index` when integers in the ``np.uint64`` range were used to index a :class:`DataFrame` (:issue:`28279`) - Bug in :meth:`Series.interpolate` when using method=`index` with an unsorted index, would previously return incorrect results. (:issue:`21037`) - Bug in :meth:`DataFrame.round` where a :class:`DataFrame` with a :class:`CategoricalIndex` of :class:`IntervalIndex` columns would incorrectly raise a ``TypeError`` (:issue:`30063`) +- Bug in :meth:`Series.pct_change` and :meth:`DataFrame.pct_change` when there are duplicated indices (:issue:`30463`) +- Bug in :class:`DataFrame` cumulative operations (e.g. 
cumsum, cummax) incorrect casting to object-dtype (:issue:`19296`) Conversion ^^^^^^^^^^ @@ -753,7 +1027,7 @@ Conversion Strings ^^^^^^^ -- Calling :meth:`Series.str.isalnum` (and other "ismethods") on an empty Series would return an object dtype instead of bool (:issue:`29624`) +- Calling :meth:`Series.str.isalnum` (and other "ismethods") on an empty ``Series`` would return an ``object`` dtype instead of ``bool`` (:issue:`29624`) - @@ -762,6 +1036,9 @@ Interval - Bug in :meth:`IntervalIndex.get_indexer` where a :class:`Categorical` or :class:`CategoricalIndex` ``target`` would incorrectly raise a ``TypeError`` (:issue:`30063`) - Bug in ``pandas.core.dtypes.cast.infer_dtype_from_scalar`` where passing ``pandas_dtype=True`` did not infer :class:`IntervalDtype` (:issue:`30337`) +- Bug in :class:`Series` constructor where constructing a ``Series`` from a ``list`` of :class:`Interval` objects resulted in ``object`` dtype instead of :class:`IntervalDtype` (:issue:`23563`) +- Bug in :class:`IntervalDtype` where the ``kind`` attribute was incorrectly set as ``None`` instead of ``"O"`` (:issue:`30568`) +- Bug in :class:`IntervalIndex`, :class:`~arrays.IntervalArray`, and :class:`Series` with interval data where equality comparisons were incorrect (:issue:`24112`) Indexing ^^^^^^^^ @@ -773,8 +1050,11 @@ Indexing - Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`) - :meth:`Index.union` could fail when the left contained duplicates (:issue:`28257`) - Bug when indexing with ``.loc`` where the index was a :class:`CategoricalIndex` with non-string categories didn't work (:issue:`17569`, :issue:`30225`) -- :meth:`Index.get_indexer_non_unique` could fail with `TypeError` in some cases, such as when searching for ints in a string index (:issue:`28257`) +- :meth:`Index.get_indexer_non_unique` could fail with ``TypeError`` in some cases, such as when searching for ints in a string index (:issue:`28257`) - Bug in :meth:`Float64Index.get_loc` incorrectly raising ``TypeError`` instead of ``KeyError`` (:issue:`29189`) +- :meth:`MultiIndex.get_loc` can't find missing values when input includes missing values (:issue:`19132`) +- Bug in :meth:`Series.__setitem__` incorrectly assigning values with boolean indexer when the length of new data matches the number of ``True`` values and new data is not a ``Series`` or an ``np.array`` (:issue:`30567`) +- Bug in indexing with a :class:`PeriodIndex` incorrectly accepting integers representing years, use e.g. ``ser.loc["2007"]`` instead of ``ser.loc[2007]`` (:issue:`30763`) Missing ^^^^^^^ @@ -785,8 +1065,8 @@ Missing MultiIndex ^^^^^^^^^^ -- Constructior for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) (:issue:`28735`) -- +- Constructor for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) (:issue:`28735`) +- Series and MultiIndex `.drop` with `MultiIndex` now raise an exception if labels are not present in the given level (:issue:`8594`) - I/O @@ -810,41 +1090,47 @@ I/O - Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`) - Bug in :class:`PythonParser` where str and bytes were being mixed when dealing with the decimal field (:issue:`29650`) - :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. 
(:issue:`29857`) +- Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by `record_path` would raise a ``TypeError`` (:issue:`30148`) +- :func:`read_excel` now accepts binary data (:issue:`15914`) +- Bug in :meth:`read_csv` in which encoding handling was limited to just the string `utf-16` for the C engine (:issue:`24130`) Plotting ^^^^^^^^ - Bug in :meth:`Series.plot` not able to plot boolean values (:issue:`23719`) -- - Bug in :meth:`DataFrame.plot` not able to plot when no rows (:issue:`27758`) - Bug in :meth:`DataFrame.plot` producing incorrect legend markers when plotting multiple series on the same axis (:issue:`18222`) - Bug in :meth:`DataFrame.plot` when ``kind='box'`` and data contains datetime or timedelta data. These types are now automatically dropped (:issue:`22799`) - Bug in :meth:`DataFrame.plot.line` and :meth:`DataFrame.plot.area` produce wrong xlim in x-axis (:issue:`27686`, :issue:`25160`, :issue:`24784`) -- Bug where :meth:`DataFrame.boxplot` would not accept a `color` parameter like `DataFrame.plot.box` (:issue:`26214`) +- Bug where :meth:`DataFrame.boxplot` would not accept a ``color`` parameter like :meth:`DataFrame.plot.box` (:issue:`26214`) - Bug in the ``xticks`` argument being ignored for :meth:`DataFrame.plot.bar` (:issue:`14119`) - :func:`set_option` now validates that the plot backend provided to ``'plotting.backend'`` implements the backend when the option is set, rather than when a plot is created (:issue:`28163`) -- :meth:`DataFrame.plot` now allow a ``backend`` keyword arugment to allow changing between backends in one session (:issue:`28619`). +- :meth:`DataFrame.plot` now allows a ``backend`` keyword argument for changing between backends in one session (:issue:`28619`). - Bug in color validation incorrectly raising for non-color styles (:issue:`29122`). +- Allow :meth:`DataFrame.plot.scatter` to plot ``objects`` and ``datetime`` type data (:issue:`18755`, :issue:`30391`) +- Bug in :meth:`DataFrame.hist`, ``xrot=0`` does not work with ``by`` and subplots (:issue:`30288`). Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` only showing output from a single group when function returns an :class:`Index` (:issue:`28652`) - Bug in :meth:`DataFrame.groupby` with multiple groups where an ``IndexError`` would be raised if any group contained all NA values (:issue:`20519`) -- Bug in :meth:`pandas.core.resample.Resampler.size` and :meth:`pandas.core.resample.Resampler.count` returning wrong dtype when used with an empty series or dataframe (:issue:`28427`) +- Bug in :meth:`pandas.core.resample.Resampler.size` and :meth:`pandas.core.resample.Resampler.count` returning wrong dtype when used with an empty :class:`Series` or :class:`DataFrame` (:issue:`28427`) - Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue:`28192`) - Bug in :meth:`DataFrame.rolling` not allowing rolling over multi-index levels (:issue:`15584`). - Bug in :meth:`DataFrame.rolling` not allowing rolling on monotonic decreasing time indexes (:issue:`19248`). 
- Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) -- Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) +- Bug in :meth:`core.groupby.DataFrameGroupBy.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) - Remove error raised due to duplicated input functions in named aggregation in :meth:`DataFrame.groupby` and :meth:`Series.groupby`. Previously error will be raised if the same function is applied on the same column and now it is allowed if new assigned names are different. (:issue:`28426`) -- :meth:`SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue: 28479) -- Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) +- :meth:`core.groupby.SeriesGroupBy.value_counts` will be able to handle the case even when the :class:`Grouper` makes empty groups (:issue:`28479`) +- Bug in :meth:`core.window.rolling.Rolling.quantile` ignoring ``interpolation`` keyword argument when used within a groupby (:issue:`28779`) - Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) -- Bug in :meth:`DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`) +- Bug in :meth:`core.groupby.DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrectly casting results to the original dtype (:issue:`29641`) - Bug in :meth:`DataFrame.groupby` when using axis=1 and having a single level columns index (:issue:`30208`) - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`) +- Bug in :meth:`GroupBy.quantile` with a list-like ``q`` value and integer column names (:issue:`30289`) +- Bug in :meth:`GroupBy.pct_change` and :meth:`core.groupby.SeriesGroupBy.pct_change` causes ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`) Reshaping ^^^^^^^^^ @@ -857,17 +1143,20 @@ Reshaping - :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`) - Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`). - Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) -- Bug :meth:`Series.pct_change` where supplying an anchored frequency would throw a ValueError (:issue:`28664`) +- Bug in :meth:`Index.join` that caused infinite recursion error for mismatched ``MultiIndex`` name orders. 
(:issue:`25760`, :issue:`28956`) +- Bug in :meth:`Series.pct_change` where supplying an anchored frequency would throw a ``ValueError`` (:issue:`28664`) - Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`) - Bug in :meth:`DataFrame.replace` that caused non-numeric replacer's dtype not respected (:issue:`26632`) - Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`) +- Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`) - Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`) -- +- Improved error message and docstring in :func:`cut` and :func:`qcut` when `labels=True` (:issue:`13318`) +- Bug in missing `fill_na` parameter to :meth:`DataFrame.unstack` with list of levels (:issue:`30740`) Sparse ^^^^^^ - Bug in :class:`SparseDataFrame` arithmetic operations incorrectly casting inputs to float (:issue:`28107`) -- +- Bug in ``DataFrame.sparse`` returning a ``Series`` when there was a column named ``sparse`` rather than the accessor (:issue:`30758`) - ExtensionArray @@ -875,7 +1164,7 @@ ExtensionArray - Bug in :class:`arrays.PandasArray` when setting a scalar string (:issue:`28118`, :issue:`28150`). - Bug where nullable integers could not be compared to strings (:issue:`28930`) -- Bug where :class:`DataFrame` constructor raised ValueError with list-like data and ``dtype`` specified (:issue:`30280`) +- Bug where :class:`DataFrame` constructor raised ``ValueError`` with list-like data and ``dtype`` specified (:issue:`30280`) Other @@ -886,15 +1175,26 @@ Other - Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`) - :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`) - Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`) +- Backtick quoting in :meth:`DataFrame.query` and :meth:`DataFrame.eval` can now also be used for invalid identifiers such as names that start with a digit, are Python keywords, or contain single character operators. 
(:issue:`27017`) +- Bug in ``pd.core.util.hashing.hash_pandas_object`` where arrays containing tuples were incorrectly treated as non-hashable (:issue:`28969`) - Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`) - Fix :class:`AbstractHolidayCalendar` to return correct results for years after 2030 (now goes up to 2200) (:issue:`27790`) -- Fixed :class:`IntegerArray` returning ``inf`` rather than ``NaN`` for operations dividing by 0 (:issue:`27398`) -- Fixed ``pow`` operations for :class:`IntegerArray` when the other value is ``0`` or ``1`` (:issue:`29997`) +- Fixed :class:`~arrays.IntegerArray` returning ``inf`` rather than ``NaN`` for operations dividing by ``0`` (:issue:`27398`) +- Fixed ``pow`` operations for :class:`~arrays.IntegerArray` when the other value is ``0`` or ``1`` (:issue:`29997`) - Bug in :meth:`Series.count` raises if use_inf_as_na is enabled (:issue:`29478`) +- Bug in :class:`Index` where a non-hashable name could be set without raising ``TypeError`` (:issue:`29069`) +- Bug in :class:`DataFrame` constructor when passing a 2D ``ndarray`` and an extension dtype (:issue:`12513`) +- Bug in :meth:`DataFrame.to_csv` when supplied a series with a ``dtype="string"`` and a ``na_rep``, the ``na_rep`` was being truncated to 2 characters. (:issue:`29975`) +- Bug where :meth:`DataFrame.itertuples` would incorrectly determine whether or not namedtuples could be used for dataframes of 255 columns (:issue:`28282`) +- Handle nested NumPy ``object`` arrays in :func:`testing.assert_series_equal` for ExtensionArray implementations (:issue:`30841`) +- Bug in :class:`Index` constructor incorrectly allowing 2-dimensional input arrays (:issue:`13601`, :issue:`27125`) +.. --------------------------------------------------------------------------- -.. _whatsnew_1000.contributors: +.. _whatsnew_100.contributors: Contributors ~~~~~~~~~~~~ + +.. contributors:: v0.25.3..v1.0.0rc0 diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst new file mode 100644 index 0000000000000..01c089b46b4a1 --- /dev/null +++ b/doc/source/whatsnew/v1.1.0.rst @@ -0,0 +1,172 @@ +.. _whatsnew_110: + +What's new in 1.1.0 (??) +------------------------ + +These are the changes in pandas 1.1.0. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_110.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) +- +- + + +.. --------------------------------------------------------------------------- + +.. _whatsnew_110.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- +- + +.. --------------------------------------------------------------------------- + + +.. _whatsnew_110.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- +- + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_110.bug_fixes: + +Bug fixes +~~~~~~~~~ + + +Categorical +^^^^^^^^^^^ + +- +- + +Datetimelike +^^^^^^^^^^^^ +- Bug in :class:`Timestamp` where constructing :class:`Timestamp` from ambiguous epoch time and calling constructor again changed :meth:`Timestamp.value` property (:issue:`24329`) +- :meth:`DatetimeArray.searchsorted`, :meth:`TimedeltaArray.searchsorted`, :meth:`PeriodArray.searchsorted` not recognizing non-pandas scalars and incorrectly raising ``ValueError`` instead of ``TypeError`` (:issue:`30950`) +- + +Timedelta +^^^^^^^^^ + +- +- + +Timezones +^^^^^^^^^ + +- +- + + +Numeric +^^^^^^^ +- +- + +Conversion +^^^^^^^^^^ +- Bug in :class:`Series` construction from NumPy array with big-endian ``datetime64`` dtype (:issue:`29684`) +- +- + +Strings +^^^^^^^ + +- +- + + +Interval +^^^^^^^^ + +- +- + +Indexing +^^^^^^^^ +- Bug in slicing on a :class:`DatetimeIndex` with a partial-timestamp dropping high-resolution indices near the end of a year, quarter, or month (:issue:`31064`) +- +- + +Missing +^^^^^^^ + +- +- + +MultiIndex +^^^^^^^^^^ + +- +- + +I/O +^^^ + +- +- + +Plotting +^^^^^^^^ + +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) + +Reshaping +^^^^^^^^^ + +- +- Bug in :meth:`DataFrame.pivot_table` when only MultiIndexed columns is set (:issue:`17038`) +- Fix incorrect error message in :meth:`DataFrame.pivot` when ``columns`` is set to ``None``. (:issue:`30924`) +- Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. (:issue:`18321`) + + +Sparse +^^^^^^ + +- +- + +ExtensionArray +^^^^^^^^^^^^^^ + +- +- + + +Other +^^^^^ +- Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True`` + instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`) +- + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_110.contributors: + +Contributors +~~~~~~~~~~~~ diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py index fdc5a6b283ba8..f394aac5c545b 100755 --- a/doc/sphinxext/announce.py +++ b/doc/sphinxext/announce.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- encoding:utf-8 -*- """ Script to generate contributor and pull request lists diff --git a/environment.yml b/environment.yml index 2b171d097a693..5f1184e921119 100644 --- a/environment.yml +++ b/environment.yml @@ -27,13 +27,13 @@ dependencies: # documentation - gitpython # obtain contributors from git for whatsnew - sphinx - - numpydoc>=0.9.0 # documentation (jupyter notebooks) - nbconvert>=5.4.1 - nbsphinx - pandoc - # Dask and its dependencies + + # Dask and its dependencies (that dont install with dask) - dask-core - toolz>=0.7.3 - fsspec>=0.5.1 @@ -54,6 +54,9 @@ dependencies: - pytest>=5.0.1 - pytest-cov - pytest-xdist>=1.21 + - pytest-asyncio + + # downstream tests - seaborn - statsmodels @@ -67,29 +70,38 @@ dependencies: - blosc - bottleneck>=1.2.1 - ipykernel - - ipython>=5.6.0 + - ipython>=7.11.1 - jinja2 # pandas.Styler - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot - numexpr>=2.6.8 - scipy>=1.1 + - numba>=0.46.0 # optional for io - - beautifulsoup4>=4.6.0 # pandas.read_html + # --------------- + # pd.read_html + - beautifulsoup4>=4.6.0 + - html5lib + - lxml + + # pd.read_excel, DataFrame.to_excel, pd.ExcelWriter, pd.ExcelFile + - openpyxl<=3.0.1 + - xlrd + - xlsxwriter + - xlwt + - odfpy + - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet - - html5lib # pandas.read_html - - lxml # pandas.read_html - - openpyxl<=3.0.1 # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - pyarrow>=0.13.1 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather + - python-snappy # required by pyarrow + - pyqt>=5.9.2 # pandas.read_clipboard - pytables>=3.4.2 # pandas.read_hdf, DataFrame.to_hdf - - python-snappy # required by pyarrow - s3fs # pandas.read_csv... when using 's3://...' path - sqlalchemy # pandas.read_sql, DataFrame.to_sql - xarray # DataFrame.to_xarray - - xlrd # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - - xlsxwriter # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - - xlwt # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - - odfpy # pandas.read_excel - pyreadstat # pandas.read_spss + - tabulate>=0.8.3 # DataFrame.to_markdown - pip: - git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master + - git+https://github.com/numpy/numpydoc diff --git a/pandas/__init__.py b/pandas/__init__.py index 30b7e5bafe1df..d526531b159b2 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -10,7 +10,7 @@ try: __import__(dependency) except ImportError as e: - missing_dependencies.append("{0}: {1}".format(dependency, str(e))) + missing_dependencies.append(f"{dependency}: {e}") if missing_dependencies: raise ImportError( @@ -33,14 +33,11 @@ # hack but overkill to use re module = str(e).replace("cannot import name ", "") raise ImportError( - "C extension: {0} not built. If you want to import " + f"C extension: {module} not built. If you want to import " "pandas from the source directory, you may need to run " - "'python setup.py build_ext --inplace --force' to build " - "the C extensions first.".format(module) + "'python setup.py build_ext --inplace --force' to build the C extensions first." 
) -from datetime import datetime - from pandas._config import ( get_option, set_option, @@ -105,7 +102,6 @@ to_datetime, to_timedelta, # misc - np, Grouper, factorize, unique, @@ -118,7 +114,7 @@ DataFrame, ) -from pandas.core.arrays.sparse import SparseArray, SparseDtype +from pandas.core.arrays.sparse import SparseDtype from pandas.tseries.api import infer_freq from pandas.tseries import offsets @@ -141,6 +137,7 @@ qcut, ) +import pandas.api from pandas.util._print_versions import show_versions from pandas.io.api import ( @@ -189,7 +186,6 @@ __git_version__ = v.get("full-revisionid") del get_versions, v - # GH 27101 # TODO: remove Panel compat in 1.0 if pandas.compat.PY37: @@ -201,8 +197,7 @@ def __getattr__(name): warnings.warn( "The Panel class is removed from pandas. Accessing it " - "from the top-level namespace will also be removed in " - "the next version", + "from the top-level namespace will also be removed in the next version", FutureWarning, stacklevel=2, ) @@ -211,18 +206,57 @@ class Panel: pass return Panel + + elif name == "datetime": + warnings.warn( + "The pandas.datetime class is deprecated " + "and will be removed from pandas in a future version. " + "Import from datetime module instead.", + FutureWarning, + stacklevel=2, + ) + + from datetime import datetime as dt + + return dt + + elif name == "np": + + warnings.warn( + "The pandas.np module is deprecated " + "and will be removed from pandas in a future version. " + "Import numpy directly instead", + FutureWarning, + stacklevel=2, + ) + import numpy as np + + return np + elif name in {"SparseSeries", "SparseDataFrame"}: warnings.warn( - "The {} class is removed from pandas. Accessing it from " - "the top-level namespace will also be removed in the next " - "version".format(name), + f"The {name} class is removed from pandas. Accessing it from " + "the top-level namespace will also be removed in the next version", FutureWarning, stacklevel=2, ) return type(name, (), {}) - raise AttributeError("module 'pandas' has no attribute '{}'".format(name)) + elif name == "SparseArray": + + warnings.warn( + "The pandas.SparseArray class is deprecated " + "and will be removed from pandas in a future version. " + "Use pandas.arrays.SparseArray instead.", + FutureWarning, + stacklevel=2, + ) + from pandas.core.arrays.sparse import SparseArray as _SparseArray + + return _SparseArray + + raise AttributeError(f"module 'pandas' has no attribute '{name}'") else: @@ -236,6 +270,96 @@ class SparseDataFrame: class SparseSeries: pass + class __numpy: + def __init__(self): + import numpy as np + import warnings + + self.np = np + self.warnings = warnings + + def __getattr__(self, item): + self.warnings.warn( + "The pandas.np module is deprecated " + "and will be removed from pandas in a future version. 
" + "Import numpy directly instead", + FutureWarning, + stacklevel=2, + ) + + try: + return getattr(self.np, item) + except AttributeError: + raise AttributeError(f"module numpy has no attribute {item}") + + np = __numpy() + + class __Datetime(type): + + from datetime import datetime as dt + + datetime = dt + + def __getattr__(cls, item): + cls.emit_warning() + + try: + return getattr(cls.datetime, item) + except AttributeError: + raise AttributeError(f"module datetime has no attribute {item}") + + def __instancecheck__(cls, other): + return isinstance(other, cls.datetime) + + class __DatetimeSub(metaclass=__Datetime): + def emit_warning(dummy=0): + import warnings + + warnings.warn( + "The pandas.datetime class is deprecated " + "and will be removed from pandas in a future version. " + "Import from datetime instead.", + FutureWarning, + stacklevel=3, + ) + + def __new__(cls, *args, **kwargs): + cls.emit_warning() + from datetime import datetime as dt + + return dt(*args, **kwargs) + + datetime = __DatetimeSub + + class __SparseArray(type): + + from pandas.core.arrays.sparse import SparseArray as sa + + SparseArray = sa + + def __instancecheck__(cls, other): + return isinstance(other, cls.SparseArray) + + class __SparseArraySub(metaclass=__SparseArray): + def emit_warning(dummy=0): + import warnings + + warnings.warn( + "The pandas.SparseArray class is deprecated " + "and will be removed from pandas in a future version. " + "Use pandas.arrays.SparseArray instead.", + FutureWarning, + stacklevel=3, + ) + + def __new__(cls, *args, **kwargs): + cls.emit_warning() + from pandas.core.arrays.sparse import SparseArray as sa + + return sa(*args, **kwargs) + + SparseArray = __SparseArraySub + # module level doc-string __doc__ = """ diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 6844df495547a..cacd6f5454de7 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -51,7 +51,18 @@ from collections import namedtuple from contextlib import contextmanager import re -from typing import Any, Dict, Iterable, List +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Tuple, + Type, + TypeVar, + cast, +) import warnings DeprecatedOption = namedtuple("DeprecatedOption", "key msg rkey removal_ver") @@ -80,7 +91,7 @@ class OptionError(AttributeError, KeyError): # User API -def _get_single_key(pat, silent): +def _get_single_key(pat: str, silent: bool) -> str: keys = _select_options(pat) if len(keys) == 0: if not silent: @@ -98,7 +109,7 @@ def _get_single_key(pat, silent): return key -def _get_option(pat, silent=False): +def _get_option(pat: str, silent: bool = False): key = _get_single_key(pat, silent) # walk the nested dict @@ -106,7 +117,7 @@ def _get_option(pat, silent=False): return root[k] -def _set_option(*args, **kwargs): +def _set_option(*args, **kwargs) -> None: # must at least 1 arg deal with constraints later nargs = len(args) if not nargs or nargs % 2 != 0: @@ -138,7 +149,7 @@ def _set_option(*args, **kwargs): o.cb(key) -def _describe_option(pat="", _print_desc=True): +def _describe_option(pat: str = "", _print_desc: bool = True): keys = _select_options(pat) if len(keys) == 0: @@ -154,7 +165,7 @@ def _describe_option(pat="", _print_desc=True): return s -def _reset_option(pat, silent=False): +def _reset_option(pat: str, silent: bool = False) -> None: keys = _select_options(pat) @@ -165,15 +176,14 @@ def _reset_option(pat, silent=False): raise ValueError( "You must specify at least 4 characters when " "resetting multiple keys, use the 
special keyword " - '"all" to reset all the options to their default ' - "value" + '"all" to reset all the options to their default value' ) for k in keys: _set_option(k, _registered_options[k].defval, silent=silent) -def get_default_val(pat): +def get_default_val(pat: str): key = _get_single_key(pat, silent=True) return _get_registered_option(key).defval @@ -181,11 +191,11 @@ def get_default_val(pat): class DictWrapper: """ provide attribute-style access to a nested dict""" - def __init__(self, d, prefix=""): + def __init__(self, d: Dict[str, Any], prefix: str = ""): object.__setattr__(self, "d", d) object.__setattr__(self, "prefix", prefix) - def __setattr__(self, key, val): + def __setattr__(self, key: str, val: Any) -> None: prefix = object.__getattribute__(self, "prefix") if prefix: prefix += "." @@ -197,7 +207,7 @@ def __setattr__(self, key, val): else: raise OptionError("You can only set the value of existing options") - def __getattr__(self, key): + def __getattr__(self, key: str): prefix = object.__getattribute__(self, "prefix") if prefix: prefix += "." @@ -211,7 +221,7 @@ def __getattr__(self, key): else: return _get_option(prefix) - def __dir__(self): + def __dir__(self) -> Iterable[str]: return list(self.d.keys()) @@ -412,23 +422,31 @@ def __exit__(self, *args): _set_option(pat, val, silent=True) -def register_option(key: str, defval: object, doc="", validator=None, cb=None): - """Register an option in the package-wide pandas config object +def register_option( + key: str, + defval: object, + doc: str = "", + validator: Optional[Callable[[Any], Any]] = None, + cb: Optional[Callable[[str], Any]] = None, +) -> None: + """ + Register an option in the package-wide pandas config object Parameters ---------- - key - a fully-qualified key, e.g. "x.y.option - z". - defval - the default value of the option - doc - a string description of the option - validator - a function of a single argument, should raise `ValueError` if - called with a value which is not a legal value for the option. - cb - a function of a single argument "key", which is called - immediately after an option value is set/reset. key is - the full name of the option. - - Returns - ------- - Nothing. + key : str + Fully-qualified key, e.g. "x.y.option - z". + defval : object + Default value of the option. + doc : str + Description of the option. + validator : Callable, optional + Function of a single argument, should raise `ValueError` if + called with a value which is not a legal value for the option. + cb + a function of a single argument "key", which is called + immediately after an option value is set/reset. key is + the full name of the option. Raises ------ @@ -481,7 +499,9 @@ def register_option(key: str, defval: object, doc="", validator=None, cb=None): ) -def deprecate_option(key, msg=None, rkey=None, removal_ver=None): +def deprecate_option( + key: str, msg: Optional[str] = None, rkey: Optional[str] = None, removal_ver=None +) -> None: """ Mark option `key` as deprecated, if code attempts to access this option, a warning will be produced, using `msg` if given, or a default message @@ -494,32 +514,27 @@ def deprecate_option(key, msg=None, rkey=None, removal_ver=None): Parameters ---------- - key - the name of the option to be deprecated. must be a fully-qualified - option name (e.g "x.y.z.rkey"). - - msg - (Optional) a warning message to output when the key is referenced. - if no message is given a default message will be emitted. - - rkey - (Optional) the name of an option to reroute access to. 
- If specified, any referenced `key` will be re-routed to `rkey` - including set/get/reset. - rkey must be a fully-qualified option name (e.g "x.y.z.rkey"). - used by the default message if no `msg` is specified. - - removal_ver - (Optional) specifies the version in which this option will - be removed. used by the default message if no `msg` - is specified. - - Returns - ------- - Nothing + key : str + Name of the option to be deprecated. + must be a fully-qualified option name (e.g "x.y.z.rkey"). + msg : str, optional + Warning message to output when the key is referenced. + if no message is given a default message will be emitted. + rkey : str, optional + Name of an option to reroute access to. + If specified, any referenced `key` will be + re-routed to `rkey` including set/get/reset. + rkey must be a fully-qualified option name (e.g "x.y.z.rkey"). + used by the default message if no `msg` is specified. + removal_ver : optional + Specifies the version in which this option will + be removed. used by the default message if no `msg` is specified. Raises ------ - OptionError - if key has already been deprecated. - + OptionError + If the specified key has already been deprecated. """ - key = key.lower() if key in _deprecated_options: @@ -532,7 +547,7 @@ def deprecate_option(key, msg=None, rkey=None, removal_ver=None): # functions internal to the module -def _select_options(pat): +def _select_options(pat: str) -> List[str]: """returns a list of keys matching `pat` if pat=="all", returns all registered options @@ -550,7 +565,7 @@ def _select_options(pat): return [k for k in keys if re.search(pat, k, re.I)] -def _get_root(key): +def _get_root(key: str) -> Tuple[Dict[str, Any], str]: path = key.split(".") cursor = _global_config for p in path[:-1]: @@ -558,14 +573,14 @@ def _get_root(key): return cursor, path[-1] -def _is_deprecated(key): +def _is_deprecated(key: str) -> bool: """ Returns True if the given option has been deprecated """ key = key.lower() return key in _deprecated_options -def _get_deprecated_option(key): +def _get_deprecated_option(key: str): """ Retrieves the metadata for a deprecated option, if `key` is deprecated. @@ -582,7 +597,7 @@ def _get_deprecated_option(key): return d -def _get_registered_option(key): +def _get_registered_option(key: str): """ Retrieves the option metadata if `key` is a registered option. @@ -593,7 +608,7 @@ def _get_registered_option(key): return _registered_options.get(key) -def _translate_key(key): +def _translate_key(key: str) -> str: """ if key id deprecated and a replacement key defined, will return the replacement key, otherwise returns `key` as - is @@ -606,7 +621,7 @@ def _translate_key(key): return key -def _warn_if_deprecated(key): +def _warn_if_deprecated(key: str) -> bool: """ Checks if `key` is a deprecated option and if so, prints a warning. 
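The helpers whose docstrings are rewritten above (``register_option``, ``get_option`` / ``set_option``, and ``deprecate_option``) make up the module-level option machinery in ``pandas/_config/config.py``. As a rough illustration of how they fit together, here is a minimal sketch; the option key is hypothetical, invented only for this example (pandas does not define ``display.example_width``)::

    from pandas._config import config as cf

    # Register a hypothetical option with a default value, a description,
    # and a type validator built via is_type_factory.
    cf.register_option(
        "display.example_width",
        80,
        doc="Illustrative option used only in this sketch.",
        validator=cf.is_type_factory(int),
    )

    cf.get_option("display.example_width")     # 80
    cf.set_option("display.example_width", 120)

    # Mark the option as deprecated; later accesses emit a FutureWarning
    # built from the supplied message.
    cf.deprecate_option(
        "display.example_width",
        msg="'display.example_width' is deprecated in this sketch.",
    )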
@@ -634,7 +649,7 @@ def _warn_if_deprecated(key): return False -def _build_option_description(k): +def _build_option_description(k: str) -> str: """ Builds a formatted description of a registered option and prints it """ o = _get_registered_option(k) @@ -659,7 +674,7 @@ def _build_option_description(k): return s -def pp_options_list(keys, width=80, _print=False): +def pp_options_list(keys: Iterable[str], width=80, _print: bool = False): """ Builds a concise listing of available options, grouped by prefix """ from textwrap import wrap @@ -697,6 +712,9 @@ def pp(name: str, ks: Iterable[str]) -> List[str]: # # helpers +FuncType = Callable[..., Any] +F = TypeVar("F", bound=FuncType) + @contextmanager def config_prefix(prefix): @@ -728,12 +746,12 @@ def config_prefix(prefix): global register_option, get_option, set_option, reset_option - def wrap(func): - def inner(key, *args, **kwds): + def wrap(func: F) -> F: + def inner(key: str, *args, **kwds): pkey = f"{prefix}.{key}" return func(pkey, *args, **kwds) - return inner + return cast(F, inner) _register_option = register_option _get_option = get_option @@ -751,7 +769,7 @@ def inner(key, *args, **kwds): # arg in register_option -def is_type_factory(_type): +def is_type_factory(_type: Type[Any]) -> Callable[[Any], None]: """ Parameters @@ -765,14 +783,14 @@ def is_type_factory(_type): """ - def inner(x): + def inner(x) -> None: if type(x) != _type: raise ValueError(f"Value must have type '{_type}'") return inner -def is_instance_factory(_type): +def is_instance_factory(_type) -> Callable[[Any], None]: """ Parameters @@ -792,19 +810,19 @@ def is_instance_factory(_type): else: type_repr = f"'{_type}'" - def inner(x): + def inner(x) -> None: if not isinstance(x, _type): raise ValueError(f"Value must be an instance of {type_repr}") return inner -def is_one_of_factory(legal_values): +def is_one_of_factory(legal_values) -> Callable[[Any], None]: callables = [c for c in legal_values if callable(c)] legal_values = [c for c in legal_values if not callable(c)] - def inner(x): + def inner(x) -> None: if x not in legal_values: if not any(c(x) for c in callables): @@ -818,7 +836,7 @@ def inner(x): return inner -def is_nonnegative_int(value): +def is_nonnegative_int(value: Optional[int]) -> None: """ Verify that value is None or a positive int. @@ -853,7 +871,7 @@ def is_nonnegative_int(value): is_text = is_instance_factory((str, bytes)) -def is_callable(obj): +def is_callable(obj) -> bool: """ Parameters diff --git a/pandas/_config/display.py b/pandas/_config/display.py index 067b7c503baab..ef319f4447565 100644 --- a/pandas/_config/display.py +++ b/pandas/_config/display.py @@ -1,6 +1,7 @@ """ Unopinionated display configuration. """ + import locale import sys @@ -11,7 +12,7 @@ _initial_defencoding = None -def detect_console_encoding(): +def detect_console_encoding() -> str: """ Try to find the most capable encoding supported by the console. slightly modified from the way IPython handles the same issue. diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index dd1d4948aa6e3..0d68e78372d8a 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -12,7 +12,7 @@ @contextmanager -def set_locale(new_locale, lc_var=locale.LC_ALL): +def set_locale(new_locale, lc_var: int = locale.LC_ALL): """ Context manager for temporarily setting a locale. 
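``set_locale`` and ``can_set_locale``, annotated in this file, are thin wrappers around the standard library ``locale`` module. A small usage sketch follows; the locale name is an assumption and may not be installed on a given system, which is exactly the case ``can_set_locale`` guards against::

    from pandas._config.localization import can_set_locale, set_locale

    # "de_DE.UTF-8" is only an example locale; availability depends on the OS.
    if can_set_locale("de_DE.UTF-8"):
        with set_locale("de_DE.UTF-8"):
            # Within this block the process-wide locale is temporarily
            # switched; it is restored when the context manager exits.
            ...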
@@ -44,7 +44,7 @@ def set_locale(new_locale, lc_var=locale.LC_ALL): locale.setlocale(lc_var, current_locale) -def can_set_locale(lc, lc_var=locale.LC_ALL): +def can_set_locale(lc: str, lc_var: int = locale.LC_ALL) -> bool: """ Check to see if we can set a locale, and subsequently get the locale, without raising an Exception. @@ -58,7 +58,7 @@ def can_set_locale(lc, lc_var=locale.LC_ALL): Returns ------- - is_valid : bool + bool Whether the passed locale can be set """ diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 7a2fc9dc7845a..dd1f38ce3a842 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -914,8 +914,7 @@ def rank_1d(rank_t[:] in_arr, ties_method='average', ranks[argsorted[j]] = i + 1 elif tiebreak == TIEBREAK_FIRST: if rank_t is object: - raise ValueError('first not supported for ' - 'non-numeric data') + raise ValueError('first not supported for non-numeric data') else: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = j + 1 @@ -971,8 +970,7 @@ def rank_1d(rank_t[:] in_arr, ties_method='average', ranks[argsorted[j]] = i + 1 elif tiebreak == TIEBREAK_FIRST: if rank_t is object: - raise ValueError('first not supported for ' - 'non-numeric data') + raise ValueError('first not supported for non-numeric data') else: for j in range(i - dups + 1, i + 1): ranks[argsorted[j]] = j + 1 @@ -1137,8 +1135,7 @@ def rank_2d(rank_t[:, :] in_arr, axis=0, ties_method='average', ranks[i, argsorted[i, z]] = j + 1 elif tiebreak == TIEBREAK_FIRST: if rank_t is object: - raise ValueError('first not supported ' - 'for non-numeric data') + raise ValueError('first not supported for non-numeric data') else: for z in range(j - dups + 1, j + 1): ranks[i, argsorted[i, z]] = z + 1 diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 420e08a3d68d4..995fabbedcb5d 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -116,7 +116,7 @@ def take_2d_axis0_{{name}}_{{dest}}(ndarray[{{c_type_in}}, ndim=2] values, IF {{True if c_type_in == c_type_out != "object" else False}}: cdef: - {{c_type_out}} *v + const {{c_type_out}} *v {{c_type_out}} *o # GH#3130 diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index abb8a6d388d26..93ea94f7b18fc 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -686,8 +686,7 @@ def _group_ohlc(floating[:, :] out, raise ValueError('Output array must have 4 columns') if K > 1: - raise NotImplementedError("Argument 'values' must have only " - "one dimension") + raise NotImplementedError("Argument 'values' must have only one dimension") out[:] = np.nan with nogil: diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index d735890f7d07e..878da670b2f68 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -51,8 +51,9 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): k = key.encode(encoding) kb = k if len(k) != 16: - raise ValueError("key should be a 16-byte string encoded, " - f"got {k} (len {len(k)})") + raise ValueError( + f"key should be a 16-byte string encoded, got {k} (len {len(k)})" + ) n = len(arr) @@ -70,9 +71,17 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'): # null, stringify and encode data = str(val).encode(encoding) + elif isinstance(val, tuple): + # GH#28969 we could have a tuple, but need to ensure that + # the tuple entries are themselves hashable before converting + # to str + hash(val) + data = str(val).encode(encoding) else: - raise 
TypeError(f"{val} of type {type(val)} is not a valid type " - "for hashing, must be string or null") + raise TypeError( + f"{val} of type {type(val)} is not a valid type for hashing, " + "must be string or null" + ) l = len(data) lens[i] = l diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 6e68a687de94a..884db9ee931d4 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,7 +1,7 @@ cimport cython from cpython.ref cimport PyObject, Py_INCREF -from cpython.mem cimport PyMem_Malloc, PyMem_Realloc, PyMem_Free +from cpython.mem cimport PyMem_Malloc, PyMem_Free from libc.stdlib cimport malloc, free @@ -13,26 +13,45 @@ cnp.import_array() cdef extern from "numpy/npy_math.h": float64_t NAN "NPY_NAN" - from pandas._libs.khash cimport ( khiter_t, - - kh_str_t, kh_init_str, kh_put_str, kh_exist_str, - kh_get_str, kh_destroy_str, kh_resize_str, - - kh_put_strbox, kh_get_strbox, kh_init_strbox, - - kh_int64_t, kh_init_int64, kh_resize_int64, kh_destroy_int64, - kh_get_int64, kh_exist_int64, kh_put_int64, - - kh_float64_t, kh_exist_float64, kh_put_float64, kh_init_float64, - kh_get_float64, kh_destroy_float64, kh_resize_float64, - - kh_resize_uint64, kh_exist_uint64, kh_destroy_uint64, kh_put_uint64, - kh_get_uint64, kh_init_uint64, - - kh_destroy_pymap, kh_exist_pymap, kh_init_pymap, kh_get_pymap, - kh_put_pymap, kh_resize_pymap) + kh_str_t, + kh_init_str, + kh_put_str, + kh_exist_str, + kh_get_str, + kh_destroy_str, + kh_resize_str, + kh_put_strbox, + kh_get_strbox, + kh_init_strbox, + kh_int64_t, + kh_init_int64, + kh_resize_int64, + kh_destroy_int64, + kh_get_int64, + kh_exist_int64, + kh_put_int64, + kh_float64_t, + kh_exist_float64, + kh_put_float64, + kh_init_float64, + kh_get_float64, + kh_destroy_float64, + kh_resize_float64, + kh_resize_uint64, + kh_exist_uint64, + kh_destroy_uint64, + kh_put_uint64, + kh_get_uint64, + kh_init_uint64, + kh_destroy_pymap, + kh_exist_pymap, + kh_init_pymap, + kh_get_pymap, + kh_put_pymap, + kh_resize_pymap, +) cimport pandas._libs.util as util @@ -63,8 +82,9 @@ cdef class Factorizer: def get_count(self): return self.count - def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1, - na_value=None): + def factorize( + self, ndarray[object] values, sort=False, na_sentinel=-1, na_value=None + ): """ Factorize values with nans replaced by na_sentinel >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index ac8172146d351..e4ec9db560b80 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -17,8 +17,8 @@ cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.tslibs.conversion cimport maybe_datetimelike_to_i8 from pandas._libs.tslibs.nattype cimport c_NaT as NaT +from pandas._libs.tslibs.c_timestamp cimport _Timestamp from pandas._libs.hashtable cimport HashTable @@ -72,9 +72,10 @@ cdef class IndexEngine: self.over_size_threshold = n >= _SIZE_CUTOFF self.clear_mapping() - def __contains__(self, object val): + def __contains__(self, val: object) -> bool: + # We assume before we get here: + # - val is hashable self._ensure_mapping_populated() - hash(val) return val in self.mapping cpdef get_value(self, ndarray arr, object key, object tz=None): @@ -85,7 +86,6 @@ cdef class IndexEngine: """ cdef: object loc - void* data_ptr loc = self.get_loc(key) if isinstance(loc, slice) or util.is_array(loc): @@ -101,7 +101,6 @@ cdef class IndexEngine: """ cdef: object loc - void* data_ptr loc = self.get_loc(key) value = 
convert_scalar(arr, value) @@ -215,7 +214,8 @@ cdef class IndexEngine: return self.monotonic_dec == 1 cdef inline _do_monotonic_check(self): - cdef object is_unique + cdef: + bint is_unique try: values = self._get_index_values() self.monotonic_inc, self.monotonic_dec, is_unique = \ @@ -238,10 +238,10 @@ cdef class IndexEngine: cdef _call_monotonic(self, values): return algos.is_monotonic(values, timelike=False) - def get_backfill_indexer(self, other, limit=None): + def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: return algos.backfill(self._get_index_values(), other, limit=limit) - def get_pad_indexer(self, other, limit=None): + def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: return algos.pad(self._get_index_values(), other, limit=limit) cdef _make_hash_table(self, Py_ssize_t n): @@ -409,20 +409,29 @@ cdef class DatetimeEngine(Int64Engine): cdef _get_box_dtype(self): return 'M8[ns]' - def __contains__(self, object val): + cdef int64_t _unbox_scalar(self, scalar) except? -1: + # NB: caller is responsible for ensuring tzawareness compat + # before we get here + if not (isinstance(scalar, _Timestamp) or scalar is NaT): + raise TypeError(scalar) + return scalar.value + + def __contains__(self, val: object) -> bool: + # We assume before we get here: + # - val is hashable cdef: - int64_t loc + int64_t loc, conv + conv = self._unbox_scalar(val) if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: - return self._get_loc_duplicates(val) + return self._get_loc_duplicates(conv) values = self._get_index_values() - conv = maybe_datetimelike_to_i8(val) loc = values.searchsorted(conv, side='left') return values[loc] == conv self._ensure_mapping_populated() - return maybe_datetimelike_to_i8(val) in self.mapping + return conv in self.mapping cdef _get_index_values(self): return self.vgetter().view('i8') @@ -431,24 +440,26 @@ cdef class DatetimeEngine(Int64Engine): return algos.is_monotonic(values, timelike=True) cpdef get_loc(self, object val): + # NB: the caller is responsible for ensuring that we are called + # with either a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine) + cdef: int64_t loc if is_definitely_invalid_key(val): raise TypeError + try: + conv = self._unbox_scalar(val) + except TypeError: + raise KeyError(val) + # Welcome to the spaghetti factory if self.over_size_threshold and self.is_monotonic_increasing: if not self.is_unique: - val = maybe_datetimelike_to_i8(val) - return self._get_loc_duplicates(val) + return self._get_loc_duplicates(conv) values = self._get_index_values() - try: - conv = maybe_datetimelike_to_i8(val) - loc = values.searchsorted(conv, side='left') - except TypeError: - self._date_check_type(val) - raise KeyError(val) + loc = values.searchsorted(conv, side='left') if loc == len(values) or values[loc] != conv: raise KeyError(val) @@ -456,27 +467,12 @@ cdef class DatetimeEngine(Int64Engine): self._ensure_mapping_populated() if not self.unique: - val = maybe_datetimelike_to_i8(val) - return self._get_loc_duplicates(val) + return self._get_loc_duplicates(conv) try: - return self.mapping.get_item(val.value) + return self.mapping.get_item(conv) except KeyError: raise KeyError(val) - except AttributeError: - pass - - try: - val = maybe_datetimelike_to_i8(val) - return self.mapping.get_item(val) - except (TypeError, ValueError): - self._date_check_type(val) - raise KeyError(val) - - cdef inline _date_check_type(self, object val): - hash(val) - if not util.is_integer_object(val): - raise 
KeyError(val) def get_indexer(self, values): self._ensure_mapping_populated() @@ -485,13 +481,13 @@ cdef class DatetimeEngine(Int64Engine): values = np.asarray(values).view('i8') return self.mapping.lookup(values) - def get_pad_indexer(self, other, limit=None): + def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: if other.dtype != self._get_box_dtype(): return np.repeat(-1, len(other)).astype('i4') other = np.asarray(other).view('i8') return algos.pad(self._get_index_values(), other, limit=limit) - def get_backfill_indexer(self, other, limit=None): + def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: if other.dtype != self._get_box_dtype(): return np.repeat(-1, len(other)).astype('i4') other = np.asarray(other).view('i8') @@ -503,22 +499,24 @@ cdef class TimedeltaEngine(DatetimeEngine): cdef _get_box_dtype(self): return 'm8[ns]' + cdef int64_t _unbox_scalar(self, scalar) except? -1: + if not (isinstance(scalar, Timedelta) or scalar is NaT): + raise TypeError(scalar) + return scalar.value + cdef class PeriodEngine(Int64Engine): cdef _get_index_values(self): - return super(PeriodEngine, self).vgetter() - - cdef void _call_map_locations(self, values): - # super(...) pattern doesn't seem to work with `cdef` - Int64Engine._call_map_locations(self, values.view('i8')) + return super(PeriodEngine, self).vgetter().view("i8") cdef _call_monotonic(self, values): # super(...) pattern doesn't seem to work with `cdef` return Int64Engine._call_monotonic(self, values.view('i8')) def get_indexer(self, values): - cdef ndarray[int64_t, ndim=1] ordinals + cdef: + ndarray[int64_t, ndim=1] ordinals super(PeriodEngine, self)._ensure_mapping_populated() @@ -527,14 +525,14 @@ cdef class PeriodEngine(Int64Engine): return self.mapping.lookup(ordinals) - def get_pad_indexer(self, other, limit=None): + def get_pad_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: freq = super(PeriodEngine, self).vgetter().freq ordinal = periodlib.extract_ordinals(other, freq) return algos.pad(self._get_index_values(), np.asarray(ordinal), limit=limit) - def get_backfill_indexer(self, other, limit=None): + def get_backfill_indexer(self, other: np.ndarray, limit=None) -> np.ndarray: freq = super(PeriodEngine, self).vgetter().freq ordinal = periodlib.extract_ordinals(other, freq) @@ -717,7 +715,9 @@ cdef class BaseMultiIndexCodesEngine: return indexer - def __contains__(self, object val): + def __contains__(self, val: object) -> bool: + # We assume before we get here: + # - val is hashable # Default __contains__ looks in the underlying mapping, which in this # case only contains integer representations. 
try: diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index 093cca4fe7ed5..cd2b9fbe7d6d6 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -53,10 +53,7 @@ cdef class {{name}}Engine(IndexEngine): ndarray[{{ctype}}] values int count = 0 - {{if name not in {'Float64', 'Float32'} }} - if not util.is_integer_object(val): - raise KeyError(val) - {{endif}} + self._check_type(val) # A view is needed for some subclasses, such as PeriodEngine: values = self._get_index_values().view('{{dtype}}') diff --git a/pandas/_libs/indexing.pyx b/pandas/_libs/indexing.pyx index 01f4fb060d982..cdccdb504571c 100644 --- a/pandas/_libs/indexing.pyx +++ b/pandas/_libs/indexing.pyx @@ -18,6 +18,7 @@ cdef class _NDFrameIndexerBase: if ndim is None: ndim = self._ndim = self.obj.ndim if ndim > 2: - raise ValueError("NDFrameIndexer does not support " - "NDFrame objects with ndim > 2") + raise ValueError( + "NDFrameIndexer does not support NDFrame objects with ndim > 2" + ) return ndim diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 4293108ea7ec2..1166768472449 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -326,7 +326,7 @@ cdef class Interval(IntervalMixin): def __hash__(self): return hash((self.left, self.right, self.closed)) - def __contains__(self, key): + def __contains__(self, key) -> bool: if _interval_like(key): raise TypeError("__contains__ not defined for two intervals") return ((self.left < key if self.open_left else self.left <= key) and diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 8cb51be36645e..d09413bfa5210 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -6,12 +6,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in from pandas._libs.algos import is_monotonic -ctypedef fused scalar_t: - float64_t - float32_t +ctypedef fused int_scalar_t: int64_t - int32_t + float64_t + +ctypedef fused uint_scalar_t: uint64_t + float64_t + +ctypedef fused scalar_t: + int_scalar_t + uint_scalar_t # ---------------------------------------------------------------------- # IntervalTree @@ -114,43 +119,6 @@ cdef class IntervalTree(IntervalMixin): sort_order = np.lexsort(values) return is_monotonic(sort_order, False)[0] - def get_loc(self, scalar_t key): - """Return all positions corresponding to intervals that overlap with - the given scalar key - """ - result = Int64Vector() - self.root.query(result, key) - if not result.data.n: - raise KeyError(key) - return result.to_array().astype('intp') - - def _get_partial_overlap(self, key_left, key_right, side): - """Return all positions corresponding to intervals with the given side - falling between the left and right bounds of an interval query - """ - if side == 'left': - values = self.left - sorter = self.left_sorter - else: - values = self.right - sorter = self.right_sorter - key = [key_left, key_right] - i, j = values.searchsorted(key, sorter=sorter) - return sorter[i:j] - - def get_loc_interval(self, key_left, key_right): - """Lookup the intervals enclosed in the given interval bounds - - The given interval is presumed to have closed bounds. 
- """ - import pandas as pd - left_overlap = self._get_partial_overlap(key_left, key_right, 'left') - right_overlap = self._get_partial_overlap(key_left, key_right, 'right') - enclosing = self.get_loc(0.5 * (key_left + key_right)) - combined = np.concatenate([left_overlap, right_overlap, enclosing]) - uniques = pd.unique(combined) - return uniques.astype('intp') - def get_indexer(self, scalar_t[:] target): """Return the positions corresponding to unique intervals that overlap with the given array of scalar targets. @@ -165,7 +133,12 @@ cdef class IntervalTree(IntervalMixin): result = Int64Vector() old_len = 0 for i in range(len(target)): - self.root.query(result, target[i]) + try: + self.root.query(result, target[i]) + except OverflowError: + # overflow -> no match, which is already handled below + pass + if result.data.n == old_len: result.append(-1) elif result.data.n > old_len + 1: @@ -187,7 +160,12 @@ cdef class IntervalTree(IntervalMixin): missing = Int64Vector() old_len = 0 for i in range(len(target)): - self.root.query(result, target[i]) + try: + self.root.query(result, target[i]) + except OverflowError: + # overflow -> no match, which is already handled below + pass + if result.data.n == old_len: result.append(-1) missing.append(i) @@ -231,7 +209,7 @@ cdef sort_values_and_indices(all_values, all_indices, subset): {{py: nodes = [] -for dtype in ['float32', 'float64', 'int32', 'int64', 'uint64']: +for dtype in ['float64', 'int64', 'uint64']: for closed, cmp_left, cmp_right in [ ('left', '<=', '<'), ('right', '<', '<='), @@ -239,19 +217,26 @@ for dtype in ['float32', 'float64', 'int32', 'int64', 'uint64']: ('neither', '<', '<')]: cmp_left_converse = '<' if cmp_left == '<=' else '<=' cmp_right_converse = '<' if cmp_right == '<=' else '<=' + if dtype.startswith('int'): + fused_prefix = 'int_' + elif dtype.startswith('uint'): + fused_prefix = 'uint_' + elif dtype.startswith('float'): + fused_prefix = '' nodes.append((dtype, dtype.title(), closed, closed.title(), cmp_left, cmp_right, cmp_left_converse, - cmp_right_converse)) + cmp_right_converse, + fused_prefix)) }} NODE_CLASSES = {} {{for dtype, dtype_title, closed, closed_title, cmp_left, cmp_right, - cmp_left_converse, cmp_right_converse in nodes}} + cmp_left_converse, cmp_right_converse, fused_prefix in nodes}} cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: """Non-terminal node for an IntervalTree @@ -354,7 +339,7 @@ cdef class {{dtype_title}}Closed{{closed_title}}IntervalNode: @cython.wraparound(False) @cython.boundscheck(False) @cython.initializedcheck(False) - cpdef query(self, Int64Vector result, scalar_t point): + cpdef query(self, Int64Vector result, {{fused_prefix}}scalar_t point): """Recursively query this node and its sub-nodes for intervals that overlap with the query point. 
""" diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 089a7a04abb63..acd74591134bc 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -19,7 +19,7 @@ PyDateTime_IMPORT import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, PyArray_GETITEM, +from numpy cimport (ndarray, PyArray_Check, PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, PyArray_IterNew, flatiter, NPY_OBJECT, int64_t, float32_t, float64_t, @@ -524,8 +524,11 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool: # we are either not equal or both nan # I think None == None will be true here try: - if not (PyObject_RichCompareBool(x, y, Py_EQ) or - (x is None or is_nan(x)) and (y is None or is_nan(y))): + if PyArray_Check(x) and PyArray_Check(y): + if not array_equivalent_object(x, y): + return False + elif not (PyObject_RichCompareBool(x, y, Py_EQ) or + (x is None or is_nan(x)) and (y is None or is_nan(y))): return False except TypeError as err: # Avoid raising TypeError on tzawareness mismatch @@ -1621,6 +1624,10 @@ cdef class StringValidator(Validator): cdef inline bint is_array_typed(self) except -1: return issubclass(self.dtype.type, np.str_) + cdef bint is_valid_null(self, object value) except -1: + # We deliberately exclude None / NaN here since StringArray uses NA + return value is C_NA + cpdef bint is_string_array(ndarray values, bint skipna=False): cdef: @@ -2232,13 +2239,14 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, return objects -_no_default = object() +# Note: no_default is exported to the public API in pandas.api.extensions +no_default = object() #: Sentinel indicating the default value. @cython.boundscheck(False) @cython.wraparound(False) def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, - object na_value=_no_default, object dtype=object): + object na_value=no_default, object dtype=object): """ Substitute for np.vectorize with pandas-friendly dtype inference. @@ -2269,7 +2277,7 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=1, result = np.empty(n, dtype=dtype) for i in range(n): if mask[i]: - if na_value is _no_default: + if na_value is no_default: val = arr[i] else: val = na_value diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index f1cfa0978c3a0..4d17a6f883c1c 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -14,6 +14,7 @@ from pandas._libs.tslibs.np_datetime cimport ( get_timedelta64_value, get_datetime64_value) from pandas._libs.tslibs.nattype cimport ( checknull_with_nat, c_NaT as NaT, is_null_datetimelike) +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op from pandas.compat import is_platform_32bit @@ -290,16 +291,29 @@ cdef inline bint is_null_period(v): # Implementation of NA singleton -def _create_binary_propagating_op(name, divmod=False): +def _create_binary_propagating_op(name, is_divmod=False): def method(self, other): if (other is C_NA or isinstance(other, str) - or isinstance(other, (numbers.Number, np.bool_))): - if divmod: + or isinstance(other, (numbers.Number, np.bool_)) + or isinstance(other, np.ndarray) and not other.shape): + # Need the other.shape clause to handle NumPy scalars, + # since we do a setitem on `out` below, which + # won't work for NumPy scalars. 
+ if is_divmod: return NA, NA else: return NA + elif isinstance(other, np.ndarray): + out = np.empty(other.shape, dtype=object) + out[:] = NA + + if is_divmod: + return out, out.copy() + else: + return out + return NotImplemented method.__name__ = name @@ -340,10 +354,7 @@ class NAType(C_NAType): return NAType._instance def __repr__(self) -> str: - return "NA" - - def __str__(self) -> str: - return "NA" + return "" def __bool__(self): raise TypeError("boolean value of NA is ambiguous") @@ -369,8 +380,8 @@ class NAType(C_NAType): __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__") __mod__ = _create_binary_propagating_op("__mod__") __rmod__ = _create_binary_propagating_op("__rmod__") - __divmod__ = _create_binary_propagating_op("__divmod__", divmod=True) - __rdivmod__ = _create_binary_propagating_op("__rdivmod__", divmod=True) + __divmod__ = _create_binary_propagating_op("__divmod__", is_divmod=True) + __rdivmod__ = _create_binary_propagating_op("__rdivmod__", is_divmod=True) # __lshift__ and __rshift__ are not implemented __eq__ = _create_binary_propagating_op("__eq__") @@ -397,6 +408,8 @@ class NAType(C_NAType): return type(other)(1) else: return NA + elif isinstance(other, np.ndarray): + return np.where(other == 0, other.dtype.type(1), NA) return NotImplemented @@ -404,10 +417,12 @@ class NAType(C_NAType): if other is C_NA: return NA elif isinstance(other, (numbers.Number, np.bool_)): - if other == 1 or other == -1: + if other == 1: return other else: return NA + elif isinstance(other, np.ndarray): + return np.where(other == 1, other, NA) return NotImplemented @@ -440,6 +455,31 @@ class NAType(C_NAType): __rxor__ = __xor__ + __array_priority__ = 1000 + _HANDLED_TYPES = (np.ndarray, numbers.Number, str, np.bool_) + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + types = self._HANDLED_TYPES + (NAType,) + for x in inputs: + if not isinstance(x, types): + return NotImplemented + + if method != "__call__": + raise ValueError(f"ufunc method '{method}' not supported for NA") + result = maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is NotImplemented: + # For a NumPy ufunc that's not a binop, like np.logaddexp + index = [i for i, x in enumerate(inputs) if x is NA][0] + result = np.broadcast_arrays(*inputs)[index] + if result.ndim == 0: + result = result.item() + if ufunc.nout > 1: + result = (NA,) * ufunc.nout + + return result + C_NA = NAType() # C-visible NA = C_NA # Python-visible diff --git a/pandas/_libs/ops_dispatch.pyx b/pandas/_libs/ops_dispatch.pyx new file mode 100644 index 0000000000000..f6ecef2038cf3 --- /dev/null +++ b/pandas/_libs/ops_dispatch.pyx @@ -0,0 +1,94 @@ +DISPATCHED_UFUNCS = { + "add", + "sub", + "mul", + "pow", + "mod", + "floordiv", + "truediv", + "divmod", + "eq", + "ne", + "lt", + "gt", + "le", + "ge", + "remainder", + "matmul", + "or", + "xor", + "and", +} +UFUNC_ALIASES = { + "subtract": "sub", + "multiply": "mul", + "floor_divide": "floordiv", + "true_divide": "truediv", + "power": "pow", + "remainder": "mod", + "divide": "div", + "equal": "eq", + "not_equal": "ne", + "less": "lt", + "less_equal": "le", + "greater": "gt", + "greater_equal": "ge", + "bitwise_or": "or", + "bitwise_and": "and", + "bitwise_xor": "xor", +} + +# For op(., Array) -> Array.__r{op}__ +REVERSED_NAMES = { + "lt": "__gt__", + "le": "__ge__", + "gt": "__lt__", + "ge": "__le__", + "eq": "__eq__", + "ne": "__ne__", +} + + +def maybe_dispatch_ufunc_to_dunder_op( + object self, object ufunc, str method, *inputs, **kwargs +): + 
""" + Dispatch a ufunc to the equivalent dunder method. + + Parameters + ---------- + self : ArrayLike + The array whose dunder method we dispatch to + ufunc : Callable + A NumPy ufunc + method : {'reduce', 'accumulate', 'reduceat', 'outer', 'at', '__call__'} + inputs : ArrayLike + The input arrays. + kwargs : Any + The additional keyword arguments, e.g. ``out``. + + Returns + ------- + result : Any + The result of applying the ufunc + """ + # special has the ufuncs we dispatch to the dunder op on + + op_name = ufunc.__name__ + op_name = UFUNC_ALIASES.get(op_name, op_name) + + def not_implemented(*args, **kwargs): + return NotImplemented + + if (method == "__call__" + and op_name in DISPATCHED_UFUNCS + and kwargs.get("out") is None): + if isinstance(inputs[0], type(self)): + name = f"__{op_name}__" + return getattr(self, name, not_implemented)(inputs[1]) + else: + name = REVERSED_NAMES.get(op_name, f"__r{op_name}__") + result = getattr(self, name, not_implemented)(inputs[0]) + return result + else: + return NotImplemented diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 1b566af7a5437..377d49f2bbd29 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2,6 +2,7 @@ # See LICENSE for the license import bz2 import gzip +import io import os import sys import time @@ -171,12 +172,9 @@ cdef extern from "parser/tokenizer.h": int64_t skip_first_N_rows int64_t skipfooter # pick one, depending on whether the converter requires GIL - float64_t (*double_converter_nogil)(const char *, char **, - char, char, char, - int, int *, int *) nogil - float64_t (*double_converter_withgil)(const char *, char **, - char, char, char, - int, int *, int *) + float64_t (*double_converter)(const char *, char **, + char, char, char, + int, int *, int *) nogil # error handling char *warn_msg @@ -469,16 +467,11 @@ cdef class TextReader: if float_precision == "round_trip": # see gh-15140 - # - # Our current roundtrip implementation requires the GIL. - self.parser.double_converter_nogil = NULL - self.parser.double_converter_withgil = round_trip + self.parser.double_converter = round_trip elif float_precision == "high": - self.parser.double_converter_withgil = NULL - self.parser.double_converter_nogil = precise_xstrtod + self.parser.double_converter = precise_xstrtod else: - self.parser.double_converter_withgil = NULL - self.parser.double_converter_nogil = xstrtod + self.parser.double_converter = xstrtod if isinstance(dtype, dict): dtype = {k: pandas_dtype(dtype[k]) @@ -645,11 +638,10 @@ cdef class TextReader: raise ValueError(f'Unrecognized compression type: ' f'{self.compression}') - if b'utf-16' in (self.encoding or b''): - # we need to read utf-16 through UTF8Recoder. - # if source is utf-16, convert source to utf-8 by UTF8Recoder. 
- source = icom.UTF8Recoder(source, - self.encoding.decode('utf-8')) + if self.encoding and isinstance(source, io.BufferedIOBase): + source = io.TextIOWrapper( + source, self.encoding.decode('utf-8'), newline='') + self.encoding = b'utf-8' self.c_encoding = self.encoding @@ -1377,6 +1369,7 @@ STR_NA_VALUES = { "N/A", "n/a", "NA", + "", "#NA", "NULL", "null", @@ -1663,22 +1656,12 @@ cdef _try_double(parser_t *parser, int64_t col, result = np.empty(lines, dtype=np.float64) data = result.data na_fset = kset_float64_from_list(na_flist) - if parser.double_converter_nogil != NULL: # if it can run without the GIL - with nogil: - error = _try_double_nogil(parser, parser.double_converter_nogil, - col, line_start, line_end, - na_filter, na_hashset, use_na_flist, - na_fset, NA, data, &na_count) - else: - assert parser.double_converter_withgil != NULL - error = _try_double_nogil(parser, - parser.double_converter_withgil, + with nogil: + error = _try_double_nogil(parser, parser.double_converter, col, line_start, line_end, na_filter, na_hashset, use_na_flist, na_fset, NA, data, &na_count) + kh_destroy_float64(na_fset) if error != 0: return None, None diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 0019fc4b36d20..8571761f77265 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -1,3 +1,4 @@ +from copy import copy from distutils.version import LooseVersion from cython import Py_ssize_t @@ -15,7 +16,7 @@ from numpy cimport (ndarray, cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.lib import maybe_convert_objects +from pandas._libs.lib import maybe_convert_objects, is_scalar cdef _check_result_array(object obj, Py_ssize_t cnt): @@ -492,14 +493,19 @@ def apply_frame_axis0(object frame, object f, object names, # Need to infer if low level index slider will cause segfaults require_slow_apply = i == 0 and piece is chunk try: - if piece.index is chunk.index: - piece = piece.copy(deep='all') - else: + if piece.index is not chunk.index: mutated = True except AttributeError: # `piece` might not have an index, could be e.g. an int pass + if not is_scalar(piece): + # Need to copy data to avoid appending references + if hasattr(piece, "copy"): + piece = piece.copy(deep="all") + else: + piece = copy(piece) + results.append(piece) # If the data was modified inplace we need to diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 32aa936672aab..4e831081c8e54 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -28,7 +28,7 @@ def unstack(reshape_t[:, :] values, uint8_t[:] mask, Py_ssize_t stride, Py_ssize_t length, Py_ssize_t width, reshape_t[:, :] new_values, uint8_t[:, :] new_mask): """ - transform long sorted_values to wide new_values + Transform long values to wide new_values. Parameters ---------- diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index ee83901040b36..3a6dd506b2428 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -72,9 +72,9 @@ cdef class IntIndex(SparseIndex): """ if self.npoints > self.length: - msg = (f"Too many indices. Expected " - f"{self.length} but found {self.npoints}") - raise ValueError(msg) + raise ValueError( + f"Too many indices. Expected {self.length} but found {self.npoints}" + ) # Indices are vacuously ordered and non-negative # if the sequence of indices is empty. 
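The parsers.pyx change above replaces the old UTF8Recoder path with io.TextIOWrapper: any buffered byte stream is decoded with the caller's encoding and handed downstream as text, after which the parser can treat the input as UTF-8. A hedged sketch of the same idea using only the standard library (the helper name is illustrative, not part of the parser):

import io

def reencode_source(source: io.BufferedIOBase, encoding: str) -> io.TextIOWrapper:
    # Wrap the binary stream so reads come back decoded; newline="" leaves
    # newline handling to the CSV layer, as in the change above.
    return io.TextIOWrapper(source, encoding=encoding, newline="")

raw = io.BytesIO("a,b\n1,2\n".encode("utf-16"))
text = reencode_source(raw, "utf-16")
print(text.read())  # decoded str, regardless of the original byte encoding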
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 9f2b26b0dea19..2188ff6b0d464 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1774,11 +1774,18 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { + // This is called from a nogil block in parsers.pyx + // so need to explicitly get GIL before Python calls + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + double r = PyOS_string_to_double(p, q, 0); if (maybe_int != NULL) *maybe_int = 0; if (PyErr_Occurred() != NULL) *error = -1; else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; PyErr_Clear(); + + PyGILState_Release(gstate); return r; } diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index b37de47662feb..4fd2065c07100 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -155,11 +155,8 @@ typedef struct parser_t { PyObject *skipfunc; int64_t skip_first_N_rows; int64_t skip_footer; - // pick one, depending on whether the converter requires GIL - double (*double_converter_nogil)(const char *, char **, - char, char, char, int, int *, int *); - double (*double_converter_withgil)(const char *, char **, - char, char, char, int, int *, int *); + double (*double_converter)(const char *, char **, + char, char, char, int, int *, int *); // error handling char *warn_msg; @@ -226,6 +223,8 @@ double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, double precise_xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); + +// GH-15140 - round_trip requires and acquires the GIL on its own double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int); int to_boolean(const char *item, uint8_t *val); diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 05c3ae4096ad5..8d04874b4c9bf 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -154,6 +154,8 @@ enum JSTYPES { JT_ARRAY, // Array structure JT_OBJECT, // Key/Value structure JT_INVALID, // Internal, do not return nor expect + JT_POS_INF, // Positive infinity + JT_NEG_INF, // Negative infinity }; typedef void * JSOBJ; @@ -290,6 +292,8 @@ typedef struct __JSONObjectDecoder { JSOBJ (*newTrue)(void *prv); JSOBJ (*newFalse)(void *prv); JSOBJ (*newNull)(void *prv); + JSOBJ (*newPosInf)(void *prv); + JSOBJ (*newNegInf)(void *prv); JSOBJ (*newObject)(void *prv, void *decoder); JSOBJ (*endObject)(void *prv, JSOBJ obj); JSOBJ (*newArray)(void *prv, void *decoder); diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index 26b00c0cacd31..4eb18ee13d70b 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -127,9 +127,16 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { JSUINT64 overflowLimit = LLONG_MAX; - if (*(offset) == '-') { + if (*(offset) == 'I') { + goto DECODE_INF; + } else if (*(offset) == 'N') { + goto DECODE_NAN; + } else if (*(offset) == '-') { offset++; intNeg = -1; + if (*(offset) == 'I') { + goto DECODE_INF; + } overflowLimit = LLONG_MIN; } @@ -281,6 +288,48 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState 
*ds) { } } +DECODE_NAN: + offset++; + if (*(offset++) != 'a') goto SET_NAN_ERROR; + if (*(offset++) != 'N') goto SET_NAN_ERROR; + + ds->lastType = JT_NULL; + ds->start = offset; + return ds->dec->newNull(ds->prv); + +SET_NAN_ERROR: + return SetError(ds, -1, "Unexpected character found when decoding 'NaN'"); + +DECODE_INF: + offset++; + if (*(offset++) != 'n') goto SET_INF_ERROR; + if (*(offset++) != 'f') goto SET_INF_ERROR; + if (*(offset++) != 'i') goto SET_INF_ERROR; + if (*(offset++) != 'n') goto SET_INF_ERROR; + if (*(offset++) != 'i') goto SET_INF_ERROR; + if (*(offset++) != 't') goto SET_INF_ERROR; + if (*(offset++) != 'y') goto SET_INF_ERROR; + + ds->start = offset; + + if (intNeg == 1) { + ds->lastType = JT_POS_INF; + return ds->dec->newPosInf(ds->prv); + } else { + ds->lastType = JT_NEG_INF; + return ds->dec->newNegInf(ds->prv); + } + +SET_INF_ERROR: + if (intNeg == 1) { + const char *msg = "Unexpected character found when decoding 'Infinity'"; + return SetError(ds, -1, msg); + } else { + const char *msg = "Unexpected character found when decoding '-Infinity'"; + return SetError(ds, -1, msg); + } + + BREAK_EXP_LOOP: // FIXME: Check for arithmetic overflow here ds->lastType = JT_DOUBLE; @@ -1070,6 +1119,8 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) { case '7': case '8': case '9': + case 'I': + case 'N': case '-': return decode_numeric(ds); diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c index 7a2e5a584443a..b2fc788478864 100644 --- a/pandas/_libs/src/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/ujson/python/JSONtoObj.c @@ -459,6 +459,10 @@ JSOBJ Object_newFalse(void *prv) { Py_RETURN_FALSE; } JSOBJ Object_newNull(void *prv) { Py_RETURN_NONE; } +JSOBJ Object_newPosInf(void *prv) { return PyFloat_FromDouble(Py_HUGE_VAL); } + +JSOBJ Object_newNegInf(void *prv) { return PyFloat_FromDouble(-Py_HUGE_VAL); } + JSOBJ Object_newObject(void *prv, void *decoder) { return PyDict_New(); } JSOBJ Object_endObject(void *prv, JSOBJ obj) { return obj; } @@ -502,10 +506,11 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { JSONObjectDecoder dec = { Object_newString, Object_objectAddKey, Object_arrayAddItem, Object_newTrue, Object_newFalse, Object_newNull, - Object_newObject, Object_endObject, Object_newArray, - Object_endArray, Object_newInteger, Object_newLong, - Object_newDouble, Object_releaseObject, PyObject_Malloc, - PyObject_Free, PyObject_Realloc}; + Object_newPosInf, Object_newNegInf, Object_newObject, + Object_endObject, Object_newArray, Object_endArray, + Object_newInteger, Object_newLong, Object_newDouble, + Object_releaseObject, PyObject_Malloc, PyObject_Free, + PyObject_Realloc}; dec.preciseFloat = 0; dec.prv = NULL; diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 37e9c36a85327..c5ac279ed3243 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -54,13 +54,12 @@ static PyTypeObject *cls_dataframe; static PyTypeObject *cls_series; static PyTypeObject *cls_index; static PyTypeObject *cls_nat; -PyObject *cls_timestamp; PyObject *cls_timedelta; npy_int64 get_nat(void) { return NPY_MIN_INT64; } -typedef void *(*PFN_PyTypeToJSON)(JSOBJ obj, JSONTypeContext *ti, - void *outValue, size_t *_outLen); +typedef char *(*PFN_PyTypeToUTF8)(JSOBJ obj, JSONTypeContext *ti, + size_t *_outLen); typedef struct __NpyArrContext { PyObject *array; @@ -94,7 +93,7 @@ typedef struct __TypeContext { 
JSPFN_ITERNEXT iterNext; JSPFN_ITERGETNAME iterGetName; JSPFN_ITERGETVALUE iterGetValue; - PFN_PyTypeToJSON PyTypeToJSON; + PFN_PyTypeToUTF8 PyTypeToUTF8; PyObject *newObj; PyObject *dictObj; Py_ssize_t index; @@ -166,7 +165,6 @@ void *initObjToJSON(void) { cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); cls_series = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); - cls_timestamp = PyObject_GetAttrString(mod_pandas, "Timestamp"); cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); Py_DECREF(mod_pandas); } @@ -178,9 +176,8 @@ void *initObjToJSON(void) { Py_DECREF(mod_nattype); } - /* Initialise numpy API and use 2/3 compatible return */ + /* Initialise numpy API */ import_array(); - return NUMPY_IMPORT_ARRAY_RETVAL; } static TypeContext *createTypeContext(void) { @@ -243,65 +240,39 @@ static int scaleNanosecToUnit(npy_int64 *value, NPY_DATETIMEUNIT unit) { static PyObject *get_values(PyObject *obj) { PyObject *values = NULL; - values = PyObject_GetAttrString(obj, "values"); PRINTMARK(); - if (values && !PyArray_CheckExact(values)) { - - if (PyObject_HasAttrString(values, "to_numpy")) { - values = PyObject_CallMethod(values, "to_numpy", NULL); - } - - if (PyObject_HasAttrString(values, "values")) { - PyObject *subvals = get_values(values); - PyErr_Clear(); - PRINTMARK(); - // subvals are sometimes missing a dimension - if (subvals) { - PyArrayObject *reshape = (PyArrayObject *)subvals; - PyObject *shape = PyObject_GetAttrString(obj, "shape"); - PyArray_Dims dims; - PRINTMARK(); - - if (!shape || !PyArray_IntpConverter(shape, &dims)) { - subvals = NULL; - } else { - subvals = PyArray_Newshape(reshape, &dims, NPY_ANYORDER); - PyDimMem_FREE(dims.ptr); - } - Py_DECREF(reshape); - Py_XDECREF(shape); - } - Py_DECREF(values); - values = subvals; - } else { - PRINTMARK(); - Py_DECREF(values); - values = NULL; - } - } - - if (!values && PyObject_HasAttrString(obj, "_internal_get_values")) { + if (PyObject_HasAttrString(obj, "_internal_get_values")) { PRINTMARK(); values = PyObject_CallMethod(obj, "_internal_get_values", NULL); - if (values && !PyArray_CheckExact(values)) { + + if (values == NULL) { + // Clear so we can subsequently try another method + PyErr_Clear(); + } else if (!PyArray_CheckExact(values)) { + // Didn't get a numpy array, so keep trying PRINTMARK(); Py_DECREF(values); values = NULL; } } - if (!values && PyObject_HasAttrString(obj, "get_block_values")) { + if ((values == NULL) && PyObject_HasAttrString(obj, "get_block_values")) { PRINTMARK(); values = PyObject_CallMethod(obj, "get_block_values", NULL); - if (values && !PyArray_CheckExact(values)) { + + if (values == NULL) { + // Clear so we can subsequently try another method + PyErr_Clear(); + } else if (!PyArray_CheckExact(values)) { + // Didn't get a numpy array, so keep trying PRINTMARK(); Py_DECREF(values); values = NULL; } } - if (!values) { + if (values == NULL) { PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj)); PyObject *repr; PRINTMARK(); @@ -396,96 +367,129 @@ static PyObject *get_item(PyObject *obj, Py_ssize_t i) { return ret; } -static void *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, +static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), size_t *_outLen) { PyObject *obj = (PyObject *)_obj; *_outLen = PyBytes_GET_SIZE(obj); return PyBytes_AS_STRING(obj); } -static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, +static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), size_t 
*_outLen) { - return PyUnicode_AsUTF8AndSize(_obj, _outLen); + return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen); } -static void *PandasDateTimeStructToJSON(npy_datetimestruct *dts, - JSONTypeContext *tc, void *outValue, - size_t *_outLen) { +/* Converts the int64_t representation of a datetime to ISO; mutates len */ +static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) { + npy_datetimestruct dts; + int ret_code; + + pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts); + + *len = (size_t)get_datetime_iso_8601_strlen(0, base); + char *result = PyObject_Malloc(*len); + + if (result == NULL) { + PyErr_NoMemory(); + return NULL; + } + + ret_code = make_iso_8601_datetime(&dts, result, *len, base); + if (ret_code != 0) { + PyErr_SetString(PyExc_ValueError, + "Could not convert datetime value to string"); + PyObject_Free(result); + } + + // Note that get_datetime_iso_8601_strlen just gives a generic size + // for ISO string conversion, not the actual size used + *len = strlen(result); + return result; +} + +/* JSON callback. returns a char* and mutates the pointer to *len */ +static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), + JSONTypeContext *tc, size_t *len) { NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + return int64ToIso(GET_TC(tc)->longValue, base, len); +} - if (((PyObjectEncoder *)tc->encoder)->datetimeIso) { - PRINTMARK(); - *_outLen = (size_t)get_datetime_iso_8601_strlen(0, base); - GET_TC(tc)->cStr = PyObject_Malloc(sizeof(char) * (*_outLen)); - if (!GET_TC(tc)->cStr) { - PyErr_NoMemory(); - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - return NULL; - } +static npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { + scaleNanosecToUnit(&dt, base); + return dt; +} - if (!make_iso_8601_datetime(dts, GET_TC(tc)->cStr, *_outLen, base)) { - PRINTMARK(); - *_outLen = strlen(GET_TC(tc)->cStr); - return GET_TC(tc)->cStr; - } else { - PRINTMARK(); +/* Convert PyDatetime To ISO C-string. mutates len */ +static char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, + size_t *len) { + npy_datetimestruct dts; + int ret; + + ret = convert_pydatetime_to_datetimestruct(obj, &dts); + if (ret != 0) { + if (!PyErr_Occurred()) { PyErr_SetString(PyExc_ValueError, - "Could not convert datetime value to string"); - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - PyObject_Free(GET_TC(tc)->cStr); - return NULL; + "Could not convert PyDateTime to numpy datetime"); } - } else { + return NULL; + } + + *len = (size_t)get_datetime_iso_8601_strlen(0, base); + char *result = PyObject_Malloc(*len); + ret = make_iso_8601_datetime(&dts, result, *len, base); + + if (ret != 0) { PRINTMARK(); - *((JSINT64 *)outValue) = npy_datetimestruct_to_datetime(base, dts); + PyErr_SetString(PyExc_ValueError, + "Could not convert datetime value to string"); + PyObject_Free(result); return NULL; } + + // Note that get_datetime_iso_8601_strlen just gives a generic size + // for ISO string conversion, not the actual size used + *len = strlen(result); + return result; } -static void *NpyDateTimeScalarToJSON(JSOBJ _obj, JSONTypeContext *tc, - void *outValue, size_t *_outLen) { - npy_datetimestruct dts; - PyDatetimeScalarObject *obj = (PyDatetimeScalarObject *)_obj; - PRINTMARK(); - // TODO(anyone): Does not appear to be reached in tests. 
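The new int64ToIso and NpyDateTimeToEpoch helpers above convert a raw int64 nanosecond timestamp either into an ISO-8601 string or into an epoch value in the requested unit. The same two conversions can be sketched in Python with NumPy (function names here are illustrative only):

import numpy as np

def int64_to_iso(value_ns: int, unit: str = "s") -> str:
    # Interpret the int64 as nanoseconds since the epoch, then render an
    # ISO-8601 string truncated to the requested unit, analogous to
    # make_iso_8601_datetime in the C code.
    return str(np.datetime_as_string(np.datetime64(value_ns, "ns"), unit=unit))

def ns_to_epoch(value_ns: int, unit: str = "ms") -> int:
    # Analogue of scaling nanoseconds down to the encoder's date unit.
    return int(np.datetime64(value_ns, "ns").astype(f"datetime64[{unit}]").astype(np.int64))

int64_to_iso(1_577_836_800_000_000_000)  # '2020-01-01T00:00:00'
ns_to_epoch(1_577_836_800_000_000_000)   # 1577836800000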
+/* JSON callback */ +static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc, + size_t *len) { + + if (!PyDate_Check(obj)) { + PyErr_SetString(PyExc_TypeError, "Expected date object"); + return NULL; + } - pandas_datetime_to_datetimestruct(obj->obval, - (NPY_DATETIMEUNIT)obj->obmeta.base, &dts); - return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); + NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + return PyDateTimeToIso(obj, base, len); } -static void *PyDateTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *_outLen) { +static npy_datetime PyDateTimeToEpoch(PyObject *obj, NPY_DATETIMEUNIT base) { npy_datetimestruct dts; - PyDateTime_Date *obj = (PyDateTime_Date *)_obj; + int ret; - PRINTMARK(); + if (!PyDate_Check(obj)) { + // TODO: raise TypeError + } + PyDateTime_Date *dt = (PyDateTime_Date *)obj; - if (!convert_pydatetime_to_datetimestruct(obj, &dts)) { - PRINTMARK(); - return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); - } else { + ret = convert_pydatetime_to_datetimestruct(dt, &dts); + if (ret != 0) { if (!PyErr_Occurred()) { PyErr_SetString(PyExc_ValueError, - "Could not convert datetime value to string"); + "Could not convert PyDateTime to numpy datetime"); } - ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - return NULL; + // TODO: is setting errMsg required? + //((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; + // return NULL; } -} -static void *NpyDatetime64ToJSON(JSOBJ _obj, JSONTypeContext *tc, - void *outValue, size_t *_outLen) { - npy_datetimestruct dts; - PRINTMARK(); - - pandas_datetime_to_datetimestruct((npy_datetime)GET_TC(tc)->longValue, - NPY_FR_ns, &dts); - return PandasDateTimeStructToJSON(&dts, tc, outValue, _outLen); + npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts); + return NpyDateTimeToEpoch(npy_dt, base); } -static void *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, - size_t *outLen) { +static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { PyObject *obj = (PyObject *)_obj; PyObject *str; PyObject *tmp; @@ -509,54 +513,15 @@ static void *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, GET_TC(tc)->newObj = str; *outLen = PyBytes_GET_SIZE(str); - outValue = (void *)PyBytes_AS_STRING(str); + char *outValue = PyBytes_AS_STRING(str); return outValue; } -static int NpyTypeToJSONType(PyObject *obj, JSONTypeContext *tc, int npyType, - void *value) { - PyArray_VectorUnaryFunc *castfunc; - npy_int64 longVal; - - if (PyTypeNum_ISDATETIME(npyType)) { - PRINTMARK(); - castfunc = - PyArray_GetCastFunc(PyArray_DescrFromType(npyType), NPY_INT64); - if (!castfunc) { - PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long", - npyType); - } - castfunc(value, &longVal, 1, NULL, NULL); - if (longVal == get_nat()) { - PRINTMARK(); - return JT_NULL; - } - - if (((PyObjectEncoder *)tc->encoder)->datetimeIso) { - GET_TC(tc)->longValue = (JSINT64)longVal; - GET_TC(tc)->PyTypeToJSON = NpyDatetime64ToJSON; - return JT_UTF8; - } else { - NPY_DATETIMEUNIT unit = - ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - if (!scaleNanosecToUnit(&longVal, unit)) { - GET_TC(tc)->longValue = longVal; - return JT_LONG; - } else { - // TODO: some kind of error handling - } - } - } - - PRINTMARK(); - return JT_INVALID; -} - //============================================================================= // Numpy array iteration functions //============================================================================= -static void 
NpyArr_freeItemValue(JSOBJ _obj, JSONTypeContext *tc) { +static void NpyArr_freeItemValue(JSOBJ Py_UNUSED(_obj), JSONTypeContext *tc) { if (GET_TC(tc)->npyarr && GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) { PRINTMARK(); @@ -565,7 +530,9 @@ static void NpyArr_freeItemValue(JSOBJ _obj, JSONTypeContext *tc) { } } -int NpyArr_iterNextNone(JSOBJ _obj, JSONTypeContext *tc) { return 0; } +int NpyArr_iterNextNone(JSOBJ Py_UNUSED(_obj), JSONTypeContext *Py_UNUSED(tc)) { + return 0; +} void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { PyArrayObject *obj; @@ -622,7 +589,10 @@ void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) { } } -void NpyArrPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); } +void NpyArrPassThru_iterBegin(JSOBJ Py_UNUSED(obj), + JSONTypeContext *Py_UNUSED(tc)) { + PRINTMARK(); +} void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; @@ -701,12 +671,13 @@ int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { return 1; } -JSOBJ NpyArr_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PRINTMARK(); return GET_TC(tc)->itemValue; } -char *NpyArr_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; npy_intp idx; PRINTMARK(); @@ -760,7 +731,8 @@ int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { return NpyArr_iterNextItem(obj, tc); } -char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; npy_intp idx; @@ -782,7 +754,7 @@ char *PdBlock_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { return cStr; } -char *PdBlock_iterGetName_Transpose(JSOBJ obj, JSONTypeContext *tc, +char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, size_t *outLen) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; @@ -828,7 +800,7 @@ int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void PdBlockPassThru_iterBegin(JSOBJ obj, JSONTypeContext *tc) { +void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; PRINTMARK(); @@ -1060,13 +1032,14 @@ int Tuple_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void Tuple_iterEnd(JSOBJ obj, JSONTypeContext *tc) {} +void Tuple_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {} -JSOBJ Tuple_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ Tuple_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *Tuple_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *Tuple_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -1079,7 +1052,7 @@ void Iter_iterBegin(JSOBJ obj, JSONTypeContext *tc) { GET_TC(tc)->iterator = PyObject_GetIter(obj); } -int Iter_iterNext(JSOBJ obj, JSONTypeContext *tc) { +int Iter_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObject *item; if (GET_TC(tc)->itemValue) { @@ -1097,7 +1070,7 @@ int Iter_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void Iter_iterEnd(JSOBJ obj, JSONTypeContext *tc) { +void Iter_iterEnd(JSOBJ 
Py_UNUSED(obj), JSONTypeContext *tc) { if (GET_TC(tc)->itemValue) { Py_DECREF(GET_TC(tc)->itemValue); GET_TC(tc)->itemValue = NULL; @@ -1109,11 +1082,12 @@ void Iter_iterEnd(JSOBJ obj, JSONTypeContext *tc) { } } -JSOBJ Iter_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ Iter_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *Iter_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *Iter_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -1129,7 +1103,7 @@ void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); } -void Dir_iterEnd(JSOBJ obj, JSONTypeContext *tc) { +void Dir_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { if (GET_TC(tc)->itemValue) { Py_DECREF(GET_TC(tc)->itemValue); GET_TC(tc)->itemValue = NULL; @@ -1215,12 +1189,13 @@ int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { return 1; } -JSOBJ Dir_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PRINTMARK(); return GET_TC(tc)->itemValue; } -char *Dir_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { PRINTMARK(); *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); @@ -1246,20 +1221,21 @@ int List_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void List_iterEnd(JSOBJ obj, JSONTypeContext *tc) {} +void List_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {} -JSOBJ List_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ List_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *List_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *List_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), + size_t *Py_UNUSED(outLen)) { return NULL; } //============================================================================= // pandas Index iteration functions //============================================================================= -void Index_iterBegin(JSOBJ obj, JSONTypeContext *tc) { +void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->index = 0; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); if (!GET_TC(tc)->cStr) { @@ -1295,13 +1271,16 @@ int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void Index_iterEnd(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); } +void Index_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) { + PRINTMARK(); +} -JSOBJ Index_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *Index_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *Index_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1309,7 +1288,7 @@ char *Index_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { //============================================================================= // pandas Series iteration functions //============================================================================= -void Series_iterBegin(JSOBJ obj, JSONTypeContext *tc) { +void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; GET_TC(tc)->cStr = 
PyObject_Malloc(20 * sizeof(char)); @@ -1350,17 +1329,18 @@ int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void Series_iterEnd(JSOBJ obj, JSONTypeContext *tc) { +void Series_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; enc->outputFormat = enc->originalOutputFormat; PRINTMARK(); } -JSOBJ Series_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *Series_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1368,7 +1348,7 @@ char *Series_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { //============================================================================= // pandas DataFrame iteration functions //============================================================================= -void DataFrame_iterBegin(JSOBJ obj, JSONTypeContext *tc) { +void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; GET_TC(tc)->index = 0; GET_TC(tc)->cStr = PyObject_Malloc(20 * sizeof(char)); @@ -1414,17 +1394,18 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void DataFrame_iterEnd(JSOBJ obj, JSONTypeContext *tc) { +void DataFrame_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; enc->outputFormat = enc->originalOutputFormat; PRINTMARK(); } -JSOBJ DataFrame_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = strlen(GET_TC(tc)->cStr); return GET_TC(tc)->cStr; } @@ -1434,12 +1415,12 @@ char *DataFrame_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { // itemName might converted to string (Python_Str). Do refCounting // itemValue is borrowed from object (which is dict). 
No refCounting //============================================================================= -void Dict_iterBegin(JSOBJ obj, JSONTypeContext *tc) { +void Dict_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->index = 0; PRINTMARK(); } -int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc) { +int Dict_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObject *itemNameTmp; if (GET_TC(tc)->itemName) { @@ -1467,7 +1448,7 @@ int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 1; } -void Dict_iterEnd(JSOBJ obj, JSONTypeContext *tc) { +void Dict_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { if (GET_TC(tc)->itemName) { Py_DECREF(GET_TC(tc)->itemName); GET_TC(tc)->itemName = NULL; @@ -1476,11 +1457,12 @@ void Dict_iterEnd(JSOBJ obj, JSONTypeContext *tc) { PRINTMARK(); } -JSOBJ Dict_iterGetValue(JSOBJ obj, JSONTypeContext *tc) { +JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; } -char *Dict_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { +char *Dict_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, + size_t *outLen) { *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); } @@ -1517,10 +1499,12 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, npy_intp num) { // NOTE this function steals a reference to labels. PyObject *item = NULL; - npy_intp i, stride, len; + size_t len; + npy_intp i, stride; char **ret; char *dataptr, *cLabel; int type_num; + NPY_DATETIMEUNIT base = enc->datetimeUnit; PRINTMARK(); if (!labels) { @@ -1558,79 +1542,85 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, break; } - // TODO: for any matches on type_num (date and timedeltas) should use a - // vectorized solution to convert to epoch or iso formats - if (enc->datetimeIso && - (type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { - PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); - if (td == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL); - Py_DECREF(td); - if (iso == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - cLabel = (char *)PyUnicode_AsUTF8(iso); - Py_DECREF(iso); - len = strlen(cLabel); - } else if (PyTypeNum_ISDATETIME(type_num) || PyDateTime_Check(item) || - PyDate_Check(item)) { - PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item); - if (ts == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; + int is_datetimelike = 0; + npy_int64 nanosecVal; + if (PyTypeNum_ISDATETIME(type_num)) { + is_datetimelike = 1; + PyArray_VectorUnaryFunc *castfunc = + PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, + "Cannot cast numpy dtype %d to long", + enc->npyType); } - - if (enc->datetimeIso) { - PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL); - Py_DECREF(ts); - if (iso == NULL) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; + castfunc(dataptr, &nanosecVal, 1, NULL, NULL); + } else if (PyDate_Check(item) || PyDelta_Check(item)) { + is_datetimelike = 1; + if (PyObject_HasAttrString(item, "value")) { + nanosecVal = get_long_attr(item, "value"); + } else { + if (PyDelta_Check(item)) { + nanosecVal = total_seconds(item) * + 1000000000LL; // nanoseconds per second + } else { + // datetime.* objects don't follow above rules + nanosecVal = 
PyDateTimeToEpoch(item, NPY_FR_ns); } + } + } - cLabel = (char *)PyUnicode_AsUTF8(iso); - Py_DECREF(iso); - len = strlen(cLabel); + if (is_datetimelike) { + if (nanosecVal == get_nat()) { + len = 5; // TODO: shouldn't require extra space for terminator + cLabel = PyObject_Malloc(len); + strncpy(cLabel, "null", len); } else { - npy_int64 value; - // TODO: refactor to not duplicate what goes on in - // beginTypeContext - if (PyObject_HasAttrString(ts, "value")) { - PRINTMARK(); - value = get_long_attr(ts, "value"); + if (enc->datetimeIso) { + // TODO: Vectorized Timedelta function + if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { + PyObject *td = + PyObject_CallFunction(cls_timedelta, "(O)", item); + if (td == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + PyObject *iso = + PyObject_CallMethod(td, "isoformat", NULL); + Py_DECREF(td); + if (iso == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } + + len = strlen(PyUnicode_AsUTF8(iso)); + cLabel = PyObject_Malloc(len + 1); + memcpy(cLabel, PyUnicode_AsUTF8(iso), len + 1); + Py_DECREF(iso); + } else { + if (type_num == NPY_DATETIME) { + cLabel = int64ToIso(nanosecVal, base, &len); + } else { + cLabel = PyDateTimeToIso((PyDateTime_Date *)item, + base, &len); + } + } + if (cLabel == NULL) { + Py_DECREF(item); + NpyArr_freeLabels(ret, num); + ret = 0; + break; + } } else { - PRINTMARK(); - value = total_seconds(ts) * - 1000000000LL; // nanoseconds per second + cLabel = PyObject_Malloc(21); // 21 chars for int64 + sprintf(cLabel, "%" NPY_DATETIME_FMT, + NpyDateTimeToEpoch(nanosecVal, base)); + len = strlen(cLabel); } - Py_DECREF(ts); - - NPY_DATETIMEUNIT unit = enc->datetimeUnit; - if (scaleNanosecToUnit(&value, unit) != 0) { - Py_DECREF(item); - NpyArr_freeLabels(ret, num); - ret = 0; - break; - } - - char buf[21] = {0}; // 21 chars for 2**63 as string - cLabel = buf; - sprintf(buf, "%" NPY_INT64_FMT, value); - len = strlen(cLabel); } } else { // Fallback to string representation PyObject *str = PyObject_Str(item); @@ -1651,6 +1641,10 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, ret[i] = PyObject_Malloc(len + 1); memcpy(ret[i], cLabel, len + 1); + if (is_datetimelike) { + PyObject_Free(cLabel); + } + if (PyErr_Occurred()) { NpyArr_freeLabels(ret, num); ret = 0; @@ -1705,29 +1699,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { obj = (PyObject *)_obj; enc = (PyObjectEncoder *)tc->encoder; - if (enc->npyType >= 0) { - PRINTMARK(); - tc->prv = &(enc->basicTypeContext); - tc->type = NpyTypeToJSONType(obj, tc, enc->npyType, enc->npyValue); - - if (tc->type == JT_INVALID) { - if (enc->defaultHandler) { - enc->npyType = -1; - PRINTMARK(); - Object_invokeDefaultHandler( - enc->npyCtxtPassthru->getitem(enc->npyValue, - enc->npyCtxtPassthru->array), - enc); - } else { - PyErr_Format(PyExc_RuntimeError, "Unhandled numpy dtype %d", - enc->npyType); - } - } - enc->npyCtxtPassthru = NULL; - enc->npyType = -1; - return; - } - if (PyBool_Check(obj)) { PRINTMARK(); tc->type = (obj == Py_True) ? 
JT_TRUE : JT_FALSE; @@ -1745,6 +1716,44 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } tc->prv = pc; + if (PyTypeNum_ISDATETIME(enc->npyType)) { + PRINTMARK(); + int64_t longVal; + PyArray_VectorUnaryFunc *castfunc = + PyArray_GetCastFunc(PyArray_DescrFromType(enc->npyType), NPY_INT64); + if (!castfunc) { + PyErr_Format(PyExc_ValueError, "Cannot cast numpy dtype %d to long", + enc->npyType); + } + castfunc(enc->npyValue, &longVal, 1, NULL, NULL); + if (longVal == get_nat()) { + PRINTMARK(); + tc->type = JT_NULL; + } else { + + if (enc->datetimeIso) { + PRINTMARK(); + pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; + // Currently no way to pass longVal to iso function, so use + // state management + GET_TC(tc)->longValue = longVal; + tc->type = JT_UTF8; + } else { + PRINTMARK(); + NPY_DATETIMEUNIT base = + ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + GET_TC(tc)->longValue = NpyDateTimeToEpoch(longVal, base); + tc->type = JT_LONG; + } + } + + // TODO: this prevents infinite loop with mixed-type DataFrames; + // refactor + enc->npyCtxtPassthru = NULL; + enc->npyType = -1; + return; + } + if (PyIter_Check(obj) || (PyArray_Check(obj) && !PyArray_CheckScalar(obj))) { PRINTMARK(); @@ -1776,12 +1785,12 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } else if (PyBytes_Check(obj)) { PRINTMARK(); - pc->PyTypeToJSON = PyBytesToUTF8; + pc->PyTypeToUTF8 = PyBytesToUTF8; tc->type = JT_UTF8; return; } else if (PyUnicode_Check(obj)) { PRINTMARK(); - pc->PyTypeToJSON = PyUnicodeToUTF8; + pc->PyTypeToUTF8 = PyUnicodeToUTF8; tc->type = JT_UTF8; return; } else if (PyObject_TypeCheck(obj, type_decimal)) { @@ -1799,19 +1808,19 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { PRINTMARK(); if (enc->datetimeIso) { PRINTMARK(); - pc->PyTypeToJSON = PyDateTimeToJSON; + pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; tc->type = JT_UTF8; } else { PRINTMARK(); - // TODO: last argument here is unused; should decouple string - // from long datetimelike conversion routines - PyDateTimeToJSON(obj, tc, &(GET_TC(tc)->longValue), 0); + NPY_DATETIMEUNIT base = + ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base); tc->type = JT_LONG; } return; } else if (PyTime_Check(obj)) { PRINTMARK(); - pc->PyTypeToJSON = PyTimeToJSON; + pc->PyTypeToUTF8 = PyTimeToJSON; tc->type = JT_UTF8; return; } else if (PyArray_IsScalar(obj, Datetime)) { @@ -1823,8 +1832,17 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } PRINTMARK(); - pc->PyTypeToJSON = NpyDateTimeScalarToJSON; - tc->type = enc->datetimeIso ? 
JT_UTF8 : JT_LONG; + if (enc->datetimeIso) { + PRINTMARK(); + pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; + tc->type = JT_UTF8; + } else { + PRINTMARK(); + NPY_DATETIMEUNIT base = + ((PyObjectEncoder *)tc->encoder)->datetimeUnit; + GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base); + tc->type = JT_LONG; + } return; } else if (PyDelta_Check(obj)) { if (PyObject_HasAttrString(obj, "value")) { @@ -2203,7 +2221,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } -void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) { +void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PRINTMARK(); if (tc->prv) { Py_XDECREF(GET_TC(tc)->newObj); @@ -2226,14 +2244,14 @@ void Object_endTypeContext(JSOBJ obj, JSONTypeContext *tc) { const char *Object_getStringValue(JSOBJ obj, JSONTypeContext *tc, size_t *_outLen) { - return GET_TC(tc)->PyTypeToJSON(obj, tc, NULL, _outLen); + return GET_TC(tc)->PyTypeToUTF8(obj, tc, _outLen); } -JSINT64 Object_getLongValue(JSOBJ obj, JSONTypeContext *tc) { +JSINT64 Object_getLongValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->longValue; } -double Object_getDoubleValue(JSOBJ obj, JSONTypeContext *tc) { +double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->doubleValue; } @@ -2259,7 +2277,8 @@ char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen) { return GET_TC(tc)->iterGetName(obj, tc, outLen); } -PyObject *objToJSON(PyObject *self, PyObject *args, PyObject *kwargs) { +PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, + PyObject *kwargs) { static char *kwlist[] = {"obj", "ensure_ascii", "double_precision", diff --git a/pandas/_libs/src/ujson/python/ujson.c b/pandas/_libs/src/ujson/python/ujson.c index 39320d73d0cab..4a88fb7a4e849 100644 --- a/pandas/_libs/src/ujson/python/ujson.c +++ b/pandas/_libs/src/ujson/python/ujson.c @@ -65,35 +65,15 @@ static PyMethodDef ujsonMethods[] = { {NULL, NULL, 0, NULL} /* Sentinel */ }; -static struct PyModuleDef moduledef = { - PyModuleDef_HEAD_INIT, - "_libjson", - 0, /* m_doc */ - -1, /* m_size */ - ujsonMethods, /* m_methods */ - NULL, /* m_reload */ - NULL, /* m_traverse */ - NULL, /* m_clear */ - NULL /* m_free */ +static PyModuleDef moduledef = { + .m_base = PyModuleDef_HEAD_INIT, + .m_name = "_libjson", + .m_methods = ujsonMethods }; -#define PYMODINITFUNC PyMODINIT_FUNC PyInit_json(void) -#define PYMODULE_CREATE() PyModule_Create(&moduledef) -#define MODINITERROR return NULL -PYMODINITFUNC { - PyObject *module; - PyObject *version_string; +PyMODINIT_FUNC PyInit_json(void) { + initObjToJSON(); // TODO: clean up, maybe via tp_free? 
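At the Python level, the encoder branches rewritten above correspond to the existing date_format/date_unit options of to_json: the ISO branch now goes through PyDateTimeToIsoCallback, while the epoch branch scales the int64 value via NpyDateTimeToEpoch. A small usage illustration (output omitted, since it depends on this patch and the pandas version in use):

import pandas as pd

df = pd.DataFrame({"ts": pd.to_datetime(["2020-01-01", "2020-01-02"])})
df.to_json(date_format="epoch", date_unit="ms")  # datetimes as integer epoch values
df.to_json(date_format="iso")                    # datetimes as ISO-8601 strings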
+ return PyModuleDef_Init(&moduledef); - initObjToJSON(); - module = PYMODULE_CREATE(); - - if (module == NULL) { - MODINITERROR; - } - - version_string = PyUnicode_FromString(UJSON_VERSION); - PyModule_AddObject(module, "__version__", version_string); - - return module; } diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 026bd7a44a509..0e57b563d4d25 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -123,23 +123,23 @@ cpdef assert_almost_equal(a, b, if isiterable(a): if not isiterable(b): - from pandas.util.testing import assert_class_equal + from pandas._testing import assert_class_equal # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) - assert has_length(a) and has_length(b), ("Can't compare objects without " - "length, one or both is invalid: " - f"({a}, {b})") + assert has_length(a) and has_length(b), ( + f"Can't compare objects without length, one or both is invalid: ({a}, {b})" + ) if a_is_ndarray and b_is_ndarray: na, nb = a.size, b.size if a.shape != b.shape: - from pandas.util.testing import raise_assert_detail + from pandas._testing import raise_assert_detail raise_assert_detail( obj, f'{obj} shapes are different', a.shape, b.shape) if check_dtype and not is_dtype_equal(a.dtype, b.dtype): - from pandas.util.testing import assert_attr_equal + from pandas._testing import assert_attr_equal assert_attr_equal('dtype', a, b, obj=obj) if array_equivalent(a, b, strict_nan=True): @@ -149,7 +149,7 @@ cpdef assert_almost_equal(a, b, na, nb = len(a), len(b) if na != nb: - from pandas.util.testing import raise_assert_detail + from pandas._testing import raise_assert_detail # if we have a small diff set, print it if abs(na - nb) < 10: @@ -168,7 +168,7 @@ cpdef assert_almost_equal(a, b, diff += 1 if is_unequal: - from pandas.util.testing import raise_assert_detail + from pandas._testing import raise_assert_detail msg = (f"{obj} values are different " f"({np.round(diff * 100.0 / na, 5)} %)") raise_assert_detail(obj, msg, lobj, robj) @@ -176,7 +176,7 @@ cpdef assert_almost_equal(a, b, return True elif isiterable(b): - from pandas.util.testing import assert_class_equal + from pandas._testing import assert_class_equal # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index cbe6dd6c2322d..53e3354ca8eb6 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -120,8 +120,7 @@ def ints_to_pydatetime(const int64_t[:] arr, object tz=None, object freq=None, elif box == "datetime": func_create = create_datetime_from_ts else: - raise ValueError("box must be one of 'datetime', 'date', 'time' or" - " 'timestamp'") + raise ValueError("box must be one of 'datetime', 'date', 'time' or 'timestamp'") if is_utc(tz) or tz is None: for i in range(n): @@ -296,10 +295,15 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, return result -def array_with_unit_to_datetime(ndarray values, object unit, +def array_with_unit_to_datetime(ndarray values, ndarray mask, object unit, str errors='coerce'): """ - convert the ndarray according to the unit + Convert the ndarray to datetime according to the time unit. + + This function converts an array of objects into a numpy array of + datetime64[ns]. 
It returns the converted array + and also returns the timezone offset + if errors: - raise: return converted values or raise OutOfBoundsDatetime if out of range on the conversion or @@ -307,6 +311,18 @@ def array_with_unit_to_datetime(ndarray values, object unit, - ignore: return non-convertible values as the same unit - coerce: NaT for non-convertibles + Parameters + ---------- + values : ndarray of object + Date-like objects to convert + mask : ndarray of bool + Not-a-time mask for non-nullable integer types conversion, + can be None + unit : object + Time unit to use during conversion + errors : str, default 'raise' + Error behavior when parsing + Returns ------- result : ndarray of m8 values @@ -316,7 +332,6 @@ def array_with_unit_to_datetime(ndarray values, object unit, Py_ssize_t i, j, n=len(values) int64_t m ndarray[float64_t] fvalues - ndarray mask bint is_ignore = errors=='ignore' bint is_coerce = errors=='coerce' bint is_raise = errors=='raise' @@ -329,9 +344,13 @@ def array_with_unit_to_datetime(ndarray values, object unit, if unit == 'ns': if issubclass(values.dtype.type, np.integer): - return values.astype('M8[ns]'), tz - # This will return a tz - return array_to_datetime(values.astype(object), errors=errors) + result = values.astype('M8[ns]') + else: + result, tz = array_to_datetime(values.astype(object), errors=errors) + if mask is not None: + iresult = result.view('i8') + iresult[mask] = NPY_NAT + return result, tz m = cast_from_unit(None, unit) @@ -343,7 +362,9 @@ def array_with_unit_to_datetime(ndarray values, object unit, if values.dtype.kind == "i": # Note: this condition makes the casting="same_kind" redundant iresult = values.astype('i8', casting='same_kind', copy=False) - mask = iresult == NPY_NAT + # If no mask, fill mask by comparing to NPY_NAT constant + if mask is None: + mask = iresult == NPY_NAT iresult[mask] = 0 fvalues = iresult.astype('f8') * m need_to_iterate = False diff --git a/pandas/_libs/tslibs/c_timestamp.pyx b/pandas/_libs/tslibs/c_timestamp.pyx index 6e6b809b9b5a6..ed1df5f4fa595 100644 --- a/pandas/_libs/tslibs/c_timestamp.pyx +++ b/pandas/_libs/tslibs/c_timestamp.pyx @@ -57,11 +57,12 @@ def integer_op_not_supported(obj): # the caller; mypy finds this more palatable. cls = type(obj).__name__ + # GH#30886 using an fstring raises SystemError int_addsub_msg = ( - f"Addition/subtraction of integers and integer-arrays with {cls} is " + "Addition/subtraction of integers and integer-arrays with {cls} is " "no longer supported. Instead of adding/subtracting `n`, " "use `n * obj.freq`" - ) + ).format(cls=cls) return TypeError(int_addsub_msg) diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 0b77948027ad7..d4ae3fa8c5b99 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from cpython.datetime cimport datetime, tzinfo +from cpython.datetime cimport datetime from numpy cimport int64_t, int32_t @@ -25,6 +25,4 @@ cdef int64_t get_datetime64_nanos(object val) except? -1 cpdef int64_t pydt_to_i8(object pydt) except? 
-1 -cdef maybe_datetimelike_to_i8(object val) - cpdef datetime localize_pydatetime(datetime dt, object tz) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 2988d7bae9a5e..77f46016ee846 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -29,7 +29,7 @@ from pandas._libs.tslibs.util cimport ( from pandas._libs.tslibs.timedeltas cimport cast_from_unit from pandas._libs.tslibs.timezones cimport ( is_utc, is_tzlocal, is_fixed_offset, get_utcoffset, get_dst_info, - get_timezone, maybe_get_tz, tz_compare) + get_timezone, maybe_get_tz, tz_compare, treat_tz_as_dateutil) from pandas._libs.tslibs.timezones import UTC from pandas._libs.tslibs.parsing import parse_datetime_string @@ -99,6 +99,11 @@ def ensure_datetime64ns(arr: ndarray, copy: bool=True): shape = (arr).shape + if (arr).dtype.byteorder == ">": + # GH#29684 we incorrectly get OutOfBoundsDatetime if we dont swap + dtype = arr.dtype + arr = arr.astype(dtype.newbyteorder("<")) + ivalues = arr.view(np.int64).ravel() result = np.empty(shape, dtype=NS_DTYPE) @@ -202,31 +207,6 @@ def datetime_to_datetime64(object[:] values): return result, inferred_tz -cdef inline maybe_datetimelike_to_i8(object val): - """ - Try to convert to a nanosecond timestamp. Fall back to returning the - input value. - - Parameters - ---------- - val : object - - Returns - ------- - val : int64 timestamp or original input - """ - cdef: - npy_datetimestruct dts - try: - return val.value - except AttributeError: - if is_datetime64_object(val): - return get_datetime64_value(val) - elif PyDateTime_Check(val): - return convert_datetime_to_tsobject(val, None).value - return val - - # ---------------------------------------------------------------------- # _TSObject Conversion @@ -382,6 +362,14 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, obj.tzinfo = tz else: obj.value = pydatetime_to_dt64(ts, &obj.dts) + # GH 24329 When datetime is ambiguous, + # pydatetime_to_dt64 doesn't take DST into account + # but with dateutil timezone, get_utcoffset does + # so we need to correct for it + if treat_tz_as_dateutil(ts.tzinfo): + if ts.tzinfo.is_ambiguous(ts): + dst_offset = ts.tzinfo.dst(ts) + obj.value += int(dst_offset.total_seconds() * 1e9) obj.tzinfo = ts.tzinfo if obj.tzinfo is not None and not is_utc(obj.tzinfo): diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 76a694c64e1fb..67c0f0cc33ab8 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -5,6 +5,9 @@ from cpython.object cimport ( from cpython.datetime cimport (datetime, PyDateTime_Check, PyDelta_Check, PyDateTime_IMPORT) + +from cpython.version cimport PY_MINOR_VERSION + PyDateTime_IMPORT import numpy as np @@ -19,6 +22,7 @@ from pandas._libs.tslibs.util cimport ( get_nat, is_integer_object, is_float_object, is_datetime64_object, is_timedelta64_object) + # ---------------------------------------------------------------------- # Constants nat_strings = {'NaT', 'nat', 'NAT', 'nan', 'NaN', 'NAN'} @@ -427,6 +431,10 @@ class NaTType(_NaT): tzname = _make_error_func('tzname', datetime) utcoffset = _make_error_func('utcoffset', datetime) + # "fromisocalendar" was introduced in 3.8 + if PY_MINOR_VERSION >= 8: + fromisocalendar = _make_error_func('fromisocalendar', datetime) + # ---------------------------------------------------------------------- # The remaining methods have docstrings copy/pasted from the analogous # Timestamp methods. 
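The conversion.pyx hunk above (GH 24329) applies the correction only when treat_tz_as_dateutil(ts.tzinfo) is true, because pydatetime_to_dt64 ignores the DST fold while dateutil's get_utcoffset reports it. A minimal illustrative sketch of the affected path; the zone and wall time are assumptions chosen for demonstration, not taken from the patch:

>>> from datetime import datetime
>>> from dateutil import tz
>>> import pandas as pd
>>> ambiguous = datetime(2018, 11, 4, 1, 30, tzinfo=tz.gettz("US/Eastern"))
>>> ts = pd.Timestamp(ambiguous)  # per the hunk above, the dst() offset is now added for ambiguous times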
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 41420dbceef9d..31dc2945f0395 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -216,7 +216,7 @@ def _get_calendar(weekmask, holidays, calendar): holidays = holidays + calendar.holidays().tolist() except AttributeError: pass - holidays = [_to_dt64(dt, dtype='datetime64[D]') for dt in holidays] + holidays = [_to_dt64D(dt) for dt in holidays] holidays = tuple(sorted(holidays)) kwargs = {'weekmask': weekmask} @@ -227,7 +227,7 @@ def _get_calendar(weekmask, holidays, calendar): return busdaycalendar, holidays -def _to_dt64(dt, dtype='datetime64'): +def _to_dt64D(dt): # Currently # > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]') # numpy.datetime64('2013-05-01T02:00:00.000000+0200') @@ -238,8 +238,8 @@ def _to_dt64(dt, dtype='datetime64'): dt = np.int64(dt).astype('datetime64[ns]') else: dt = np.datetime64(dt) - if dt.dtype.name != dtype: - dt = dt.astype(dtype) + if dt.dtype.name != "datetime64[D]": + dt = dt.astype("datetime64[D]") return dt @@ -933,7 +933,7 @@ def shift_month(stamp: datetime, months: int, cpdef int get_day_of_month(datetime other, day_opt) except? -1: """ - Find the day in `other`'s month that satisfies a DateOffset's onOffset + Find the day in `other`'s month that satisfies a DateOffset's is_on_offset policy, as described by the `day_opt` argument. Parameters diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 3705b0a41fe55..ebdf7a1e29216 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -217,7 +217,7 @@ def parse_datetime_string(date_string: str, freq=None, dayfirst=False, return dt try: - dt, _, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) + dt, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) return dt except DateParseError: raise @@ -280,7 +280,6 @@ cdef parse_datetime_string_with_reso(str date_string, freq=None, dayfirst=False, Returns ------- datetime - datetime/dateutil.parser._result str Inferred resolution of the parsed string. 
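The parsing.pyx changes in this and the following hunks drop the duplicated first element from the return value, so callers now unpack a (parsed, reso) pair instead of a (parsed, parsed, reso) triple (see the period.pyx hunk further down, which switches to dt, reso = parse_time_string(...)). A rough sketch under that assumption; parse_time_string is a private helper and is used here purely for illustration:

>>> from pandas._libs.tslibs.parsing import parse_time_string
>>> parsed, reso = parse_time_string("2019")  # reso is the inferred resolution, 'year' for a bare year string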
@@ -297,7 +296,7 @@ cdef parse_datetime_string_with_reso(str date_string, freq=None, dayfirst=False, parsed, reso = _parse_delimited_date(date_string, dayfirst) if parsed is not None: - return parsed, parsed, reso + return parsed, reso try: return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) @@ -315,7 +314,7 @@ cdef parse_datetime_string_with_reso(str date_string, freq=None, dayfirst=False, raise DateParseError(err) if parsed is None: raise DateParseError(f"Could not parse {date_string}") - return parsed, parsed, reso + return parsed, reso cpdef bint _does_string_look_like_datetime(str py_string): @@ -375,7 +374,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, assert isinstance(date_string, str) if date_string in nat_strings: - return NaT, NaT, '' + return NaT, '' date_string = date_string.upper() date_len = len(date_string) @@ -384,7 +383,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, # parse year only like 2000 try: ret = default.replace(year=int(date_string)) - return ret, ret, 'year' + return ret, 'year' except ValueError: pass @@ -441,7 +440,7 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, month = (quarter - 1) * 3 + 1 ret = default.replace(year=year, month=month) - return ret, ret, 'quarter' + return ret, 'quarter' except DateParseError: raise @@ -454,14 +453,14 @@ cdef inline object _parse_dateabbr_string(object date_string, object default, month = int(date_string[4:6]) try: ret = default.replace(year=year, month=month) - return ret, ret, 'month' + return ret, 'month' except ValueError: pass for pat in ['%Y-%m', '%b %Y', '%b-%Y']: try: ret = datetime.strptime(date_string, pat) - return ret, ret, 'month' + return ret, 'month' except ValueError: pass diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index a8dabac1527b5..3dd560ece188d 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1212,8 +1212,7 @@ cdef str period_format(int64_t value, int freq, object fmt=None): elif freq_group == 4000: # WK left = period_asfreq(value, freq, 6000, 0) right = period_asfreq(value, freq, 6000, 1) - return '%s/%s' % (period_format(left, 6000), - period_format(right, 6000)) + return f"{period_format(left, 6000)}/{period_format(right, 6000)}" elif (freq_group == 5000 # BUS or freq_group == 6000): # DAY fmt = b'%Y-%m-%d' @@ -1230,7 +1229,7 @@ cdef str period_format(int64_t value, int freq, object fmt=None): elif freq_group == 12000: # NANOSEC fmt = b'%Y-%m-%d %H:%M:%S.%n' else: - raise ValueError(f'Unknown freq: {freq}') + raise ValueError(f"Unknown freq: {freq}") return _period_strftime(value, freq, fmt) @@ -1276,15 +1275,15 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt): if i == 0: repl = str(quarter) elif i == 1: # %f, 2-digit year - repl = f'{(year % 100):02d}' + repl = f"{(year % 100):02d}" elif i == 2: repl = str(year) elif i == 3: - repl = f'{(value % 1_000):03d}' + repl = f"{(value % 1_000):03d}" elif i == 4: - repl = f'{(value % 1_000_000):06d}' + repl = f"{(value % 1_000_000):06d}" elif i == 5: - repl = f'{(value % 1_000_000_000):09d}' + repl = f"{(value % 1_000_000_000):09d}" result = result.replace(str_extra_fmts[i], repl) @@ -1392,7 +1391,7 @@ def get_period_field_arr(int code, int64_t[:] arr, int freq): func = _get_accessor_func(code) if func is NULL: - raise ValueError(f'Unrecognized period code: {code}') + raise ValueError(f"Unrecognized period code: {code}") sz = len(arr) out = np.empty(sz, 
dtype=np.int64) @@ -1579,8 +1578,8 @@ cdef class _Period: freq = to_offset(freq) if freq.n <= 0: - raise ValueError(f'Frequency must be positive, because it ' - f'represents span: {freq.freqstr}') + raise ValueError("Frequency must be positive, because it " + f"represents span: {freq.freqstr}") return freq @@ -1614,8 +1613,8 @@ cdef class _Period: return NotImplemented elif op == Py_NE: return NotImplemented - raise TypeError(f'Cannot compare type {type(self).__name__} ' - f'with type {type(other).__name__}') + raise TypeError(f"Cannot compare type {type(self).__name__} " + f"with type {type(other).__name__}") def __hash__(self): return hash((self.ordinal, self.freqstr)) @@ -1633,8 +1632,8 @@ cdef class _Period: if nanos % offset_nanos == 0: ordinal = self.ordinal + (nanos // offset_nanos) return Period(ordinal=ordinal, freq=self.freq) - raise IncompatibleFrequency(f'Input cannot be converted to ' - f'Period(freq={self.freqstr})') + raise IncompatibleFrequency("Input cannot be converted to " + f"Period(freq={self.freqstr})") elif util.is_offset_object(other): freqstr = other.rule_code base = get_base_alias(freqstr) @@ -2467,7 +2466,7 @@ class Period(_Period): if util.is_integer_object(value): value = str(value) value = value.upper() - dt, _, reso = parse_time_string(value, freq) + dt, reso = parse_time_string(value, freq) if dt is NaT: ordinal = NPY_NAT diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index fda508e51e48f..5508b208de00a 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -278,8 +278,8 @@ def array_strptime(object[:] values, object fmt, "the ISO year directive '%G' and a weekday " "directive '%A', '%a', '%w', or '%u'.") else: - raise ValueError("ISO week directive '%V' is incompatible with" - " the year directive '%Y'. Use the ISO year " + raise ValueError("ISO week directive '%V' is incompatible with " + "the year directive '%Y'. Use the ISO year " "'%G' instead.") # If we know the wk of the year and what day of that wk, we can figure @@ -588,7 +588,7 @@ class TimeRE(dict): else: return '' regex = '|'.join(re.escape(stuff) for stuff in to_convert) - regex = f'(?P<{directive}>{regex})' + regex = f"(?P<{directive}>{regex})" return regex def pattern(self, format): diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 86a9d053730b8..36566b55e74ad 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -161,8 +161,7 @@ def round_nsint64(values, mode, freq): # if/elif above should catch all rounding modes defined in enum 'RoundTo': # if flow of control arrives here, it is a bug - raise ValueError("round_nsint64 called with an unrecognized " - "rounding mode") + raise ValueError("round_nsint64 called with an unrecognized rounding mode") # ---------------------------------------------------------------------- @@ -324,8 +323,10 @@ class Timestamp(_Timestamp): Function is not implemented. Use pd.to_datetime(). """ - raise NotImplementedError("Timestamp.strptime() is not implemented." - "Use to_datetime() to parse date strings.") + raise NotImplementedError( + "Timestamp.strptime() is not implemented. " + "Use to_datetime() to parse date strings." 
+ ) @classmethod def combine(cls, date, time): @@ -381,8 +382,9 @@ class Timestamp(_Timestamp): if tzinfo is not None: if not PyTZInfo_Check(tzinfo): # tzinfo must be a datetime.tzinfo object, GH#17690 - raise TypeError(f'tzinfo must be a datetime.tzinfo object, ' - f'not {type(tzinfo)}') + raise TypeError( + f"tzinfo must be a datetime.tzinfo object, not {type(tzinfo)}" + ) elif tz is not None: raise ValueError('Can provide at most one of tz, tzinfo') @@ -393,8 +395,10 @@ class Timestamp(_Timestamp): # User passed a date string to parse. # Check that the user didn't also pass a date attribute kwarg. if any(arg is not None for arg in _date_attributes): - raise ValueError('Cannot pass a date attribute keyword ' - 'argument when passing a date string') + raise ValueError( + "Cannot pass a date attribute keyword " + "argument when passing a date string" + ) elif ts_input is _no_input: # User passed keyword arguments. @@ -578,8 +582,10 @@ timedelta}, default 'raise' @tz.setter def tz(self, value): # GH 3746: Prevent localizing or converting the index by setting tz - raise AttributeError("Cannot directly set timezone. Use tz_localize() " - "or tz_convert() as appropriate") + raise AttributeError( + "Cannot directly set timezone. " + "Use tz_localize() or tz_convert() as appropriate" + ) def __setstate__(self, state): self.value = state[0] @@ -598,9 +604,10 @@ timedelta}, default 'raise' if self.tz is not None: # GH#21333 - warnings.warn("Converting to Period representation will " - "drop timezone information.", - UserWarning) + warnings.warn( + "Converting to Period representation will drop timezone information.", + UserWarning, + ) if freq is None: freq = self.freq @@ -810,13 +817,13 @@ default 'raise' if ambiguous == 'infer': raise ValueError('Cannot infer offset with only one time.') - nonexistent_options = ('raise', 'NaT', 'shift_forward', - 'shift_backward') + nonexistent_options = ('raise', 'NaT', 'shift_forward', 'shift_backward') if nonexistent not in nonexistent_options and not isinstance( nonexistent, timedelta): - raise ValueError("The nonexistent argument must be one of 'raise'," - " 'NaT', 'shift_forward', 'shift_backward' or" - " a timedelta object") + raise ValueError( + "The nonexistent argument must be one of 'raise', " + "'NaT', 'shift_forward', 'shift_backward' or a timedelta object" + ) if self.tzinfo is None: # tz naive, localize @@ -833,8 +840,9 @@ default 'raise' value = tz_convert_single(self.value, UTC, self.tz) return Timestamp(value, tz=tz, freq=self.freq) else: - raise TypeError('Cannot localize tz-aware Timestamp, use ' - 'tz_convert for conversions') + raise TypeError( + "Cannot localize tz-aware Timestamp, use tz_convert for conversions" + ) def tz_convert(self, tz): """ @@ -857,17 +865,28 @@ default 'raise' """ if self.tzinfo is None: # tz naive, use tz_localize - raise TypeError('Cannot convert tz-naive Timestamp, use ' - 'tz_localize to localize') + raise TypeError( + "Cannot convert tz-naive Timestamp, use tz_localize to localize" + ) else: # Same UTC timestamp, different time zone return Timestamp(self.value, tz=tz, freq=self.freq) astimezone = tz_convert - def replace(self, year=None, month=None, day=None, - hour=None, minute=None, second=None, microsecond=None, - nanosecond=None, tzinfo=object, fold=0): + def replace( + self, + year=None, + month=None, + day=None, + hour=None, + minute=None, + second=None, + microsecond=None, + nanosecond=None, + tzinfo=object, + fold=0, + ): """ implements datetime.replace, handles nanoseconds. 
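The reflowed Timestamp.replace signature above keeps its nanosecond-aware behaviour; a quick illustrative call (the values are arbitrary, and the output shown is the usual Timestamp repr):

>>> import pandas as pd
>>> ts = pd.Timestamp("2020-01-01 00:00:00.000000001")
>>> ts.replace(nanosecond=5)
Timestamp('2020-01-01 00:00:00.000000005')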
@@ -910,8 +929,9 @@ default 'raise' def validate(k, v): """ validate integers """ if not is_integer_object(v): - raise ValueError(f"value must be an integer, received " - f"{type(v)} for {k}") + raise ValueError( + f"value must be an integer, received {type(v)} for {k}" + ) return v if year is not None: diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 0348843abc129..f675818599b2c 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -56,8 +56,9 @@ cdef: cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b -cdef inline bint is_monotonic_start_end_bounds(ndarray[int64_t, ndim=1] start, - ndarray[int64_t, ndim=1] end): +cdef inline bint is_monotonic_start_end_bounds( + ndarray[int64_t, ndim=1] start, ndarray[int64_t, ndim=1] end +): return is_monotonic(start, False)[0] and is_monotonic(end, False)[0] # Cython implementations of rolling sum, mean, variance, skewness, @@ -90,8 +91,12 @@ cdef inline bint is_monotonic_start_end_bounds(ndarray[int64_t, ndim=1] start, # this is only an impl for index not None, IOW, freq aware -def roll_count(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, - int64_t minp): +def roll_count( + ndarray[float64_t] values, + ndarray[int64_t] start, + ndarray[int64_t] end, + int64_t minp, +): cdef: float64_t val, count_x = 0.0 int64_t s, e, nobs, N = len(values) @@ -1871,8 +1876,7 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y, bint is_observation if len(input_y) != N: - raise ValueError(f"arrays are of different lengths " - f"({N} and {len(input_y)})") + raise ValueError(f"arrays are of different lengths ({N} and {len(input_y)})") output = np.empty(N, dtype=float) if N == 0: diff --git a/pandas/_testing.py b/pandas/_testing.py new file mode 100644 index 0000000000000..631d550c60534 --- /dev/null +++ b/pandas/_testing.py @@ -0,0 +1,2759 @@ +import bz2 +from collections import Counter +from contextlib import contextmanager +from datetime import datetime +from functools import wraps +import gzip +import os +from shutil import rmtree +import string +import tempfile +from typing import Any, List, Optional, Union, cast +import warnings +import zipfile + +import numpy as np +from numpy.random import rand, randn + +from pandas._config.localization import ( # noqa:F401 + can_set_locale, + get_locales, + set_locale, +) + +import pandas._libs.testing as _testing +from pandas._typing import FilePathOrBuffer, FrameOrSeries +from pandas.compat import _get_lzma_file, _import_lzma + +from pandas.core.dtypes.common import ( + is_bool, + is_categorical_dtype, + is_datetime64_dtype, + is_datetime64tz_dtype, + is_extension_array_dtype, + is_interval_dtype, + is_list_like, + is_number, + is_period_dtype, + is_sequence, + is_timedelta64_dtype, + needs_i8_conversion, +) +from pandas.core.dtypes.missing import array_equivalent + +import pandas as pd +from pandas import ( + Categorical, + CategoricalIndex, + DataFrame, + DatetimeIndex, + Index, + IntervalIndex, + MultiIndex, + RangeIndex, + Series, + bdate_range, +) +from pandas.core.algorithms import take_1d +from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + IntervalArray, + PeriodArray, + TimedeltaArray, + period_array, +) + +from pandas.io.common import urlopen +from pandas.io.formats.printing import pprint_thing + +lzma = _import_lzma() + +N = 30 +K = 4 +_RAISE_NETWORK_ERROR_DEFAULT = False + +# set testing_mode 
+_testing_mode_warnings = (DeprecationWarning, ResourceWarning) + + +def set_testing_mode(): + # set the testing mode filters + testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") + if "deprecate" in testing_mode: + warnings.simplefilter("always", _testing_mode_warnings) + + +def reset_testing_mode(): + # reset the testing mode filters + testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") + if "deprecate" in testing_mode: + warnings.simplefilter("ignore", _testing_mode_warnings) + + +set_testing_mode() + + +def reset_display_options(): + """ + Reset the display options for printing and representing objects. + """ + pd.reset_option("^display.", silent=True) + + +def round_trip_pickle( + obj: Any, path: Optional[FilePathOrBuffer] = None +) -> FrameOrSeries: + """ + Pickle an object and then read it again. + + Parameters + ---------- + obj : any object + The object to pickle and then re-read. + path : str, path object or file-like object, default None + The path where the pickled object is written and then read. + + Returns + ------- + pandas object + The original object that was pickled and then re-read. + """ + _path = path + if _path is None: + _path = f"__{rands(10)}__.pickle" + with ensure_clean(_path) as temp_path: + pd.to_pickle(obj, temp_path) + return pd.read_pickle(temp_path) + + +def round_trip_pathlib(writer, reader, path: Optional[str] = None): + """ + Write an object to file specified by a pathlib.Path and read it back + + Parameters + ---------- + writer : callable bound to pandas object + IO writing function (e.g. DataFrame.to_csv ) + reader : callable + IO reading function (e.g. pd.read_csv ) + path : str, default None + The path where the object is written and then read. + + Returns + ------- + pandas object + The original object that was serialized and then re-read. + """ + import pytest + + Path = pytest.importorskip("pathlib").Path + if path is None: + path = "___pathlib___" + with ensure_clean(path) as path: + writer(Path(path)) + obj = reader(Path(path)) + return obj + + +def round_trip_localpath(writer, reader, path: Optional[str] = None): + """ + Write an object to file specified by a py.path LocalPath and read it back. + + Parameters + ---------- + writer : callable bound to pandas object + IO writing function (e.g. DataFrame.to_csv ) + reader : callable + IO reading function (e.g. pd.read_csv ) + path : str, default None + The path where the object is written and then read. + + Returns + ------- + pandas object + The original object that was serialized and then re-read. + """ + import pytest + + LocalPath = pytest.importorskip("py.path").local + if path is None: + path = "___localpath___" + with ensure_clean(path) as path: + writer(LocalPath(path)) + obj = reader(LocalPath(path)) + return obj + + +@contextmanager +def decompress_file(path, compression): + """ + Open a compressed file and return a file object. + + Parameters + ---------- + path : str + The path where the file is read from. 
+ + compression : {'gzip', 'bz2', 'zip', 'xz', None} + Name of the decompression to use + + Returns + ------- + file object + """ + if compression is None: + f = open(path, "rb") + elif compression == "gzip": + f = gzip.open(path, "rb") + elif compression == "bz2": + f = bz2.BZ2File(path, "rb") + elif compression == "xz": + f = _get_lzma_file(lzma)(path, "rb") + elif compression == "zip": + zip_file = zipfile.ZipFile(path) + zip_names = zip_file.namelist() + if len(zip_names) == 1: + f = zip_file.open(zip_names.pop()) + else: + raise ValueError(f"ZIP file {path} error. Only one file per ZIP.") + else: + raise ValueError(f"Unrecognized compression type: {compression}") + + try: + yield f + finally: + f.close() + if compression == "zip": + zip_file.close() + + +def write_to_compressed(compression, path, data, dest="test"): + """ + Write data to a compressed file. + + Parameters + ---------- + compression : {'gzip', 'bz2', 'zip', 'xz'} + The compression type to use. + path : str + The file path to write the data. + data : str + The data to write. + dest : str, default "test" + The destination file (for ZIP only) + + Raises + ------ + ValueError : An invalid compression value was passed in. + """ + if compression == "zip": + import zipfile + + compress_method = zipfile.ZipFile + elif compression == "gzip": + import gzip + + compress_method = gzip.GzipFile + elif compression == "bz2": + import bz2 + + compress_method = bz2.BZ2File + elif compression == "xz": + compress_method = _get_lzma_file(lzma) + else: + raise ValueError(f"Unrecognized compression type: {compression}") + + if compression == "zip": + mode = "w" + args = (dest, data) + method = "writestr" + else: + mode = "wb" + args = (data,) + method = "write" + + with compress_method(path, mode=mode) as f: + getattr(f, method)(*args) + + +def assert_almost_equal( + left, + right, + check_dtype: Union[bool, str] = "equiv", + check_less_precise: Union[bool, int] = False, + **kwargs, +): + """ + Check that the left and right objects are approximately equal. + + By approximately equal, we refer to objects that are numbers or that + contain numbers which may be equivalent to specific levels of precision. + + Parameters + ---------- + left : object + right : object + check_dtype : bool or {'equiv'}, default 'equiv' + Check dtype if both a and b are the same type. If 'equiv' is passed in, + then `RangeIndex` and `Int64Index` are also considered equivalent + when doing type checking. + check_less_precise : bool or int, default False + Specify comparison precision. 5 digits (False) or 3 digits (True) + after decimal points are compared. If int, then specify the number + of digits to compare. + + When comparing two numbers, if the first number has magnitude less + than 1e-5, we compare the two numbers directly and check whether + they are equivalent within the specified precision. Otherwise, we + compare the **ratio** of the second number to the first number and + check whether it is equivalent to 1 within the specified precision. 
+ """ + if isinstance(left, pd.Index): + assert_index_equal( + left, + right, + check_exact=False, + exact=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + elif isinstance(left, pd.Series): + assert_series_equal( + left, + right, + check_exact=False, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + elif isinstance(left, pd.DataFrame): + assert_frame_equal( + left, + right, + check_exact=False, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + else: + # Other sequences. + if check_dtype: + if is_number(left) and is_number(right): + # Do not compare numeric classes, like np.float64 and float. + pass + elif is_bool(left) and is_bool(right): + # Do not compare bool classes, like np.bool_ and bool. + pass + else: + if isinstance(left, np.ndarray) or isinstance(right, np.ndarray): + obj = "numpy array" + else: + obj = "Input" + assert_class_equal(left, right, obj=obj) + _testing.assert_almost_equal( + left, + right, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + **kwargs, + ) + + +def _check_isinstance(left, right, cls): + """ + Helper method for our assert_* methods that ensures that + the two objects being compared have the right type before + proceeding with the comparison. + + Parameters + ---------- + left : The first object being compared. + right : The second object being compared. + cls : The class type to check against. + + Raises + ------ + AssertionError : Either `left` or `right` is not an instance of `cls`. + """ + cls_name = cls.__name__ + + if not isinstance(left, cls): + raise AssertionError( + f"{cls_name} Expected type {cls}, found {type(left)} instead" + ) + if not isinstance(right, cls): + raise AssertionError( + f"{cls_name} Expected type {cls}, found {type(right)} instead" + ) + + +def assert_dict_equal(left, right, compare_keys: bool = True): + + _check_isinstance(left, right, dict) + _testing.assert_dict_equal(left, right, compare_keys=compare_keys) + + +def randbool(size=(), p: float = 0.5): + return rand(*size) <= p + + +RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) +RANDU_CHARS = np.array( + list("".join(map(chr, range(1488, 1488 + 26))) + string.digits), + dtype=(np.unicode_, 1), +) + + +def rands_array(nchars, size, dtype="O"): + """ + Generate an array of byte strings. + """ + retval = ( + np.random.choice(RANDS_CHARS, size=nchars * np.prod(size)) + .view((np.str_, nchars)) + .reshape(size) + ) + if dtype is None: + return retval + else: + return retval.astype(dtype) + + +def randu_array(nchars, size, dtype="O"): + """ + Generate an array of unicode strings. + """ + retval = ( + np.random.choice(RANDU_CHARS, size=nchars * np.prod(size)) + .view((np.unicode_, nchars)) + .reshape(size) + ) + if dtype is None: + return retval + else: + return retval.astype(dtype) + + +def rands(nchars): + """ + Generate one random byte string. + + See `rands_array` if you want to create an array of random strings. + + """ + return "".join(np.random.choice(RANDS_CHARS, nchars)) + + +def randu(nchars): + """ + Generate one random unicode string. + + See `randu_array` if you want to create an array of random unicode strings. 
+ + """ + return "".join(np.random.choice(RANDU_CHARS, nchars)) + + +def close(fignum=None): + from matplotlib.pyplot import get_fignums, close as _close + + if fignum is None: + for fignum in get_fignums(): + _close(fignum) + else: + _close(fignum) + + +# ----------------------------------------------------------------------------- +# contextmanager to ensure the file cleanup + + +@contextmanager +def ensure_clean(filename=None, return_filelike=False, **kwargs): + """ + Gets a temporary path and agrees to remove on close. + + Parameters + ---------- + filename : str (optional) + if None, creates a temporary file which is then removed when out of + scope. if passed, creates temporary file with filename as ending. + return_filelike : bool (default False) + if True, returns a file-like which is *always* cleaned. Necessary for + savefig and other functions which want to append extensions. + **kwargs + Additional keywords passed in for creating a temporary file. + :meth:`tempFile.TemporaryFile` is used when `return_filelike` is ``True``. + :meth:`tempfile.mkstemp` is used when `return_filelike` is ``False``. + Note that the `filename` parameter will be passed in as the `suffix` + argument to either function. + + See Also + -------- + tempfile.TemporaryFile + tempfile.mkstemp + """ + filename = filename or "" + fd = None + + kwargs["suffix"] = filename + + if return_filelike: + f = tempfile.TemporaryFile(**kwargs) + + try: + yield f + finally: + f.close() + else: + # Don't generate tempfile if using a path with directory specified. + if len(os.path.dirname(filename)): + raise ValueError("Can't pass a qualified name to ensure_clean()") + + try: + fd, filename = tempfile.mkstemp(**kwargs) + except UnicodeEncodeError: + import pytest + + pytest.skip("no unicode file names on this system") + + try: + yield filename + finally: + try: + os.close(fd) + except OSError: + print(f"Couldn't close file descriptor: {fd} (file: {filename})") + try: + if os.path.exists(filename): + os.remove(filename) + except OSError as e: + print(f"Exception on removing file: {e}") + + +@contextmanager +def ensure_clean_dir(): + """ + Get a temporary directory path and agrees to remove on close. + + Yields + ------ + Temporary directory path + """ + directory_name = tempfile.mkdtemp(suffix="") + try: + yield directory_name + finally: + try: + rmtree(directory_name) + except OSError: + pass + + +@contextmanager +def ensure_safe_environment_variables(): + """ + Get a context manager to safely set environment variables + + All changes will be undone on close, hence environment variables set + within this contextmanager will neither persist nor change global state. + """ + saved_environ = dict(os.environ) + try: + yield + finally: + os.environ.clear() + os.environ.update(saved_environ) + + +# ----------------------------------------------------------------------------- +# Comparators + + +def equalContents(arr1, arr2) -> bool: + """ + Checks if the set of unique elements of arr1 and arr2 are equivalent. + """ + return frozenset(arr1) == frozenset(arr2) + + +def assert_index_equal( + left: Index, + right: Index, + exact: Union[bool, str] = "equiv", + check_names: bool = True, + check_less_precise: Union[bool, int] = False, + check_exact: bool = True, + check_categorical: bool = True, + obj: str = "Index", +) -> None: + """ + Check that left and right Index are equal. 
+ + Parameters + ---------- + left : Index + right : Index + exact : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. If 'equiv', then RangeIndex can be substituted for + Int64Index as well. + check_names : bool, default True + Whether to check the names attribute. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + check_exact : bool, default True + Whether to compare number exactly. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + obj : str, default 'Index' + Specify object name being compared, internally used to show appropriate + assertion message. + """ + __tracebackhide__ = True + + def _check_types(l, r, obj="Index"): + if exact: + assert_class_equal(l, r, exact=exact, obj=obj) + + # Skip exact dtype checking when `check_categorical` is False + if check_categorical: + assert_attr_equal("dtype", l, r, obj=obj) + + # allow string-like to have different inferred_types + if l.inferred_type in ("string"): + assert r.inferred_type in ("string") + else: + assert_attr_equal("inferred_type", l, r, obj=obj) + + def _get_ilevel_values(index, level): + # accept level number only + unique = index.levels[level] + level_codes = index.codes[level] + filled = take_1d(unique._values, level_codes, fill_value=unique._na_value) + values = unique._shallow_copy(filled, name=index.names[level]) + return values + + # instance validation + _check_isinstance(left, right, Index) + + # class / dtype comparison + _check_types(left, right, obj=obj) + + # level comparison + if left.nlevels != right.nlevels: + msg1 = f"{obj} levels are different" + msg2 = f"{left.nlevels}, {left}" + msg3 = f"{right.nlevels}, {right}" + raise_assert_detail(obj, msg1, msg2, msg3) + + # length comparison + if len(left) != len(right): + msg1 = f"{obj} length are different" + msg2 = f"{len(left)}, {left}" + msg3 = f"{len(right)}, {right}" + raise_assert_detail(obj, msg1, msg2, msg3) + + # MultiIndex special comparison for little-friendly error messages + if left.nlevels > 1: + left = cast(MultiIndex, left) + right = cast(MultiIndex, right) + + for level in range(left.nlevels): + # cannot use get_level_values here because it can change dtype + llevel = _get_ilevel_values(left, level) + rlevel = _get_ilevel_values(right, level) + + lobj = f"MultiIndex level [{level}]" + assert_index_equal( + llevel, + rlevel, + exact=exact, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + obj=lobj, + ) + # get_level_values may change dtype + _check_types(left.levels[level], right.levels[level], obj=obj) + + # skip exact index checking when `check_categorical` is False + if check_exact and check_categorical: + if not left.equals(right): + diff = np.sum((left.values != right.values).astype(int)) * 100.0 / len(left) + msg = f"{obj} values are different ({np.round(diff, 5)} %)" + raise_assert_detail(obj, msg, left, right) + else: + _testing.assert_almost_equal( + left.values, + right.values, + check_less_precise=check_less_precise, + check_dtype=exact, + obj=obj, + lobj=left, + robj=right, + ) + + # metadata comparison + if check_names: + assert_attr_equal("names", left, right, obj=obj) + if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex): + assert_attr_equal("freq", left, right, obj=obj) + if 
isinstance(left, pd.IntervalIndex) or isinstance(right, pd.IntervalIndex): + assert_interval_array_equal(left.values, right.values) + + if check_categorical: + if is_categorical_dtype(left) or is_categorical_dtype(right): + assert_categorical_equal(left.values, right.values, obj=f"{obj} category") + + +def assert_class_equal(left, right, exact: Union[bool, str] = True, obj="Input"): + """ + Checks classes are equal. + """ + __tracebackhide__ = True + + def repr_class(x): + if isinstance(x, Index): + # return Index as it is to include values in the error message + return x + + try: + return type(x).__name__ + except AttributeError: + return repr(type(x)) + + if exact == "equiv": + if type(left) != type(right): + # allow equivalence of Int64Index/RangeIndex + types = {type(left).__name__, type(right).__name__} + if len(types - {"Int64Index", "RangeIndex"}): + msg = f"{obj} classes are not equivalent" + raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) + elif exact: + if type(left) != type(right): + msg = f"{obj} classes are different" + raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) + + +def assert_attr_equal(attr, left, right, obj="Attributes"): + """checks attributes are equal. Both objects must have the attribute. + + Parameters + ---------- + attr : str + Attribute name being compared. + left : object + right : object + obj : str, default 'Attributes' + Specify object name being compared, internally used to show appropriate + assertion message + """ + __tracebackhide__ = True + + left_attr = getattr(left, attr) + right_attr = getattr(right, attr) + + if left_attr is right_attr: + return True + elif ( + is_number(left_attr) + and np.isnan(left_attr) + and is_number(right_attr) + and np.isnan(right_attr) + ): + # np.nan + return True + + try: + result = left_attr == right_attr + except TypeError: + # datetimetz on rhs may raise TypeError + result = False + if not isinstance(result, bool): + result = result.all() + + if result: + return True + else: + msg = f'Attribute "{attr}" are different' + raise_assert_detail(obj, msg, left_attr, right_attr) + + +def assert_is_valid_plot_return_object(objs): + import matplotlib.pyplot as plt + + if isinstance(objs, (pd.Series, np.ndarray)): + for el in objs.ravel(): + msg = ( + "one of 'objs' is not a matplotlib Axes instance, " + f"type encountered {repr(type(el).__name__)}" + ) + assert isinstance(el, (plt.Axes, dict)), msg + else: + msg = ( + "objs is neither an ndarray of Artist instances nor a single " + "Artist instance, tuple, or dict, 'objs' is a " + f"{repr(type(objs).__name__)}" + ) + assert isinstance(objs, (plt.Artist, tuple, dict)), msg + + +def isiterable(obj): + return hasattr(obj, "__iter__") + + +def assert_is_sorted(seq): + """Assert that the sequence is sorted.""" + if isinstance(seq, (Index, Series)): + seq = seq.values + # sorting does not change precisions + assert_numpy_array_equal(seq, np.sort(np.array(seq))) + + +def assert_categorical_equal( + left, right, check_dtype=True, check_category_order=True, obj="Categorical" +): + """Test that Categoricals are equivalent. + + Parameters + ---------- + left : Categorical + right : Categorical + check_dtype : bool, default True + Check that the integer dtype of the codes is the same + check_category_order : bool, default True + Whether the order of the categories should be compared, which + implies identical integer codes. If False, only the resulting + values are compared. The ordered attribute is + checked regardless.
+ obj : str, default 'Categorical' + Specify object name being compared, internally used to show appropriate + assertion message + """ + _check_isinstance(left, right, Categorical) + + if check_category_order: + assert_index_equal(left.categories, right.categories, obj=f"{obj}.categories") + assert_numpy_array_equal( + left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes", + ) + else: + assert_index_equal( + left.categories.sort_values(), + right.categories.sort_values(), + obj=f"{obj}.categories", + ) + assert_index_equal( + left.categories.take(left.codes), + right.categories.take(right.codes), + obj=f"{obj}.values", + ) + + assert_attr_equal("ordered", left, right, obj=obj) + + +def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray"): + """Test that two IntervalArrays are equivalent. + + Parameters + ---------- + left, right : IntervalArray + The IntervalArrays to compare. + exact : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. If 'equiv', then RangeIndex can be substituted for + Int64Index as well. + obj : str, default 'IntervalArray' + Specify object name being compared, internally used to show appropriate + assertion message + """ + _check_isinstance(left, right, IntervalArray) + + assert_index_equal(left.left, right.left, exact=exact, obj=f"{obj}.left") + assert_index_equal(left.right, right.right, exact=exact, obj=f"{obj}.right") + assert_attr_equal("closed", left, right, obj=obj) + + +def assert_period_array_equal(left, right, obj="PeriodArray"): + _check_isinstance(left, right, PeriodArray) + + assert_numpy_array_equal(left._data, right._data, obj=f"{obj}.values") + assert_attr_equal("freq", left, right, obj=obj) + + +def assert_datetime_array_equal(left, right, obj="DatetimeArray"): + __tracebackhide__ = True + _check_isinstance(left, right, DatetimeArray) + + assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") + assert_attr_equal("freq", left, right, obj=obj) + assert_attr_equal("tz", left, right, obj=obj) + + +def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"): + __tracebackhide__ = True + _check_isinstance(left, right, TimedeltaArray) + assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") + assert_attr_equal("freq", left, right, obj=obj) + + +def raise_assert_detail(obj, message, left, right, diff=None): + __tracebackhide__ = True + + if isinstance(left, np.ndarray): + left = pprint_thing(left) + elif is_categorical_dtype(left): + left = repr(left) + + if isinstance(right, np.ndarray): + right = pprint_thing(right) + elif is_categorical_dtype(right): + right = repr(right) + + msg = f"""{obj} are different + +{message} +[left]: {left} +[right]: {right}""" + + if diff is not None: + msg += f"\n[diff]: {diff}" + + raise AssertionError(msg) + + +def assert_numpy_array_equal( + left, + right, + strict_nan=False, + check_dtype=True, + err_msg=None, + check_same=None, + obj="numpy array", +): + """ + Check that 'np.ndarray' is equivalent. + + Parameters + ---------- + left, right : numpy.ndarray or iterable + The two arrays to be compared. + strict_nan : bool, default False + If True, consider NaN and None to be different. + check_dtype : bool, default True + Check dtype if both a and b are np.ndarray. + err_msg : str, default None + If provided, used as assertion message. + check_same : None|'copy'|'same', default None + Ensure left and right refer/do not refer to the same memory area.
+ obj : str, default 'numpy array' + Specify object name being compared, internally used to show appropriate + assertion message. + """ + __tracebackhide__ = True + + # instance validation + # Show a detailed error message when classes are different + assert_class_equal(left, right, obj=obj) + # both classes must be an np.ndarray + _check_isinstance(left, right, np.ndarray) + + def _get_base(obj): + return obj.base if getattr(obj, "base", None) is not None else obj + + left_base = _get_base(left) + right_base = _get_base(right) + + if check_same == "same": + if left_base is not right_base: + raise AssertionError(f"{repr(left_base)} is not {repr(right_base)}") + elif check_same == "copy": + if left_base is right_base: + raise AssertionError(f"{repr(left_base)} is {repr(right_base)}") + + def _raise(left, right, err_msg): + if err_msg is None: + if left.shape != right.shape: + raise_assert_detail( + obj, f"{obj} shapes are different", left.shape, right.shape, + ) + + diff = 0 + for l, r in zip(left, right): + # count up differences + if not array_equivalent(l, r, strict_nan=strict_nan): + diff += 1 + + diff = diff * 100.0 / left.size + msg = f"{obj} values are different ({np.round(diff, 5)} %)" + raise_assert_detail(obj, msg, left, right) + + raise AssertionError(err_msg) + + # compare shape and values + if not array_equivalent(left, right, strict_nan=strict_nan): + _raise(left, right, err_msg) + + if check_dtype: + if isinstance(left, np.ndarray) and isinstance(right, np.ndarray): + assert_attr_equal("dtype", left, right, obj=obj) + + +def assert_extension_array_equal( + left, right, check_dtype=True, check_less_precise=False, check_exact=False +): + """Check that left and right ExtensionArrays are equal. + + Parameters + ---------- + left, right : ExtensionArray + The two arrays to compare + check_dtype : bool, default True + Whether to check if the ExtensionArray dtypes are identical. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + check_exact : bool, default False + Whether to compare number exactly. + + Notes + ----- + Missing values are checked separately from valid values. + A mask of missing values is computed for each and checked to match. + The remaining all-valid values are cast to object dtype and checked. 
+ """ + assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" + assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" + if check_dtype: + assert_attr_equal("dtype", left, right, obj="ExtensionArray") + + if hasattr(left, "asi8") and type(right) == type(left): + # Avoid slow object-dtype comparisons + assert_numpy_array_equal(left.asi8, right.asi8) + return + + left_na = np.asarray(left.isna()) + right_na = np.asarray(right.isna()) + assert_numpy_array_equal(left_na, right_na, obj="ExtensionArray NA mask") + + left_valid = np.asarray(left[~left_na].astype(object)) + right_valid = np.asarray(right[~right_na].astype(object)) + if check_exact: + assert_numpy_array_equal(left_valid, right_valid, obj="ExtensionArray") + else: + _testing.assert_almost_equal( + left_valid, + right_valid, + check_dtype=check_dtype, + check_less_precise=check_less_precise, + obj="ExtensionArray", + ) + + +# This could be refactored to use the NDFrame.equals method +def assert_series_equal( + left, + right, + check_dtype=True, + check_index_type="equiv", + check_series_type=True, + check_less_precise=False, + check_names=True, + check_exact=False, + check_datetimelike_compat=False, + check_categorical=True, + obj="Series", +): + """ + Check that left and right Series are equal. + + Parameters + ---------- + left : Series + right : Series + check_dtype : bool, default True + Whether to check the Series dtype is identical. + check_index_type : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. + check_series_type : bool, default True + Whether to check the Series class is identical. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + + When comparing two numbers, if the first number has magnitude less + than 1e-5, we compare the two numbers directly and check whether + they are equivalent within the specified precision. Otherwise, we + compare the **ratio** of the second number to the first number and + check whether it is equivalent to 1 within the specified precision. + check_names : bool, default True + Whether to check the Series and Index names attribute. + check_exact : bool, default False + Whether to compare number exactly. + check_datetimelike_compat : bool, default False + Compare datetime-like which is comparable ignoring dtype. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + obj : str, default 'Series' + Specify object name being compared, internally used to show appropriate + assertion message. + """ + __tracebackhide__ = True + + # instance validation + _check_isinstance(left, right, Series) + + if check_series_type: + # ToDo: There are some tests using rhs is sparse + # lhs is dense. 
Should use assert_class_equal in future + assert isinstance(left, type(right)) + # assert_class_equal(left, right, obj=obj) + + # length comparison + if len(left) != len(right): + msg1 = f"{len(left)}, {left.index}" + msg2 = f"{len(right)}, {right.index}" + raise_assert_detail(obj, "Series length are different", msg1, msg2) + + # index comparison + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.index", + ) + + if check_dtype: + # We want to skip exact dtype checking when `check_categorical` + # is False. We'll still raise if only one is a `Categorical`, + # regardless of `check_categorical` + if ( + is_categorical_dtype(left) + and is_categorical_dtype(right) + and not check_categorical + ): + pass + else: + assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") + + if check_exact: + assert_numpy_array_equal( + left._internal_get_values(), + right._internal_get_values(), + check_dtype=check_dtype, + obj=str(obj), + ) + elif check_datetimelike_compat: + # we want to check only if we have compat dtypes + # e.g. integer and M|m are NOT compat, but we can simply check + # the values in that case + if needs_i8_conversion(left) or needs_i8_conversion(right): + + # datetimelike may have different objects (e.g. datetime.datetime + # vs Timestamp) but will compare equal + if not Index(left.values).equals(Index(right.values)): + msg = ( + f"[datetimelike_compat=True] {left.values} " + f"is not equal to {right.values}." + ) + raise AssertionError(msg) + else: + assert_numpy_array_equal( + left._internal_get_values(), + right._internal_get_values(), + check_dtype=check_dtype, + ) + elif is_interval_dtype(left) or is_interval_dtype(right): + assert_interval_array_equal(left.array, right.array) + elif is_extension_array_dtype(left.dtype) and is_datetime64tz_dtype(left.dtype): + # .values is an ndarray, but ._values is the ExtensionArray. + # TODO: Use .array + assert is_extension_array_dtype(right.dtype) + assert_extension_array_equal(left._values, right._values) + elif ( + is_extension_array_dtype(left) + and not is_categorical_dtype(left) + and is_extension_array_dtype(right) + and not is_categorical_dtype(right) + ): + assert_extension_array_equal(left.array, right.array) + else: + _testing.assert_almost_equal( + left._internal_get_values(), + right._internal_get_values(), + check_less_precise=check_less_precise, + check_dtype=check_dtype, + obj=str(obj), + ) + + # metadata comparison + if check_names: + assert_attr_equal("name", left, right, obj=obj) + + if check_categorical: + if is_categorical_dtype(left) or is_categorical_dtype(right): + assert_categorical_equal(left.values, right.values, obj=f"{obj} category") + + +# This could be refactored to use the NDFrame.equals method +def assert_frame_equal( + left, + right, + check_dtype=True, + check_index_type="equiv", + check_column_type="equiv", + check_frame_type=True, + check_less_precise=False, + check_names=True, + by_blocks=False, + check_exact=False, + check_datetimelike_compat=False, + check_categorical=True, + check_like=False, + obj="DataFrame", +): + """ + Check that left and right DataFrame are equal. + + This function is intended to compare two DataFrames and output any + differences. It is mostly intended for use in unit tests. + Additional parameters allow varying the strictness of the + equality checks performed.
+ + Parameters + ---------- + left : DataFrame + First DataFrame to compare. + right : DataFrame + Second DataFrame to compare. + check_dtype : bool, default True + Whether to check the DataFrame dtype is identical. + check_index_type : bool or {'equiv'}, default 'equiv' + Whether to check the Index class, dtype and inferred_type + are identical. + check_column_type : bool or {'equiv'}, default 'equiv' + Whether to check the columns class, dtype and inferred_type + are identical. Is passed as the ``exact`` argument of + :func:`assert_index_equal`. + check_frame_type : bool, default True + Whether to check the DataFrame class is identical. + check_less_precise : bool or int, default False + Specify comparison precision. Only used when check_exact is False. + 5 digits (False) or 3 digits (True) after decimal points are compared. + If int, then specify the digits to compare. + + When comparing two numbers, if the first number has magnitude less + than 1e-5, we compare the two numbers directly and check whether + they are equivalent within the specified precision. Otherwise, we + compare the **ratio** of the second number to the first number and + check whether it is equivalent to 1 within the specified precision. + check_names : bool, default True + Whether to check that the `names` attribute for both the `index` + and `column` attributes of the DataFrame is identical. + by_blocks : bool, default False + Specify how to compare internal data. If False, compare by columns. + If True, compare by blocks. + check_exact : bool, default False + Whether to compare number exactly. + check_datetimelike_compat : bool, default False + Compare datetime-like which is comparable ignoring dtype. + check_categorical : bool, default True + Whether to compare internal Categorical exactly. + check_like : bool, default False + If True, ignore the order of index & columns. + Note: index labels must match their respective rows + (same as in columns) - same labels must be with the same data. + obj : str, default 'DataFrame' + Specify object name being compared, internally used to show appropriate + assertion message. + + See Also + -------- + assert_series_equal : Equivalent method for asserting Series equality. + DataFrame.equals : Check DataFrame equality. + + Examples + -------- + This example shows comparing two DataFrames that are equal + but with columns of differing dtypes. + + >>> from pandas._testing import assert_frame_equal + >>> df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) + >>> df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]}) + + df1 equals itself. + + >>> assert_frame_equal(df1, df1) + + df1 differs from df2 as column 'b' is of a different type. + + >>> assert_frame_equal(df1, df2) + Traceback (most recent call last): + ... + AssertionError: Attributes of DataFrame.iloc[:, 1] (column name="b") are different + + Attribute "dtype" are different + [left]: int64 + [right]: float64 + + Ignore differing dtypes in columns with check_dtype. 
+ + >>> assert_frame_equal(df1, df2, check_dtype=False) + """ + __tracebackhide__ = True + + # instance validation + _check_isinstance(left, right, DataFrame) + + if check_frame_type: + assert isinstance(left, type(right)) + # assert_class_equal(left, right, obj=obj) + + # shape comparison + if left.shape != right.shape: + raise_assert_detail( + obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}", + ) + + if check_like: + left, right = left.reindex_like(right), right + + # index comparison + assert_index_equal( + left.index, + right.index, + exact=check_index_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.index", + ) + + # column comparison + assert_index_equal( + left.columns, + right.columns, + exact=check_column_type, + check_names=check_names, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_categorical=check_categorical, + obj=f"{obj}.columns", + ) + + # compare by blocks + if by_blocks: + rblocks = right._to_dict_of_blocks() + lblocks = left._to_dict_of_blocks() + for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))): + assert dtype in lblocks + assert dtype in rblocks + assert_frame_equal( + lblocks[dtype], rblocks[dtype], check_dtype=check_dtype, obj=obj + ) + + # compare by columns + else: + for i, col in enumerate(left.columns): + assert col in right + lcol = left.iloc[:, i] + rcol = right.iloc[:, i] + assert_series_equal( + lcol, + rcol, + check_dtype=check_dtype, + check_index_type=check_index_type, + check_less_precise=check_less_precise, + check_exact=check_exact, + check_names=check_names, + check_datetimelike_compat=check_datetimelike_compat, + check_categorical=check_categorical, + obj=f'{obj}.iloc[:, {i}] (column name="{col}")', + ) + + +def assert_equal(left, right, **kwargs): + """ + Wrapper for tm.assert_*_equal to dispatch to the appropriate test function. + + Parameters + ---------- + left, right : Index, Series, DataFrame, ExtensionArray, or np.ndarray + The two items to be compared. + **kwargs + All keyword arguments are passed through to the underlying assert method. + """ + __tracebackhide__ = True + + if isinstance(left, pd.Index): + assert_index_equal(left, right, **kwargs) + elif isinstance(left, pd.Series): + assert_series_equal(left, right, **kwargs) + elif isinstance(left, pd.DataFrame): + assert_frame_equal(left, right, **kwargs) + elif isinstance(left, IntervalArray): + assert_interval_array_equal(left, right, **kwargs) + elif isinstance(left, PeriodArray): + assert_period_array_equal(left, right, **kwargs) + elif isinstance(left, DatetimeArray): + assert_datetime_array_equal(left, right, **kwargs) + elif isinstance(left, TimedeltaArray): + assert_timedelta_array_equal(left, right, **kwargs) + elif isinstance(left, ExtensionArray): + assert_extension_array_equal(left, right, **kwargs) + elif isinstance(left, np.ndarray): + assert_numpy_array_equal(left, right, **kwargs) + elif isinstance(left, str): + assert kwargs == {} + assert left == right + else: + raise NotImplementedError(type(left)) + + +def box_expected(expected, box_cls, transpose=True): + """ + Helper function to wrap the expected output of a test in a given box_class. 
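+ + For example (illustrative), ``box_expected([1, 2, 3], pd.Series)`` returns + ``pd.Series([1, 2, 3])``, while ``box_expected([1, 2, 3], pd.DataFrame)`` wraps + the values in a one-row DataFrame when ``transpose=True`` (one-column otherwise).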
+ + Parameters + ---------- + expected : np.ndarray, Index, Series + box_cls : {Index, Series, DataFrame} + + Returns + ------- + subclass of box_cls + """ + if box_cls is pd.Index: + expected = pd.Index(expected) + elif box_cls is pd.Series: + expected = pd.Series(expected) + elif box_cls is pd.DataFrame: + expected = pd.Series(expected).to_frame() + if transpose: + # for vector operations, we need a DataFrame to be a single-row, + # not a single-column, in order to operate against non-DataFrame + # vectors of the same length. + expected = expected.T + elif box_cls is PeriodArray: + # the PeriodArray constructor is not as flexible as period_array + expected = period_array(expected) + elif box_cls is DatetimeArray: + expected = DatetimeArray(expected) + elif box_cls is TimedeltaArray: + expected = TimedeltaArray(expected) + elif box_cls is np.ndarray: + expected = np.array(expected) + elif box_cls is to_array: + expected = to_array(expected) + else: + raise NotImplementedError(box_cls) + return expected + + +def to_array(obj): + # temporary implementation until we get pd.array in place + if is_period_dtype(obj): + return period_array(obj) + elif is_datetime64_dtype(obj) or is_datetime64tz_dtype(obj): + return DatetimeArray._from_sequence(obj) + elif is_timedelta64_dtype(obj): + return TimedeltaArray._from_sequence(obj) + else: + return np.array(obj) + + +# ----------------------------------------------------------------------------- +# Sparse + + +def assert_sp_array_equal( + left, + right, + check_dtype=True, + check_kind=True, + check_fill_value=True, + consolidate_block_indices=False, +): + """Check that the left and right SparseArray are equal. + + Parameters + ---------- + left : SparseArray + right : SparseArray + check_dtype : bool, default True + Whether to check the data dtype is identical. + check_kind : bool, default True + Whether to just check the kind of the sparse index for each column. + check_fill_value : bool, default True + Whether to check that left.fill_value matches right.fill_value. + consolidate_block_indices : bool, default False + Whether to consolidate contiguous blocks for sparse arrays with + a BlockIndex. Some operations, e.g. concat, will end up with + block indices that could be consolidated. Setting this to True will + create a new BlockIndex for that array, with consolidated + block indices. + """ + + _check_isinstance(left, right, pd.arrays.SparseArray) + + assert_numpy_array_equal(left.sp_values, right.sp_values, check_dtype=check_dtype) + + # SparseIndex comparison + assert isinstance(left.sp_index, pd._libs.sparse.SparseIndex) + assert isinstance(right.sp_index, pd._libs.sparse.SparseIndex) + + if not check_kind: + left_index = left.sp_index.to_block_index() + right_index = right.sp_index.to_block_index() + else: + left_index = left.sp_index + right_index = right.sp_index + + if consolidate_block_indices and left.kind == "block": + # we'll probably remove this hack...
+ left_index = left_index.to_int_index().to_block_index() + right_index = right_index.to_int_index().to_block_index() + + if not left_index.equals(right_index): + raise_assert_detail( + "SparseArray.index", "index are not equal", left_index, right_index + ) + else: + # Just ensure a + pass + + if check_fill_value: + assert_attr_equal("fill_value", left, right) + if check_dtype: + assert_attr_equal("dtype", left, right) + assert_numpy_array_equal(left.to_dense(), right.to_dense(), check_dtype=check_dtype) + + +# ----------------------------------------------------------------------------- +# Others + + +def assert_contains_all(iterable, dic): + for k in iterable: + assert k in dic, f"Did not contain item: {repr(k)}" + + +def assert_copy(iter1, iter2, **eql_kwargs): + """ + iter1, iter2: iterables that produce elements + comparable with assert_almost_equal + + Checks that the elements are equal, but not + the same object. (Does not check that items + in sequences are also not the same object) + """ + for elem1, elem2 in zip(iter1, iter2): + assert_almost_equal(elem1, elem2, **eql_kwargs) + msg = ( + f"Expected object {repr(type(elem1))} and object {repr(type(elem2))} to be " + "different objects, but they were the same object." + ) + assert elem1 is not elem2, msg + + +def getCols(k): + return string.ascii_uppercase[:k] + + +# make index +def makeStringIndex(k=10, name=None): + return Index(rands_array(nchars=10, size=k), name=name) + + +def makeUnicodeIndex(k=10, name=None): + return Index(randu_array(nchars=10, size=k), name=name) + + +def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): + """ make a length k index or n categories """ + x = rands_array(nchars=4, size=n) + return CategoricalIndex( + Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs + ) + + +def makeIntervalIndex(k=10, name=None, **kwargs): + """ make a length k IntervalIndex """ + x = np.linspace(0, 100, num=(k + 1)) + return IntervalIndex.from_breaks(x, name=name, **kwargs) + + +def makeBoolIndex(k=10, name=None): + if k == 1: + return Index([True], name=name) + elif k == 2: + return Index([False, True], name=name) + return Index([False, True] + [False] * (k - 2), name=name) + + +def makeIntIndex(k=10, name=None): + return Index(list(range(k)), name=name) + + +def makeUIntIndex(k=10, name=None): + return Index([2 ** 63 + i for i in range(k)], name=name) + + +def makeRangeIndex(k=10, name=None, **kwargs): + return RangeIndex(0, k, 1, name=name, **kwargs) + + +def makeFloatIndex(k=10, name=None): + values = sorted(np.random.random_sample(k)) - np.random.random_sample(1) + return Index(values * (10 ** np.random.randint(0, 9)), name=name) + + +def makeDateIndex(k=10, freq="B", name=None, **kwargs): + dt = datetime(2000, 1, 1) + dr = bdate_range(dt, periods=k, freq=freq, name=name) + return DatetimeIndex(dr, name=name, **kwargs) + + +def makeTimedeltaIndex(k=10, freq="D", name=None, **kwargs): + return pd.timedelta_range(start="1 day", periods=k, freq=freq, name=name, **kwargs) + + +def makePeriodIndex(k=10, name=None, **kwargs): + dt = datetime(2000, 1, 1) + dr = pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) + return dr + + +def makeMultiIndex(k=10, names=None, **kwargs): + return MultiIndex.from_product((("foo", "bar"), (1, 2)), names=names, **kwargs) + + +_names = [ + "Alice", + "Bob", + "Charlie", + "Dan", + "Edith", + "Frank", + "George", + "Hannah", + "Ingrid", + "Jerry", + "Kevin", + "Laura", + "Michael", + "Norbert", + "Oliver", + "Patricia", + "Quinn", + "Ray", + 
"Sarah", + "Tim", + "Ursula", + "Victor", + "Wendy", + "Xavier", + "Yvonne", + "Zelda", +] + + +def _make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None): + """ + Make a DataFrame with a DatetimeIndex + + Parameters + ---------- + start : str or Timestamp, default "2000-01-01" + The start of the index. Passed to date_range with `freq`. + end : str or Timestamp, default "2000-12-31" + The end of the index. Passed to date_range with `freq`. + freq : str or Freq + The frequency to use for the DatetimeIndex + seed : int, optional + The random state seed. + + * name : object dtype with string names + * id : int dtype with + * x, y : float dtype + + Examples + -------- + >>> _make_timeseries() + id name x y + timestamp + 2000-01-01 982 Frank 0.031261 0.986727 + 2000-01-02 1025 Edith -0.086358 -0.032920 + 2000-01-03 982 Edith 0.473177 0.298654 + 2000-01-04 1009 Sarah 0.534344 -0.750377 + 2000-01-05 963 Zelda -0.271573 0.054424 + ... ... ... ... ... + 2000-12-27 980 Ingrid -0.132333 -0.422195 + 2000-12-28 972 Frank -0.376007 -0.298687 + 2000-12-29 1009 Ursula -0.865047 -0.503133 + 2000-12-30 1000 Hannah -0.063757 -0.507336 + 2000-12-31 972 Tim -0.869120 0.531685 + """ + index = pd.date_range(start=start, end=end, freq=freq, name="timestamp") + n = len(index) + state = np.random.RandomState(seed) + columns = { + "name": state.choice(_names, size=n), + "id": state.poisson(1000, size=n), + "x": state.rand(n) * 2 - 1, + "y": state.rand(n) * 2 - 1, + } + df = pd.DataFrame(columns, index=index, columns=sorted(columns)) + if df.index[-1] == end: + df = df.iloc[:-1] + return df + + +def all_index_generator(k=10): + """Generator which can be iterated over to get instances of all the various + index classes. + + Parameters + ---------- + k: length of each of the index instances + """ + all_make_index_funcs = [ + makeIntIndex, + makeFloatIndex, + makeStringIndex, + makeUnicodeIndex, + makeDateIndex, + makePeriodIndex, + makeTimedeltaIndex, + makeBoolIndex, + makeRangeIndex, + makeIntervalIndex, + makeCategoricalIndex, + ] + for make_index_func in all_make_index_funcs: + yield make_index_func(k=k) + + +def index_subclass_makers_generator(): + make_index_funcs = [ + makeDateIndex, + makePeriodIndex, + makeTimedeltaIndex, + makeRangeIndex, + makeIntervalIndex, + makeCategoricalIndex, + makeMultiIndex, + ] + for make_index_func in make_index_funcs: + yield make_index_func + + +def all_timeseries_index_generator(k=10): + """Generator which can be iterated over to get instances of all the classes + which represent time-series. 
+ + Parameters + ---------- + k: length of each of the index instances + """ + make_index_funcs = [makeDateIndex, makePeriodIndex, makeTimedeltaIndex] + for make_index_func in make_index_funcs: + yield make_index_func(k=k) + + +# make series +def makeFloatSeries(name=None): + index = makeStringIndex(N) + return Series(randn(N), index=index, name=name) + + +def makeStringSeries(name=None): + index = makeStringIndex(N) + return Series(randn(N), index=index, name=name) + + +def makeObjectSeries(name=None): + data = makeStringIndex(N) + data = Index(data, dtype=object) + index = makeStringIndex(N) + return Series(data, index=index, name=name) + + +def getSeriesData(): + index = makeStringIndex(N) + return {c: Series(randn(N), index=index) for c in getCols(K)} + + +def makeTimeSeries(nper=None, freq="B", name=None): + if nper is None: + nper = N + return Series(randn(nper), index=makeDateIndex(nper, freq=freq), name=name) + + +def makePeriodSeries(nper=None, name=None): + if nper is None: + nper = N + return Series(randn(nper), index=makePeriodIndex(nper), name=name) + + +def getTimeSeriesData(nper=None, freq="B"): + return {c: makeTimeSeries(nper, freq) for c in getCols(K)} + + +def getPeriodData(nper=None): + return {c: makePeriodSeries(nper) for c in getCols(K)} + + +# make frame +def makeTimeDataFrame(nper=None, freq="B"): + data = getTimeSeriesData(nper, freq) + return DataFrame(data) + + +def makeDataFrame(): + data = getSeriesData() + return DataFrame(data) + + +def getMixedTypeDict(): + index = Index(["a", "b", "c", "d", "e"]) + + data = { + "A": [0.0, 1.0, 2.0, 3.0, 4.0], + "B": [0.0, 1.0, 0.0, 1.0, 0.0], + "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], + "D": bdate_range("1/1/2009", periods=5), + } + + return index, data + + +def makeMixedDataFrame(): + return DataFrame(getMixedTypeDict()[1]) + + +def makePeriodFrame(nper=None): + data = getPeriodData(nper) + return DataFrame(data) + + +def makeCustomIndex( + nentries, nlevels, prefix="#", names=False, ndupe_l=None, idx_type=None +): + """Create an index/multiindex with given dimensions, levels, names, etc. + + nentries - number of entries in index + nlevels - number of levels (> 1 produces multiindex) + prefix - a string prefix for labels + names - (Optional), bool or list of strings. if True will use default + names, if False will use no names, if a list is given, the name of + each level in the index will be taken from the list. + ndupe_l - (Optional), list of ints, the number of rows for which the + label will be repeated at the corresponding level, you can specify just + the first few, the rest will use the default ndupe_l of 1. + len(ndupe_l) <= nlevels. + idx_type - "i"/"f"/"s"/"u"/"dt"/"p"/"td". + If idx_type is not None, `idx_nlevels` must be 1. + "i"/"f" creates an integer/float index, + "s"/"u" creates a string/unicode index + "dt" create a datetime index. + "td" create a timedelta index. + + if unspecified, string labels will be generated.
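+ + Example (an illustrative sketch, not run as a doctest; labels follow the + "{prefix}_l{level}_g{group}" pattern built below): + + # a flat Index(['C_l0_g0', 'C_l0_g1', 'C_l0_g2'], dtype='object') + >> makeCustomIndex(nentries=3, nlevels=1, prefix="C") + + # a 2-level MultiIndex with default level names "#0" and "#1" + >> makeCustomIndex(nentries=4, nlevels=2, names=True)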
+ """ + + if ndupe_l is None: + ndupe_l = [1] * nlevels + assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels + assert names is None or names is False or names is True or len(names) is nlevels + assert idx_type is None or ( + idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1 + ) + + if names is True: + # build default names + names = [prefix + str(i) for i in range(nlevels)] + if names is False: + # pass None to index constructor for no name + names = None + + # make singleton case uniform + if isinstance(names, str) and nlevels == 1: + names = [names] + + # specific 1D index type requested? + idx_func = dict( + i=makeIntIndex, + f=makeFloatIndex, + s=makeStringIndex, + u=makeUnicodeIndex, + dt=makeDateIndex, + td=makeTimedeltaIndex, + p=makePeriodIndex, + ).get(idx_type) + if idx_func: + idx = idx_func(nentries) + # but we need to fill in the name + if names: + idx.name = names[0] + return idx + elif idx_type is not None: + raise ValueError( + f"{repr(idx_type)} is not a legal value for `idx_type`, " + "use 'i'/'f'/'s'/'u'/'dt'/'p'/'td'." + ) + + if len(ndupe_l) < nlevels: + ndupe_l.extend([1] * (nlevels - len(ndupe_l))) + assert len(ndupe_l) == nlevels + + assert all(x > 0 for x in ndupe_l) + + tuples = [] + for i in range(nlevels): + + def keyfunc(x): + import re + + numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_") + return [int(num) for num in numeric_tuple] + + # build a list of lists to create the index from + div_factor = nentries // ndupe_l[i] + 1 + cnt = Counter() + for j in range(div_factor): + label = f"{prefix}_l{i}_g{j}" + cnt[label] = ndupe_l[i] + # cute Counter trick + result = sorted(cnt.elements(), key=keyfunc)[:nentries] + tuples.append(result) + + tuples = list(zip(*tuples)) + + # convert tuples to index + if nentries == 1: + # we have a single level of tuples, i.e. a regular Index + index = Index(tuples[0], name=names[0]) + elif nlevels == 1: + name = None if names is None else names[0] + index = Index((x[0] for x in tuples), name=name) + else: + index = MultiIndex.from_tuples(tuples, names=names) + return index + + +def makeCustomDataframe( + nrows, + ncols, + c_idx_names=True, + r_idx_names=True, + c_idx_nlevels=1, + r_idx_nlevels=1, + data_gen_f=None, + c_ndupe_l=None, + r_ndupe_l=None, + dtype=None, + c_idx_type=None, + r_idx_type=None, +): + """ + nrows, ncols - number of data rows/cols + c_idx_names, idx_names - False/True/list of strings, yields No names , + default names or uses the provided names for the levels of the + corresponding index. You can provide a single string when + c_idx_nlevels ==1. + c_idx_nlevels - number of levels in columns index. > 1 will yield MultiIndex + r_idx_nlevels - number of levels in rows index. > 1 will yield MultiIndex + data_gen_f - a function f(row,col) which return the data value + at that position, the default generator used yields values of the form + "RxCy" based on position. + c_ndupe_l, r_ndupe_l - list of integers, determines the number + of duplicates for each label at a given level of the corresponding + index. The default `None` value produces a multiplicity of 1 across + all levels, i.e. a unique index. Will accept a partial list of length + N < idx_nlevels, for just the first N levels. If ndupe doesn't divide + nrows/ncol, the last label might have lower multiplicity. + dtype - passed to the DataFrame constructor as is, in case you wish to + have more control in conjunction with a custom `data_gen_f` + r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt"/"td". 
+ If idx_type is not None, `idx_nlevels` must be 1. + "i"/"f" creates an integer/float index, + "s"/"u" creates a string/unicode index + "dt" create a datetime index. + "td" create a timedelta index. + + if unspecified, string labels will be generated. + + Examples: + + # 5 row, 3 columns, default names on both, single index on both axis + >> makeCustomDataframe(5,3) + + # make the data a random int between 1 and 100 + >> mkdf(5,3,data_gen_f=lambda r,c:randint(1,100)) + + # 2-level multiindex on rows with each label duplicated + # twice on first level, default names on both axis, single + # index on both axis + >> a=makeCustomDataframe(5,3,r_idx_nlevels=2,r_ndupe_l=[2]) + + # DatetimeIndex on row, index with unicode labels on columns + # no names on either axis + >> a=makeCustomDataframe(5,3,c_idx_names=False,r_idx_names=False, + r_idx_type="dt",c_idx_type="u") + + # 4-level multindex on rows with names provided, 2-level multindex + # on columns with default labels and default names. + >> a=makeCustomDataframe(5,3,r_idx_nlevels=4, + r_idx_names=["FEE","FI","FO","FAM"], + c_idx_nlevels=2) + + >> a=mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + """ + + assert c_idx_nlevels > 0 + assert r_idx_nlevels > 0 + assert r_idx_type is None or ( + r_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and r_idx_nlevels == 1 + ) + assert c_idx_type is None or ( + c_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and c_idx_nlevels == 1 + ) + + columns = makeCustomIndex( + ncols, + nlevels=c_idx_nlevels, + prefix="C", + names=c_idx_names, + ndupe_l=c_ndupe_l, + idx_type=c_idx_type, + ) + index = makeCustomIndex( + nrows, + nlevels=r_idx_nlevels, + prefix="R", + names=r_idx_names, + ndupe_l=r_ndupe_l, + idx_type=r_idx_type, + ) + + # by default, generate data based on location + if data_gen_f is None: + data_gen_f = lambda r, c: f"R{r}C{c}" + + data = [[data_gen_f(r, c) for c in range(ncols)] for r in range(nrows)] + + return DataFrame(data, index, columns, dtype=dtype) + + +def _create_missing_idx(nrows, ncols, density, random_state=None): + if random_state is None: + random_state = np.random + else: + random_state = np.random.RandomState(random_state) + + # below is cribbed from scipy.sparse + size = int(np.round((1 - density) * nrows * ncols)) + # generate a few more to ensure unique values + min_rows = 5 + fac = 1.02 + extra_size = min(size + min_rows, fac * size) + + def _gen_unique_rand(rng, _extra_size): + ind = rng.rand(int(_extra_size)) + return np.unique(np.floor(ind * nrows * ncols))[:size] + + ind = _gen_unique_rand(random_state, extra_size) + while ind.size < size: + extra_size *= 1.05 + ind = _gen_unique_rand(random_state, extra_size) + + j = np.floor(ind * 1.0 / nrows).astype(int) + i = (ind - j * nrows).astype(int) + return i.tolist(), j.tolist() + + +def makeMissingCustomDataframe( + nrows, + ncols, + density=0.9, + random_state=None, + c_idx_names=True, + r_idx_names=True, + c_idx_nlevels=1, + r_idx_nlevels=1, + data_gen_f=None, + c_ndupe_l=None, + r_ndupe_l=None, + dtype=None, + c_idx_type=None, + r_idx_type=None, +): + """ + Parameters + ---------- + Density : float, optional + Float in (0, 1) that gives the percentage of non-missing numbers in + the DataFrame. + random_state : {np.random.RandomState, int}, optional + Random number generator or random seed. + + See makeCustomDataframe for descriptions of the rest of the parameters. 
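+ + Example (an illustrative sketch, not run as a doctest): + + # a 5x3 frame of "RxCy" strings with roughly 10% of the cells set to + # np.nan, seeded so the missing positions are reproducible + >> makeMissingCustomDataframe(5, 3, density=0.9, random_state=42)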
+ """ + df = makeCustomDataframe( + nrows, + ncols, + c_idx_names=c_idx_names, + r_idx_names=r_idx_names, + c_idx_nlevels=c_idx_nlevels, + r_idx_nlevels=r_idx_nlevels, + data_gen_f=data_gen_f, + c_ndupe_l=c_ndupe_l, + r_ndupe_l=r_ndupe_l, + dtype=dtype, + c_idx_type=c_idx_type, + r_idx_type=r_idx_type, + ) + + i, j = _create_missing_idx(nrows, ncols, density, random_state) + df.values[i, j] = np.nan + return df + + +def makeMissingDataframe(density=0.9, random_state=None): + df = makeDataFrame() + i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) + df.values[i, j] = np.nan + return df + + +def optional_args(decorator): + """allows a decorator to take optional positional and keyword arguments. + Assumes that taking a single, callable, positional argument means that + it is decorating a function, i.e. something like this:: + + @my_decorator + def function(): pass + + Calls decorator with decorator(f, *args, **kwargs)""" + + @wraps(decorator) + def wrapper(*args, **kwargs): + def dec(f): + return decorator(f, *args, **kwargs) + + is_decorating = not kwargs and len(args) == 1 and callable(args[0]) + if is_decorating: + f = args[0] + args = [] + return dec(f) + else: + return dec + + return wrapper + + +# skip tests on exceptions with this message +_network_error_messages = ( + # 'urlopen error timed out', + # 'timeout: timed out', + # 'socket.timeout: timed out', + "timed out", + "Server Hangup", + "HTTP Error 503: Service Unavailable", + "502: Proxy Error", + "HTTP Error 502: internal error", + "HTTP Error 502", + "HTTP Error 503", + "HTTP Error 403", + "HTTP Error 400", + "Temporary failure in name resolution", + "Name or service not known", + "Connection refused", + "certificate verify", +) + +# or this e.errno/e.reason.errno +_network_errno_vals = ( + 101, # Network is unreachable + 111, # Connection refused + 110, # Connection timed out + 104, # Connection reset Error + 54, # Connection reset by peer + 60, # urllib.error.URLError: [Errno 60] Connection timed out +) + +# Both of the above shouldn't mask real issues such as 404's +# or refused connections (changed DNS). +# But some tests (test_data yahoo) contact incredibly flakey +# servers. + +# and conditionally raise on exception types in _get_default_network_errors + + +def _get_default_network_errors(): + # Lazy import for http.client because it imports many things from the stdlib + import http.client + + return (IOError, http.client.HTTPException, TimeoutError) + + +def can_connect(url, error_classes=None): + """Try to connect to the given url. True if succeeds, False if IOError + raised + + Parameters + ---------- + url : basestring + The URL to try to connect to + + Returns + ------- + connectable : bool + Return True if no IOError (unable to connect) or URLError (bad url) was + raised + """ + + if error_classes is None: + error_classes = _get_default_network_errors() + + try: + with urlopen(url): + pass + except error_classes: + return False + else: + return True + + +@optional_args +def network( + t, + url="http://www.google.com", + raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, + check_before_test=False, + error_classes=None, + skip_errnos=_network_errno_vals, + _skip_on_messages=_network_error_messages, +): + """ + Label a test as requiring network connection and, if an error is + encountered, only raise if it does not find a network connection. 
+ + In comparison to ``network``, this assumes an added contract to your test: + you must assert that, under normal conditions, your test will ONLY fail if + it does not have network connectivity. + + You can call this in 3 ways: as a standard decorator, with keyword + arguments, or with a positional argument that is the url to check. + + Parameters + ---------- + t : callable + The test requiring network connectivity. + url : path + The url to test via ``pandas.io.common.urlopen`` to check + for connectivity. Defaults to 'http://www.google.com'. + raise_on_error : bool + If True, never catches errors. + check_before_test : bool + If True, checks connectivity before running the test case. + error_classes : tuple or Exception + error classes to ignore. If not in ``error_classes``, raises the error. + defaults to IOError. Be careful about changing the error classes here. + skip_errnos : iterable of int + Any exception that has .errno or .reason.errno set to one + of these values will be skipped with an appropriate + message. + _skip_on_messages : iterable of string + any exception e for which one of the strings is + a substring of str(e) will be skipped with an appropriate + message. Intended to suppress errors where an errno isn't available. + + Notes + ----- + * ``raise_on_error`` supersedes ``check_before_test`` + + Returns + ------- + t : callable + The decorated test ``t``, with checks for connectivity errors. + + Example + ------- + + Tests decorated with @network will fail if it's possible to make a network + connection to another URL (defaults to google.com):: + + >>> from pandas._testing import network + >>> from pandas.io.common import urlopen + >>> @network + ... def test_network(): + ... with urlopen("rabbit://bonanza.com"): + ... pass + Traceback + ... + URLError: + + You can specify alternative URLs:: + + >>> @network("http://www.yahoo.com") + ... def test_something_with_yahoo(): + ... raise IOError("Failure Message") + >>> test_something_with_yahoo() + Traceback (most recent call last): + ... + IOError: Failure Message + + If you set check_before_test, it will check the url first and not run the + test on failure:: + + >>> @network("failing://url.blaher", check_before_test=True) + ... def test_something(): + ... print("I ran!") + ... raise ValueError("Failure") + >>> test_something() + Traceback (most recent call last): + ... + + Errors not related to networking will always be raised.
""" + from pytest import skip + + if error_classes is None: + error_classes = _get_default_network_errors() + + t.network = True + + @wraps(t) + def wrapper(*args, **kwargs): + if check_before_test and not raise_on_error: + if not can_connect(url, error_classes): + skip() + try: + return t(*args, **kwargs) + except Exception as err: + errno = getattr(err, "errno", None) + if not errno and hasattr(err, "reason"): + errno = getattr(err.reason, "errno", None) + + if errno in skip_errnos: + skip(f"Skipping test due to known errno and error {err}") + + e_str = str(err) + + if any(m.lower() in e_str.lower() for m in _skip_on_messages): + skip( + f"Skipping test because exception message is known and error {err}" + ) + + if not isinstance(err, error_classes): + raise + + if raise_on_error or can_connect(url, error_classes): + raise + else: + skip(f"Skipping test due to lack of connectivity and error {err}") + + return wrapper + + +with_connectivity_check = network + + +@contextmanager +def assert_produces_warning( + expected_warning=Warning, + filter_level="always", + clear=None, + check_stacklevel=True, + raise_on_extra_warnings=True, +): + """ + Context manager for running code expected to either raise a specific + warning, or not raise any warnings. Verifies that the code raises the + expected warning, and that it does not raise any other unexpected + warnings. It is basically a wrapper around ``warnings.catch_warnings``. + + Parameters + ---------- + expected_warning : {Warning, False, None}, default Warning + The type of warning expected to be raised. ``Warning`` is the base + class for all warnings. To check that no warning is raised, + specify ``False`` or ``None``. + filter_level : str or None, default "always" + Specifies whether warnings are ignored, displayed, or turned + into errors. + Valid values are: + + * "error" - turns matching warnings into exceptions + * "ignore" - discard the warning + * "always" - always emit a warning + * "default" - print the warning the first time it is generated + from each location + * "module" - print the warning the first time it is generated + from each module + * "once" - print the warning the first time it is generated + + clear : str, default None + If not ``None`` then remove any previously raised warnings from + the ``__warningsregistry__`` to ensure that no warning messages are + suppressed by this context manager. If ``None`` is specified, + the ``__warningsregistry__`` keeps track of which warnings have been + shown, and does not show them again. + check_stacklevel : bool, default True + If True, displays the line that called the function containing + the warning to show where the function is called. Otherwise, the + line that implements the function is displayed. + raise_on_extra_warnings : bool, default True + Whether extra warnings not of the type `expected_warning` should + cause the test to fail. + + Examples + -------- + >>> import warnings + >>> with assert_produces_warning(): + ... warnings.warn(UserWarning()) + ... + >>> with assert_produces_warning(False): + ... warnings.warn(RuntimeWarning()) + ... + Traceback (most recent call last): + ... + AssertionError: Caused unexpected warning(s): ['RuntimeWarning']. + >>> with assert_produces_warning(UserWarning): + ... warnings.warn(RuntimeWarning()) + Traceback (most recent call last): + ... + AssertionError: Did not see expected warning of class 'UserWarning'. + + .. warning:: This is *not* thread-safe.
+ """ + __tracebackhide__ = True + + with warnings.catch_warnings(record=True) as w: + + if clear is not None: + # make sure that we are clearing these warnings + # if they have happened before + # to guarantee that we will catch them + if not is_list_like(clear): + clear = [clear] + for m in clear: + try: + m.__warningregistry__.clear() + except AttributeError: + # module may not have __warningregistry__ + pass + + saw_warning = False + warnings.simplefilter(filter_level) + yield w + extra_warnings = [] + + for actual_warning in w: + if expected_warning and issubclass( + actual_warning.category, expected_warning + ): + saw_warning = True + + if check_stacklevel and issubclass( + actual_warning.category, (FutureWarning, DeprecationWarning) + ): + from inspect import getframeinfo, stack + + caller = getframeinfo(stack()[2][0]) + msg = ( + "Warning not set with correct stacklevel. " + f"File where warning is raised: {actual_warning.filename} != " + f"{caller.filename}. Warning message: {actual_warning.message}" + ) + assert actual_warning.filename == caller.filename, msg + else: + extra_warnings.append( + ( + actual_warning.category.__name__, + actual_warning.message, + actual_warning.filename, + actual_warning.lineno, + ) + ) + if expected_warning: + msg = ( + f"Did not see expected warning of class " + f"{repr(expected_warning.__name__)}" + ) + assert saw_warning, msg + if raise_on_extra_warnings and extra_warnings: + raise AssertionError( + f"Caused unexpected warning(s): {repr(extra_warnings)}" + ) + + +class RNGContext: + """ + Context manager to set the numpy random number generator speed. Returns + to the original value upon exiting the context manager. + + Parameters + ---------- + seed : int + Seed for numpy.random.seed + + Examples + -------- + + with RNGContext(42): + np.random.randn() + """ + + def __init__(self, seed): + self.seed = seed + + def __enter__(self): + + self.start_state = np.random.get_state() + np.random.seed(self.seed) + + def __exit__(self, exc_type, exc_value, traceback): + + np.random.set_state(self.start_state) + + +@contextmanager +def with_csv_dialect(name, **kwargs): + """ + Context manager to temporarily register a CSV dialect for parsing CSV. + + Parameters + ---------- + name : str + The name of the dialect. + kwargs : mapping + The parameters for the dialect. + + Raises + ------ + ValueError : the name of the dialect conflicts with a builtin one. + + See Also + -------- + csv : Python's CSV library. + """ + import csv + + _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"} + + if name in _BUILTIN_DIALECTS: + raise ValueError("Cannot override builtin dialect.") + + csv.register_dialect(name, **kwargs) + yield + csv.unregister_dialect(name) + + +@contextmanager +def use_numexpr(use, min_elements=None): + from pandas.core.computation import expressions as expr + + if min_elements is None: + min_elements = expr._MIN_ELEMENTS + + olduse = expr._USE_NUMEXPR + oldmin = expr._MIN_ELEMENTS + expr.set_use_numexpr(use) + expr._MIN_ELEMENTS = min_elements + yield + expr._MIN_ELEMENTS = oldmin + expr.set_use_numexpr(olduse) + + +def test_parallel(num_threads=2, kwargs_list=None): + """Decorator to run the same function multiple times in parallel. + + Parameters + ---------- + num_threads : int, optional + The number of times the function is run in parallel. + kwargs_list : list of dicts, optional + The list of kwargs to update original + function kwargs on different threads. + Notes + ----- + This decorator does not pass the return value of the decorated function. 
+ + Original from scikit-image: + + https://github.com/scikit-image/scikit-image/pull/1519 + + """ + + assert num_threads > 0 + has_kwargs_list = kwargs_list is not None + if has_kwargs_list: + assert len(kwargs_list) == num_threads + import threading + + def wrapper(func): + @wraps(func) + def inner(*args, **kwargs): + if has_kwargs_list: + update_kwargs = lambda i: dict(kwargs, **kwargs_list[i]) + else: + update_kwargs = lambda i: kwargs + threads = [] + for i in range(num_threads): + updated_kwargs = update_kwargs(i) + thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) + threads.append(thread) + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + return inner + + return wrapper + + +class SubclassedSeries(Series): + _metadata = ["testattr", "name"] + + @property + def _constructor(self): + return SubclassedSeries + + @property + def _constructor_expanddim(self): + return SubclassedDataFrame + + +class SubclassedDataFrame(DataFrame): + _metadata = ["testattr"] + + @property + def _constructor(self): + return SubclassedDataFrame + + @property + def _constructor_sliced(self): + return SubclassedSeries + + +class SubclassedCategorical(Categorical): + @property + def _constructor(self): + return SubclassedCategorical + + +@contextmanager +def set_timezone(tz: str): + """ + Context manager for temporarily setting a timezone. + + Parameters + ---------- + tz : str + A string representing a valid timezone. + + Examples + -------- + + >>> from datetime import datetime + >>> from dateutil.tz import tzlocal + >>> tzlocal().tzname(datetime.now()) + 'IST' + + >>> with set_timezone('US/Eastern'): + ... tzlocal().tzname(datetime.now()) + ... + 'EDT' + """ + + import os + import time + + def setTZ(tz): + if tz is None: + try: + del os.environ["TZ"] + except KeyError: + pass + else: + os.environ["TZ"] = tz + time.tzset() + + orig_tz = os.environ.get("TZ") + setTZ(tz) + try: + yield + finally: + setTZ(orig_tz) + + +def _make_skipna_wrapper(alternative, skipna_alternative=None): + """ + Create a function for calling on an array. + + Parameters + ---------- + alternative : function + The function to be called on the array with no NaNs. + Only used when 'skipna_alternative' is None. + skipna_alternative : function + The function to be called on the original array + + Returns + ------- + function + """ + if skipna_alternative: + + def skipna_wrapper(x): + return skipna_alternative(x.values) + + else: + + def skipna_wrapper(x): + nona = x.dropna() + if len(nona) == 0: + return np.nan + return alternative(nona) + + return skipna_wrapper + + +def convert_rows_list_to_csv_str(rows_list: List[str]): + """ + Convert list of CSV rows to single CSV-formatted string for current OS. + + This method is used for creating expected value of to_csv() method. + + Parameters + ---------- + rows_list : List[str] + Each element represents the row of csv. + + Returns + ------- + str + Expected output of to_csv() in current OS. 
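+ + Example (illustrative; the line terminator comes from ``os.linesep`` and is + shown here for a POSIX system): + + >> convert_rows_list_to_csv_str(["a,b", "1,2"]) + 'a,b\n1,2\n'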
+ """ + sep = os.linesep + expected = sep.join(rows_list) + sep + return expected diff --git a/pandas/_typing.py b/pandas/_typing.py index 69b08c581cff9..171b76b4d2c4b 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -2,10 +2,14 @@ from typing import ( IO, TYPE_CHECKING, + Any, AnyStr, + Callable, Collection, Dict, + Hashable, List, + Mapping, Optional, TypeVar, Union, @@ -21,23 +25,49 @@ from pandas.core.arrays.base import ExtensionArray # noqa: F401 from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 from pandas.core.indexes.base import Index # noqa: F401 - from pandas.core.series import Series # noqa: F401 from pandas.core.generic import NDFrame # noqa: F401 + from pandas import Interval # noqa: F401 + from pandas.core.series import Series # noqa: F401 + from pandas.core.frame import DataFrame # noqa: F401 +# array-like AnyArrayLike = TypeVar("AnyArrayLike", "ExtensionArray", "Index", "Series", np.ndarray) ArrayLike = TypeVar("ArrayLike", "ExtensionArray", np.ndarray) + +# scalars + +PythonScalar = Union[str, int, float, bool] DatetimeLikeScalar = TypeVar("DatetimeLikeScalar", "Period", "Timestamp", "Timedelta") +PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] +Scalar = Union[PythonScalar, PandasScalar] + +# other + Dtype = Union[str, np.dtype, "ExtensionDtype"] FilePathOrBuffer = Union[str, Path, IO[AnyStr]] +# FrameOrSeriesUnion means either a DataFrame or a Series. E.g. +# `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series +# is passed in, either a Series or DataFrame is returned, and if a DataFrame is passed +# in, either a DataFrame or a Series is returned. +FrameOrSeriesUnion = Union["DataFrame", "Series"] + +# FrameOrSeries is stricter and ensures that the same subclass of NDFrame always is +# used. E.g. `def func(a: FrameOrSeries) -> FrameOrSeries: ...` means that if a +# Series is passed into a function, a Series is always returned and if a DataFrame is +# passed in, a DataFrame is always returned. 
FrameOrSeries = TypeVar("FrameOrSeries", bound="NDFrame") -Scalar = Union[str, int, float, bool] + Axis = Union[str, int] +Label = Optional[Hashable] +Level = Union[Label, int] Ordered = Optional[bool] -JSONSerializable = Union[Scalar, List, Dict] - +JSONSerializable = Union[PythonScalar, List, Dict] Axes = Collection +# For functions like rename that convert one label to another +Renamer = Union[Mapping[Label, Any], Callable[[Label], Label]] + # to maintain type information across generic functions and parametrization -_T = TypeVar("_T") +T = TypeVar("T") diff --git a/pandas/_version.py b/pandas/_version.py index dfed9574c7cb0..66e756a4744c8 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -79,17 +79,17 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): if e.errno == errno.ENOENT: continue if verbose: - print("unable to run {dispcmd}".format(dispcmd=dispcmd)) + print(f"unable to run {dispcmd}") print(e) return None else: if verbose: - print("unable to find command, tried %s" % (commands,)) + print(f"unable to find command, tried {commands}") return None stdout = p.communicate()[0].strip().decode() if p.returncode != 0: if verbose: - print("unable to run {dispcmd} (error)".format(dispcmd=dispcmd)) + print(f"unable to run {dispcmd} (error)") return None return stdout @@ -101,10 +101,8 @@ def versions_from_parentdir(parentdir_prefix, root, verbose): if not dirname.startswith(parentdir_prefix): if verbose: print( - "guessing rootdir is '{root}', but '{dirname}' " - "doesn't start with prefix '{parentdir_prefix}'".format( - root=root, dirname=dirname, parentdir_prefix=parentdir_prefix - ) + f"guessing rootdir is '{root}', but '{dirname}' " + f"doesn't start with prefix '{parentdir_prefix}'" ) raise NotThisMethod("rootdir doesn't start with parentdir_prefix") return { @@ -163,15 +161,15 @@ def git_versions_from_keywords(keywords, tag_prefix, verbose): # "stabilization", as well as "HEAD" and "master". tags = {r for r in refs if re.search(r"\d", r)} if verbose: - print("discarding '{}', no digits".format(",".join(refs - tags))) + print(f"discarding '{','.join(refs - tags)}', no digits") if verbose: - print("likely tags: {}".format(",".join(sorted(tags)))) + print(f"likely tags: {','.join(sorted(tags))}") for ref in sorted(tags): # sorting will prefer e.g. "2.0" over "2.0rc1" if ref.startswith(tag_prefix): r = ref[len(tag_prefix) :] if verbose: - print("picking {r}".format(r=r)) + print(f"picking {r}") return { "version": r, "full-revisionid": keywords["full"].strip(), @@ -198,7 +196,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if not os.path.exists(os.path.join(root, ".git")): if verbose: - print("no .git in {root}".format(root=root)) + print(f"no .git in {root}") raise NotThisMethod("no .git directory") GITS = ["git"] @@ -240,17 +238,13 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) if not mo: # unparseable. Maybe git-describe is misbehaving? 
- pieces["error"] = ( - "unable to parse git-describe output: " - "'{describe_out}'".format(describe_out=describe_out) - ) + pieces["error"] = f"unable to parse git-describe output: '{describe_out}'" return pieces # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): - fmt = "tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" - msg = fmt.format(full_tag=full_tag, tag_prefix=tag_prefix) + msg = f"tag '{full_tag}' doesn't start with prefix '{tag_prefix}'" if verbose: print(msg) pieces["error"] = msg @@ -291,12 +285,12 @@ def render_pep440(pieces): rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: rendered += plus_or_dot(pieces) - rendered += "{:d}.g{}".format(pieces["distance"], pieces["short"]) + rendered += f"{pieces['distance']:d}.g{pieces['short']}" if pieces["dirty"]: rendered += ".dirty" else: # exception #1 - rendered = "0+untagged.{:d}.g{}".format(pieces["distance"], pieces["short"]) + rendered = f"0+untagged.{pieces['distance']:d}.g{pieces['short']}" if pieces["dirty"]: rendered += ".dirty" return rendered @@ -311,10 +305,10 @@ def render_pep440_pre(pieces): if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: - rendered += ".post.dev%d" % pieces["distance"] + rendered += f".post.dev{pieces['distance']:d}" else: # exception #1 - rendered = "0.post.dev%d" % pieces["distance"] + rendered = f"0.post.dev{pieces['distance']:d}" return rendered @@ -330,17 +324,17 @@ def render_pep440_post(pieces): if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: - rendered += ".post{:d}".format(pieces["distance"]) + rendered += f".post{pieces['distance']:d}" if pieces["dirty"]: rendered += ".dev0" rendered += plus_or_dot(pieces) - rendered += "g{}".format(pieces["short"]) + rendered += f"g{pieces['short']}" else: # exception #1 - rendered = "0.post%d" % pieces["distance"] + rendered = f"0.pos{pieces['distance']:d}" if pieces["dirty"]: rendered += ".dev0" - rendered += "+g{}".format(pieces["short"]) + rendered += f"+g{pieces['short']}" return rendered @@ -353,12 +347,12 @@ def render_pep440_old(pieces): if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] + rendered += f".post{pieces['distance']:d}" if pieces["dirty"]: rendered += ".dev0" else: # exception #1 - rendered = "0.post%d" % pieces["distance"] + rendered = f"0.post{pieces['distance']:d}" if pieces["dirty"]: rendered += ".dev0" return rendered @@ -374,7 +368,7 @@ def render_git_describe(pieces): if pieces["closest-tag"]: rendered = pieces["closest-tag"] if pieces["distance"]: - rendered += "-{:d}-g{}".format(pieces["distance"], pieces["short"]) + rendered += f"-{pieces['distance']:d}-g{pieces['short']}" else: # exception #1 rendered = pieces["short"] @@ -392,7 +386,7 @@ def render_git_describe_long(pieces): if pieces["closest-tag"]: rendered = pieces["closest-tag"] - rendered += "-{:d}-g{}".format(pieces["distance"], pieces["short"]) + rendered += f"-{pieces['distance']:d}-g{pieces['short']}" else: # exception #1 rendered = pieces["short"] @@ -426,7 +420,7 @@ def render(pieces, style): elif style == "git-describe-long": rendered = render_git_describe_long(pieces) else: - raise ValueError("unknown style '{style}'".format(style=style)) + raise ValueError(f"unknown style '{style}'") return { "version": rendered, diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py index d0a26864a1102..bebbb38b4aefa 100644 --- 
a/pandas/api/__init__.py +++ b/pandas/api/__init__.py @@ -1,2 +1,2 @@ """ public toolkit API """ -from . import extensions, indexers, types # noqa +from pandas.api import extensions, indexers, types # noqa diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index 573d700dac43d..3019dd0e9b371 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -1,13 +1,27 @@ -"""Public API for extending pandas objects.""" -from pandas.core.dtypes.dtypes import ( # noqa: F401 - ExtensionDtype, - register_extension_dtype, -) +""" +Public API for extending pandas objects. +""" + +from pandas._libs.lib import no_default -from pandas.core.accessor import ( # noqa: F401 +from pandas.core.dtypes.dtypes import ExtensionDtype, register_extension_dtype + +from pandas.core.accessor import ( register_dataframe_accessor, register_index_accessor, register_series_accessor, ) -from pandas.core.algorithms import take # noqa: F401 -from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin # noqa: F401 +from pandas.core.algorithms import take +from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin + +__all__ = [ + "no_default", + "ExtensionDtype", + "register_extension_dtype", + "register_dataframe_accessor", + "register_index_accessor", + "register_series_accessor", + "take", + "ExtensionArray", + "ExtensionScalarOpsMixin", +] diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py index a5d6bc07da3eb..10654eb0888ee 100644 --- a/pandas/api/indexers/__init__.py +++ b/pandas/api/indexers/__init__.py @@ -1,2 +1,8 @@ -"""Public API for Rolling Window Indexers""" -from pandas.core.window.indexers import BaseIndexer # noqa: F401 +""" +Public API for Rolling Window Indexers. +""" + +from pandas.core.indexers import check_bool_array_indexer +from pandas.core.window.indexers import BaseIndexer + +__all__ = ["check_bool_array_indexer", "BaseIndexer"] diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py index f32e1abe28cc1..3495b493707c2 100644 --- a/pandas/api/types/__init__.py +++ b/pandas/api/types/__init__.py @@ -1,12 +1,23 @@ -""" public toolkit API """ +""" +Public toolkit API. +""" -from pandas._libs.lib import infer_dtype # noqa: F401 +from pandas._libs.lib import infer_dtype from pandas.core.dtypes.api import * # noqa: F403, F401 -from pandas.core.dtypes.concat import union_categoricals # noqa: F401 -from pandas.core.dtypes.dtypes import ( # noqa: F401 +from pandas.core.dtypes.concat import union_categoricals +from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, IntervalDtype, PeriodDtype, ) + +__all__ = [ + "infer_dtype", + "union_categoricals", + "CategoricalDtype", + "DatetimeTZDtype", + "IntervalDtype", + "PeriodDtype", +] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 60cfecd5804ac..3547a33ea357b 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -110,8 +110,7 @@ def _import_lzma(): return lzma except ImportError: msg = ( - "Could not import the lzma module. " - "Your installed Python is incomplete. " + "Could not import the lzma module. Your installed Python is incomplete. " "Attempting to use lzma compression will result in a RuntimeError." 
) warnings.warn(msg) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 412293f029fa5..7aeb0327139f1 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -16,17 +16,19 @@ "odfpy": "1.3.0", "openpyxl": "2.5.7", "pandas_gbq": "0.8.0", - "pyarrow": "0.12.0", + "pyarrow": "0.13.0", "pytables": "3.4.2", "pytest": "5.0.1", "s3fs": "0.3.0", "scipy": "0.19.0", "sqlalchemy": "1.1.4", "tables": "3.4.2", + "tabulate": "0.8.3", "xarray": "0.8.2", "xlrd": "1.1.0", "xlwt": "1.2.0", "xlsxwriter": "0.9.8", + "numba": "0.46.0", } diff --git a/pandas/compat/chainmap.py b/pandas/compat/chainmap.py index 479eddf0c0536..588bd24ddf797 100644 --- a/pandas/compat/chainmap.py +++ b/pandas/compat/chainmap.py @@ -1,15 +1,24 @@ -from collections import ChainMap +from typing import ChainMap, MutableMapping, TypeVar, cast +_KT = TypeVar("_KT") +_VT = TypeVar("_VT") -class DeepChainMap(ChainMap): - def __setitem__(self, key, value): + +class DeepChainMap(ChainMap[_KT, _VT]): + """Variant of ChainMap that allows direct updates to inner scopes. + + Only works when all passed mapping are mutable. + """ + + def __setitem__(self, key: _KT, value: _VT) -> None: for mapping in self.maps: - if key in mapping: - mapping[key] = value + mutable_mapping = cast(MutableMapping[_KT, _VT], mapping) + if key in mutable_mapping: + mutable_mapping[key] = value return - self.maps[0][key] = value + cast(MutableMapping[_KT, _VT], self.maps[0])[key] = value - def __delitem__(self, key): + def __delitem__(self, key: _KT) -> None: """ Raises ------ @@ -17,7 +26,8 @@ def __delitem__(self, key): If `key` doesn't exist. """ for mapping in self.maps: + mutable_mapping = cast(MutableMapping[_KT, _VT], mapping) if key in mapping: - del mapping[key] + del mutable_mapping[key] return raise KeyError(key) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 27f1c32058941..6c9ac5944e6a1 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -18,11 +18,9 @@ if _nlv < "1.13.3": raise ImportError( - f"this version of pandas is incompatible with " - f"numpy < 1.13.3\n" + "this version of pandas is incompatible with numpy < 1.13.3\n" f"your numpy version is {_np_version}.\n" - f"Please upgrade numpy to >= 1.13.3 to use " - f"this pandas version" + "Please upgrade numpy to >= 1.13.3 to use this pandas version" ) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index fffe09a74571e..05ecccc67daef 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -33,13 +33,26 @@ class CompatValidator: - def __init__(self, defaults, fname=None, method=None, max_fname_arg_count=None): + def __init__( + self, + defaults, + fname=None, + method: Optional[str] = None, + max_fname_arg_count=None, + ): self.fname = fname self.method = method self.defaults = defaults self.max_fname_arg_count = max_fname_arg_count - def __call__(self, args, kwargs, fname=None, max_fname_arg_count=None, method=None): + def __call__( + self, + args, + kwargs, + fname=None, + max_fname_arg_count=None, + method: Optional[str] = None, + ) -> None: if args or kwargs: fname = self.fname if fname is None else fname max_fname_arg_count = ( @@ -169,13 +182,6 @@ def validate_clip_with_axis(axis, args, kwargs): return axis -COMPRESS_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() -COMPRESS_DEFAULTS["axis"] = None -COMPRESS_DEFAULTS["out"] = None -validate_compress = CompatValidator( - COMPRESS_DEFAULTS, fname="compress", method="both", 
max_fname_arg_count=1 -) - CUM_FUNC_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() CUM_FUNC_DEFAULTS["dtype"] = None CUM_FUNC_DEFAULTS["out"] = None @@ -307,7 +313,7 @@ def validate_take_with_convert(convert, args, kwargs): ) -def validate_window_func(name, args, kwargs): +def validate_window_func(name, args, kwargs) -> None: numpy_args = ("axis", "dtype", "out") msg = ( f"numpy operations are not valid with window objects. " @@ -322,7 +328,7 @@ def validate_window_func(name, args, kwargs): raise UnsupportedFunctionCall(msg) -def validate_rolling_func(name, args, kwargs): +def validate_rolling_func(name, args, kwargs) -> None: numpy_args = ("axis", "dtype", "out") msg = ( f"numpy operations are not valid with window objects. " @@ -337,7 +343,7 @@ def validate_rolling_func(name, args, kwargs): raise UnsupportedFunctionCall(msg) -def validate_expanding_func(name, args, kwargs): +def validate_expanding_func(name, args, kwargs) -> None: numpy_args = ("axis", "dtype", "out") msg = ( f"numpy operations are not valid with window objects. " @@ -352,7 +358,7 @@ def validate_expanding_func(name, args, kwargs): raise UnsupportedFunctionCall(msg) -def validate_groupby_func(name, args, kwargs, allowed=None): +def validate_groupby_func(name, args, kwargs, allowed=None) -> None: """ 'args' and 'kwargs' should be empty, except for allowed kwargs because all of @@ -366,16 +372,15 @@ def validate_groupby_func(name, args, kwargs, allowed=None): if len(args) + len(kwargs) > 0: raise UnsupportedFunctionCall( - f"numpy operations are not valid with " - f"groupby. Use .groupby(...).{name}() " - f"instead" + "numpy operations are not valid with groupby. " + f"Use .groupby(...).{name}() instead" ) RESAMPLER_NUMPY_OPS = ("min", "max", "sum", "prod", "mean", "std", "var") -def validate_resampler_func(method, args, kwargs): +def validate_resampler_func(method: str, args, kwargs) -> None: """ 'args' and 'kwargs' should be empty because all of their necessary parameters are explicitly listed in @@ -384,15 +389,14 @@ def validate_resampler_func(method, args, kwargs): if len(args) + len(kwargs) > 0: if method in RESAMPLER_NUMPY_OPS: raise UnsupportedFunctionCall( - f"numpy operations are not " - f"valid with resample. Use " - f".resample(...).{method}() instead" + "numpy operations are not valid with resample. " + f"Use .resample(...).{method}() instead" ) else: raise TypeError("too many arguments passed in") -def validate_minmax_axis(axis): +def validate_minmax_axis(axis: Optional[int]) -> None: """ Ensure that the axis argument passed to min, max, argmin, or argmax is zero or None, as otherwise it will be incorrectly ignored. diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index e8fd390456f82..0a1a1376bfc8d 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -169,9 +169,9 @@ def __new__(cls) -> "DataFrame": # type: ignore # our Unpickler sub-class to override methods and some dispatcher -# functions for compat - +# functions for compat and uses a non-public class of the pickle module. 
+# error: Name 'pkl._Unpickler' is not defined class Unpickler(pkl._Unpickler): # type: ignore def find_class(self, module, name): # override superclass diff --git a/pandas/conftest.py b/pandas/conftest.py index 0a3bf31cf9666..0c964452df5da 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,3 +1,4 @@ +from collections import abc from datetime import date, time, timedelta, timezone from decimal import Decimal import operator @@ -14,8 +15,8 @@ import pandas as pd from pandas import DataFrame +import pandas._testing as tm from pandas.core import ops -import pandas.util.testing as tm hypothesis.settings.register_profile( "ci", @@ -64,25 +65,28 @@ def pytest_runtest_setup(item): pytest.skip("skipping high memory test since --run-high-memory was not set") -# Configurations for all tests and all test modules - - @pytest.fixture(autouse=True) def configure_tests(): + """ + Configure settings for all tests and test modules. + """ pd.set_option("chained_assignment", "raise") -# For running doctests: make np and pd names available - - @pytest.fixture(autouse=True) def add_imports(doctest_namespace): + """ + Make `np` and `pd` names available for doctests. + """ doctest_namespace["np"] = np doctest_namespace["pd"] = pd @pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) def spmatrix(request): + """ + Yields scipy sparse matrix classes. + """ from scipy import sparse return getattr(sparse, request.param + "_matrix") @@ -91,8 +95,8 @@ def spmatrix(request): @pytest.fixture(params=[0, 1, "index", "columns"], ids=lambda x: f"axis {repr(x)}") def axis(request): """ - Fixture for returning the axis numbers of a DataFrame. - """ + Fixture for returning the axis numbers of a DataFrame. + """ return request.param @@ -236,6 +240,10 @@ def all_boolean_reductions(request): @pytest.fixture(params=list(_cython_table)) def cython_table_items(request): + """ + Yields a tuple of a function and its corresponding name. Correspond to + the list of aggregator "Cython functions" used on selected table items. + """ return request.param @@ -336,6 +344,9 @@ def writable(request): @pytest.fixture(scope="module") def datetime_tz_utc(): + """ + Yields the UTC timezone object from the datetime module. + """ return timezone.utc @@ -357,6 +368,9 @@ def join_type(request): @pytest.fixture def strict_data_files(pytestconfig): + """ + Returns the configuration for the test setting `--strict-data-files`. + """ return pytestconfig.getoption("--strict-data-files") @@ -894,3 +908,38 @@ def index_or_series(request): See GH#29725 """ return request.param + + +@pytest.fixture +def dict_subclass(): + """ + Fixture for a dictionary subclass. + """ + + class TestSubDict(dict): + def __init__(self, *args, **kwargs): + dict.__init__(self, *args, **kwargs) + + return TestSubDict + + +@pytest.fixture +def non_mapping_dict_subclass(): + """ + Fixture for a non-mapping dictionary subclass. 
+ """ + + class TestNonDictMapping(abc.Mapping): + def __init__(self, underlying_dict): + self._data = underlying_dict + + def __getitem__(self, key): + return self._data.__getitem__(key) + + def __iter__(self): + return self._data.__iter__() + + def __len__(self): + return self._data.__len__() + + return TestNonDictMapping diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 42cfd9d54ac19..59256f6924b79 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -3,7 +3,7 @@ intended for public consumption """ from textwrap import dedent -from typing import Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union from warnings import catch_warnings, simplefilter, warn import numpy as np @@ -50,6 +50,9 @@ from pandas.core.construction import array, extract_array from pandas.core.indexers import validate_indices +if TYPE_CHECKING: + from pandas import Series + _shared_docs: Dict[str, str] = {} @@ -198,7 +201,7 @@ def _ensure_arraylike(values): """ if not is_array_like(values): inferred = lib.infer_dtype(values, skipna=False) - if inferred in ["mixed", "string", "unicode"]: + if inferred in ["mixed", "string"]: if isinstance(values, tuple): values = list(values) values = construct_1d_object_array_from_listlike(values) @@ -651,7 +654,7 @@ def value_counts( normalize: bool = False, bins=None, dropna: bool = True, -) -> ABCSeries: +) -> "Series": """ Compute a histogram of the counts of non-null values. @@ -793,7 +796,7 @@ def duplicated(values, keep="first") -> np.ndarray: return f(values, keep=keep) -def mode(values, dropna: bool = True) -> ABCSeries: +def mode(values, dropna: bool = True) -> "Series": """ Returns the mode(s) of an array. diff --git a/pandas/core/api.py b/pandas/core/api.py index 5261801600111..b0b65f9d0be34 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -1,7 +1,5 @@ # flake8: noqa -import numpy as np - from pandas._libs import NaT, Period, Timedelta, Timestamp from pandas._libs.missing import NA diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 14a3c3c008e92..ca1be3154757a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,10 +1,11 @@ import abc import inspect -from typing import TYPE_CHECKING, Any, Dict, Iterator, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, Type, Union import numpy as np from pandas._libs import reduction as libreduction +from pandas._typing import Axis from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( @@ -26,9 +27,9 @@ def frame_apply( obj: "DataFrame", func, - axis=0, + axis: Axis = 0, raw: bool = False, - result_type=None, + result_type: Optional[str] = None, ignore_failures: bool = False, args=None, kwds=None, @@ -87,7 +88,7 @@ def __init__( obj: "DataFrame", func, raw: bool, - result_type, + result_type: Optional[str], ignore_failures: bool, args, kwds, diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index df26cd94b5ed9..bf3469924a700 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -1,16 +1,36 @@ -from .base import ( # noqa: F401 +from pandas.core.arrays.base import ( ExtensionArray, ExtensionOpsMixin, ExtensionScalarOpsMixin, try_cast_to_ea, ) -from .boolean import BooleanArray # noqa: F401 -from .categorical import Categorical # noqa: F401 -from .datetimes import DatetimeArray # noqa: F401 -from .integer import IntegerArray, integer_array # noqa: F401 -from .interval import IntervalArray # 
noqa: F401 -from .numpy_ import PandasArray, PandasDtype # noqa: F401 -from .period import PeriodArray, period_array # noqa: F401 -from .sparse import SparseArray # noqa: F401 -from .string_ import StringArray # noqa: F401 -from .timedeltas import TimedeltaArray # noqa: F401 +from pandas.core.arrays.boolean import BooleanArray +from pandas.core.arrays.categorical import Categorical +from pandas.core.arrays.datetimes import DatetimeArray +from pandas.core.arrays.integer import IntegerArray, integer_array +from pandas.core.arrays.interval import IntervalArray +from pandas.core.arrays.numpy_ import PandasArray, PandasDtype +from pandas.core.arrays.period import PeriodArray, period_array +from pandas.core.arrays.sparse import SparseArray +from pandas.core.arrays.string_ import StringArray +from pandas.core.arrays.timedeltas import TimedeltaArray + +__all__ = [ + "ExtensionArray", + "ExtensionOpsMixin", + "ExtensionScalarOpsMixin", + "try_cast_to_ea", + "BooleanArray", + "Categorical", + "DatetimeArray", + "IntegerArray", + "integer_array", + "IntervalArray", + "PandasArray", + "PandasDtype", + "PeriodArray", + "period_array", + "SparseArray", + "StringArray", + "TimedeltaArray", +] diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py new file mode 100644 index 0000000000000..e0d33bebeb421 --- /dev/null +++ b/pandas/core/arrays/_arrow_utils.py @@ -0,0 +1,124 @@ +from distutils.version import LooseVersion +import json + +import numpy as np +import pyarrow + +from pandas.core.arrays.interval import _VALID_CLOSED + +_pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") + + +def pyarrow_array_to_numpy_and_mask(arr, dtype): + """ + Convert a primitive pyarrow.Array to a numpy array and boolean mask based + on the buffers of the Array. 
+ + Parameters + ---------- + arr : pyarrow.Array + dtype : numpy.dtype + + Returns + ------- + (data, mask) + Tuple of two numpy arrays with the raw data (with specified dtype) and + a boolean mask (validity mask, so False means missing) + """ + buflist = arr.buffers() + data = np.frombuffer(buflist[1], dtype=dtype)[arr.offset : arr.offset + len(arr)] + bitmask = buflist[0] + if bitmask is not None: + mask = pyarrow.BooleanArray.from_buffers( + pyarrow.bool_(), len(arr), [None, bitmask] + ) + mask = np.asarray(mask) + else: + mask = np.ones(len(arr), dtype=bool) + return data, mask + + +if _pyarrow_version_ge_015: + # the pyarrow extension types are only available for pyarrow 0.15+ + + class ArrowPeriodType(pyarrow.ExtensionType): + def __init__(self, freq): + # attributes need to be set first before calling + # super init (as that calls serialize) + self._freq = freq + pyarrow.ExtensionType.__init__(self, pyarrow.int64(), "pandas.period") + + @property + def freq(self): + return self._freq + + def __arrow_ext_serialize__(self): + metadata = {"freq": self.freq} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + return ArrowPeriodType(metadata["freq"]) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return type(self) == type(other) and self.freq == other.freq + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), self.freq)) + + # register the type with a dummy instance + _period_type = ArrowPeriodType("D") + pyarrow.register_extension_type(_period_type) + + class ArrowIntervalType(pyarrow.ExtensionType): + def __init__(self, subtype, closed): + # attributes need to be set first before calling + # super init (as that calls serialize) + assert closed in _VALID_CLOSED + self._closed = closed + if not isinstance(subtype, pyarrow.DataType): + subtype = pyarrow.type_for_alias(str(subtype)) + self._subtype = subtype + + storage_type = pyarrow.struct([("left", subtype), ("right", subtype)]) + pyarrow.ExtensionType.__init__(self, storage_type, "pandas.interval") + + @property + def subtype(self): + return self._subtype + + @property + def closed(self): + return self._closed + + def __arrow_ext_serialize__(self): + metadata = {"subtype": str(self.subtype), "closed": self.closed} + return json.dumps(metadata).encode() + + @classmethod + def __arrow_ext_deserialize__(cls, storage_type, serialized): + metadata = json.loads(serialized.decode()) + subtype = pyarrow.type_for_alias(metadata["subtype"]) + closed = metadata["closed"] + return ArrowIntervalType(subtype, closed) + + def __eq__(self, other): + if isinstance(other, pyarrow.BaseExtensionType): + return ( + type(self) == type(other) + and self.subtype == other.subtype + and self.closed == other.closed + ) + else: + return NotImplemented + + def __hash__(self): + return hash((str(self), str(self.subtype), self.closed)) + + # register the type with a dummy instance + _interval_type = ArrowIntervalType(pyarrow.int64(), "left") + pyarrow.register_extension_type(_interval_type) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 96a4eb1b3bf32..c3c91cea43f6b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -10,6 +10,7 @@ import numpy as np +from pandas._libs import lib from pandas._typing import ArrayLike from pandas.compat import set_function_name from pandas.compat.numpy import function as nv @@ -175,6 +176,9 @@ class 
ExtensionArray: types present. See :ref:`extending.extension.ufunc` for more. + + By default, ExtensionArrays are not hashable. Immutable subclasses may + override this behavior. """ # '_typ' is for pandas.core.dtypes.generic.ABCExtensionArray. @@ -350,6 +354,39 @@ def __iter__(self): for i in range(len(self)): yield self[i] + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): + """ + Convert to a NumPy ndarray. + + .. versionadded:: 1.0.0 + + This is similar to :meth:`numpy.asarray`, but may provide additional control + over how the conversion is done. + + Parameters + ---------- + dtype : str or numpy.dtype, optional + The dtype to pass to :meth:`numpy.asarray`. + copy : bool, default False + Whether to ensure that the returned value is not a view on + another array. Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that + a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + + Returns + ------- + numpy.ndarray + """ + result = np.asarray(self, dtype=dtype) + if copy or na_value is not lib.no_default: + result = result.copy() + if na_value is not lib.no_default: + result[self.isna()] = na_value + return result + # ------------------------------------------------------------------------ # Required attributes # ------------------------------------------------------------------------ @@ -1039,6 +1076,9 @@ def _reduce(self, name, skipna=True, **kwargs): """ raise TypeError(f"cannot perform {name} with type {self.dtype}") + def __hash__(self): + raise TypeError(f"unhashable type: {repr(type(self).__name__)}") + class ExtensionOpsMixin: """ diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index a8fcd6d03847c..eaa17df1235d3 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -15,7 +15,6 @@ is_extension_array_dtype, is_float, is_float_dtype, - is_integer, is_integer_dtype, is_list_like, is_numeric_dtype, @@ -27,8 +26,8 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops, ops -from pandas.core.algorithms import take -from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin + +from .masked import BaseMaskedArray if TYPE_CHECKING: from pandas._typing import Scalar @@ -60,6 +59,8 @@ class BooleanDtype(ExtensionDtype): BooleanDtype """ + name = "boolean" + @property def na_value(self) -> "Scalar": """ @@ -79,19 +80,6 @@ def type(self) -> Type: def kind(self) -> str: return "b" - @property - def name(self) -> str: - """ - The alias for BooleanDtype is ``'boolean'``. - """ - return "boolean" - - @classmethod - def construct_from_string(cls, string: str) -> ExtensionDtype: - if string == "boolean": - return cls() - return super().construct_from_string(string) - @classmethod def construct_array_type(cls) -> "Type[BooleanArray]": return BooleanArray @@ -206,7 +194,7 @@ def coerce_to_array(values, mask=None, copy: bool = False): return values, mask -class BooleanArray(ExtensionArray, ExtensionOpsMixin): +class BooleanArray(BaseMaskedArray): """ Array of boolean (True/False) data with missing values.
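A minimal usage sketch of the `to_numpy` conversion added above, assuming pandas >= 1.0 with the nullable boolean dtype available; the masked subclasses provide their own implementation, but the `dtype`, `copy` and `na_value` parameters carry the same meaning, and the example data here is made up:

import numpy as np
import pandas as pd

arr = pd.array([True, False, None], dtype="boolean")  # BooleanArray with one missing entry

# Without na_value the missing entry is kept, so object dtype is the natural target.
obj = arr.to_numpy(dtype="object")                    # array([True, False, <NA>], dtype=object)

# Supplying na_value lets the caller choose a fill, enabling a non-object dtype.
flt = arr.to_numpy(dtype="float64", na_value=np.nan)  # array([1., 0., nan])
raw = arr.to_numpy(dtype="bool", na_value=False)      # array([True, False, False])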
@@ -256,10 +244,13 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin): >>> pd.array([True, False, None], dtype="boolean") - [True, False, NA] + [True, False, ] Length: 3, dtype: boolean """ + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = False + def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype == np.bool_): raise TypeError( @@ -304,59 +295,6 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: def _from_factorized(cls, values, original: "BooleanArray"): return cls._from_sequence(values, dtype=original.dtype) - def _formatter(self, boxed=False): - return str - - def __getitem__(self, item): - if is_integer(item): - if self._mask[item]: - return self.dtype.na_value - return self._data[item] - return type(self)(self._data[item], self._mask[item]) - - def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA): - """ - Coerce to an ndarray of object dtype or bool dtype (if force_bool=True). - - Parameters - ---------- - dtype : dtype, default object - The numpy dtype to convert to - na_value : scalar, optional - Scalar missing value indicator to use in numpy array. Defaults - to the native missing value indicator of this array (pd.NA). - """ - if dtype is None: - dtype = object - if is_bool_dtype(dtype): - if not self.isna().any(): - return self._data - else: - raise ValueError( - "cannot convert to bool numpy array in presence of missing values" - ) - data = self._data.astype(dtype) - data[self._mask] = na_value - return data - - __array_priority__ = 1000 # higher than ndarray so ops dispatch to us - - def __array__(self, dtype=None): - """ - the array interface, return my values - We return an object array here to preserve our scalar values - """ - # by default (no dtype specified), return an object array - return self._coerce_to_ndarray(dtype=dtype) - - def __arrow_array__(self, type=None): - """ - Convert myself into a pyarrow Array. - """ - import pyarrow as pa - - return pa.array(self._data, mask=self._mask, type=type) - _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @@ -404,40 +342,6 @@ def reconstruct(x): else: return reconstruct(result) - def __iter__(self): - for i in range(len(self)): - if self._mask[i]: - yield self.dtype.na_value - else: - yield self._data[i] - - def take(self, indexer, allow_fill=False, fill_value=None): - # we always fill with False internally - # to avoid upcasting - data_fill_value = False if isna(fill_value) else fill_value - result = take( - self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill - ) - - mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill) - - # if we are filling - # we only fill where the indexer is null - # not existing missing values - # TODO(jreback) what if we have a non-na float as a fill value? 
- if allow_fill and notna(fill_value): - fill_mask = np.asarray(indexer) == -1 - result[fill_mask] = fill_value - mask = mask ^ fill_mask - - return type(self)(result, mask, copy=False) - - def copy(self): - data, mask = self._data, self._mask - data = data.copy() - mask = mask.copy() - return type(self)(data, mask, copy=False) - def __setitem__(self, key, value): _is_scalar = is_scalar(value) if _is_scalar: @@ -451,26 +355,6 @@ def __setitem__(self, key, value): self._data[key] = value self._mask[key] = mask - def __len__(self): - return len(self._data) - - @property - def nbytes(self): - return self._data.nbytes + self._mask.nbytes - - def isna(self): - return self._mask - - @property - def _na_value(self): - return self._dtype.na_value - - @classmethod - def _concat_same_type(cls, to_concat): - data = np.concatenate([x._data for x in to_concat]) - mask = np.concatenate([x._mask for x in to_concat]) - return cls(data, mask) - def astype(self, dtype, copy=True): """ Cast to a NumPy array or ExtensionArray with 'dtype'. @@ -503,7 +387,7 @@ def astype(self, dtype, copy=True): if is_bool_dtype(dtype): # astype_nansafe converts np.nan to True - if self.isna().any(): + if self._hasna: raise ValueError("cannot convert float NaN to bool") else: return self._data.astype(dtype, copy=copy) @@ -515,7 +399,7 @@ def astype(self, dtype, copy=True): ) # for integer, error if there are missing values if is_integer_dtype(dtype): - if self.isna().any(): + if self._hasna: raise ValueError("cannot convert NA to integer") # for float dtype, ensure we use np.nan before casting (numpy cannot # deal with pd.NA) @@ -523,54 +407,8 @@ def astype(self, dtype, copy=True): if is_float_dtype(dtype): na_value = np.nan # coerce - data = self._coerce_to_ndarray(na_value=na_value) - return astype_nansafe(data, dtype, copy=None) - - def value_counts(self, dropna=True): - """ - Returns a Series containing counts of each category. - - Every category will have an entry, even those with a count of 0. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of NaN. 
- - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - - """ - - from pandas import Index, Series - - # compute counts on the data with no nans - data = self._data[~self._mask] - value_counts = Index(data).value_counts() - array = value_counts.values - - # TODO(extension) - # if we have allow Index to hold an ExtensionArray - # this is easier - index = value_counts.index.values.astype(bool).astype(object) - - # if we want nans, count the mask - if not dropna: - - # TODO(extension) - # appending to an Index *always* infers - # w/o passing the dtype - array = np.append(array, [self._mask.sum()]) - index = Index( - np.concatenate([index, np.array([np.nan], dtype=object)]), dtype=object - ) - - return Series(array, index=index) + data = self.to_numpy(na_value=na_value) + return astype_nansafe(data, dtype, copy=False) def _values_for_argsort(self) -> np.ndarray: """ @@ -643,7 +481,7 @@ def any(self, skipna: bool = True, **kwargs): >>> pd.array([True, False, pd.NA]).any(skipna=False) True >>> pd.array([False, False, pd.NA]).any(skipna=False) - NA + """ kwargs.pop("axis", None) nv.validate_any((), kwargs) @@ -708,7 +546,7 @@ def all(self, skipna: bool = True, **kwargs): required (whether ``pd.NA`` is True or False influences the result): >>> pd.array([True, True, pd.NA]).all(skipna=False) - NA + >>> pd.array([True, False, pd.NA]).all(skipna=False) False """ @@ -730,7 +568,6 @@ def all(self, skipna: bool = True, **kwargs): @classmethod def _create_logical_method(cls, op): def logical_method(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): # Rely on pandas to unbox and dispatch to us. return NotImplemented @@ -755,9 +592,8 @@ def logical_method(self, other): if other_is_scalar and not (other is libmissing.NA or lib.is_bool(other)): raise TypeError( - "'other' should be pandas.NA or a bool. Got {} instead.".format( - type(other).__name__ - ) + "'other' should be pandas.NA or a bool. " + f"Got {type(other).__name__} instead." ) if not other_is_scalar and len(self) != len(other): @@ -772,14 +608,17 @@ def logical_method(self, other): return BooleanArray(result, mask) - name = "__{name}__".format(name=op.__name__) + name = f"__{op.__name__}__" return set_function_name(logical_method, name, cls) @classmethod def _create_comparison_method(cls, op): def cmp_method(self, other): + from pandas.arrays import IntegerArray - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): + if isinstance( + other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray) + ): # Rely on pandas to unbox and dispatch to us. 
return NotImplemented @@ -819,7 +658,7 @@ def cmp_method(self, other): return BooleanArray(result, mask, copy=False) - name = "__{name}__".format(name=op.__name__) + name = f"__{op.__name__}" return set_function_name(cmp_method, name, cls) def _reduce(self, name, skipna=True, **kwargs): @@ -831,13 +670,15 @@ def _reduce(self, name, skipna=True, **kwargs): mask = self._mask # coerce to a nan-aware float if needed - if mask.any(): - data = self._data.astype("float64") - data[mask] = np.nan + if self._hasna: + data = self.to_numpy("float64", na_value=np.nan) op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + if np.isnan(result): + return libmissing.NA + # if we have numeric op that would result in an int, coerce to int if possible if name in ["sum", "prod"] and notna(result): int_result = np.int64(result) @@ -922,7 +763,7 @@ def boolean_arithmetic_method(self, other): return self._maybe_mask_result(result, mask, other, op_name) - name = "__{name}__".format(name=op_name) + name = f"__{op_name}__" return set_function_name(boolean_arithmetic_method, name, cls) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4d6be8221557d..9d7359dd9c614 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1,6 +1,6 @@ import operator from shutil import get_terminal_size -from typing import Type, Union, cast +from typing import Dict, Hashable, List, Type, Union, cast from warnings import warn import numpy as np @@ -8,7 +8,7 @@ from pandas._config import get_option from pandas._libs import algos as libalgos, hashtable as htable -from pandas._typing import ArrayLike, Dtype, Ordered +from pandas._typing import ArrayLike, Dtype, Ordered, Scalar from pandas.compat.numpy import function as nv from pandas.util._decorators import ( Appender, @@ -39,24 +39,28 @@ ) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries -from pandas.core.dtypes.inference import is_hashable +from pandas.core.dtypes.inference import is_array_like, is_hashable from pandas.core.dtypes.missing import isna, notna from pandas.core import ops from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms from pandas.core.algorithms import _get_data_algo, factorize, take, take_1d, unique1d +from pandas.core.arrays.base import ( + ExtensionArray, + _extension_array_shared_docs, + try_cast_to_ea, +) from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array +from pandas.core.indexers import check_bool_array_indexer from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort from pandas.io.formats import console -from .base import ExtensionArray, _extension_array_shared_docs, try_cast_to_ea - def _cat_compare_op(op): opname = f"__{op.__name__}__" @@ -232,7 +236,7 @@ class Categorical(ExtensionArray, PandasObject): `categories` attribute (which in turn is the `categories` argument, if provided). dtype : CategoricalDtype - An instance of ``CategoricalDtype`` to use for this categorical + An instance of ``CategoricalDtype`` to use for this categorical. .. versionadded:: 0.21.0 @@ -272,7 +276,7 @@ class Categorical(ExtensionArray, PandasObject): Notes ----- See the `user guide - `_ + `_ for more. 
Examples @@ -302,7 +306,7 @@ class Categorical(ExtensionArray, PandasObject): __array_priority__ = 1000 _dtype = CategoricalDtype(ordered=False) # tolist is not actually deprecated, just suppressed in the __dir__ - _deprecations = PandasObject._deprecations | frozenset(["tolist", "itemsize"]) + _deprecations = PandasObject._deprecations | frozenset(["tolist"]) _typ = "categorical" def __init__( @@ -511,7 +515,7 @@ def itemsize(self) -> int: """ return self.categories.itemsize - def tolist(self) -> list: + def tolist(self) -> List[Scalar]: """ Return a list of the values. @@ -1260,7 +1264,7 @@ def shift(self, periods, fill_value=None): return self.from_codes(codes, dtype=self.dtype) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ The numpy array interface. @@ -1873,7 +1877,7 @@ def __iter__(self): """ return iter(self._internal_get_values().tolist()) - def __contains__(self, key): + def __contains__(self, key) -> bool: """ Returns True if `key` is in this Categorical. """ @@ -1883,7 +1887,7 @@ def __contains__(self, key): return contains(self, key, container=self._codes) - def _tidy_repr(self, max_vals=10, footer=True): + def _tidy_repr(self, max_vals=10, footer=True) -> str: """ a short repr displaying only max_vals and an optional (but default footer) """ @@ -1920,7 +1924,7 @@ def _repr_categories(self): category_strs = [x.strip() for x in category_strs] return category_strs - def _repr_categories_info(self): + def _repr_categories_info(self) -> str: """ Returns a string representation of the footer. """ @@ -1950,11 +1954,11 @@ def _repr_categories_info(self): # replace to simple save space by return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]" - def _repr_footer(self): + def _repr_footer(self) -> str: info = self._repr_categories_info() return f"Length: {len(self)}\n{info}" - def _get_repr(self, length=True, na_rep="NaN", footer=True): + def _get_repr(self, length=True, na_rep="NaN", footer=True) -> str: from pandas.io.formats import format as fmt formatter = fmt.CategoricalFormatter( @@ -1996,10 +2000,17 @@ def __getitem__(self, key): return np.nan else: return self.categories[i] - else: - return self._constructor( - values=self._codes[key], dtype=self.dtype, fastpath=True - ) + + if is_list_like(key) and not is_array_like(key): + key = np.asarray(key) + + if com.is_bool_indexer(key): + key = check_bool_array_indexer(self, key) + + result = self._codes[key] + if result.ndim > 1: + return result + return self._constructor(result, dtype=self.dtype, fastpath=True) def __setitem__(self, key, value): """ @@ -2067,7 +2078,7 @@ def __setitem__(self, key, value): lindexer = self._maybe_coerce_indexer(lindexer) self._codes[key] = lindexer - def _reverse_indexer(self): + def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ Compute the inverse of a categorical, returning a dict of categories -> indexers. 
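For context on the mapping that `_reverse_indexer` is annotated to return above, the same categories -> positions dictionary can be built with public APIs; a small sketch with made-up data:

import numpy as np
import pandas as pd

cat = pd.Categorical(list("aabca"))

# One entry per category, mapping the category label to the integer positions
# at which it occurs (the Dict[Hashable, np.ndarray] shape named in the annotation).
mapping = {
    category: np.flatnonzero(np.asarray(cat.codes) == code)
    for code, category in enumerate(cat.categories)
}
# mapping -> {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}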
@@ -2097,8 +2108,8 @@ def _reverse_indexer(self): self.codes.astype("int64"), categories.size ) counts = counts.cumsum() - result = (r[start:end] for start, end in zip(counts, counts[1:])) - result = dict(zip(categories, result)) + _result = (r[start:end] for start, end in zip(counts, counts[1:])) + result = dict(zip(categories, _result)) return result # reduction ops # @@ -2393,8 +2404,8 @@ def isin(self, values): if not is_list_like(values): values_type = type(values).__name__ raise TypeError( - "only list-like objects are allowed to be passed" - f" to isin(), you passed a [{values_type}]" + "only list-like objects are allowed to be passed " + f"to isin(), you passed a [{values_type}]" ) values = sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 045e511e32586..70637026c278d 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -11,6 +11,7 @@ from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds from pandas._libs.tslibs.timestamps import RoundTo, round_nsint64 from pandas._typing import DatetimeLikeScalar +from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, NullFrequencyError, PerformanceWarning from pandas.util._decorators import Appender, Substitution @@ -27,27 +28,101 @@ is_integer_dtype, is_list_like, is_object_dtype, - is_offsetlike, is_period_dtype, is_string_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype, ) -from pandas.core.dtypes.generic import ABCIndexClass, ABCPeriodArray, ABCSeries +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna -from pandas.core import missing, nanops +from pandas.core import missing, nanops, ops from pandas.core.algorithms import checked_add_with_arr, take, unique1d, value_counts +from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin import pandas.core.common as com +from pandas.core.indexers import check_bool_array_indexer from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.core.ops.invalid import make_invalid_op +from pandas.core.ops.invalid import invalid_comparison, make_invalid_op from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick -from .base import ExtensionArray, ExtensionOpsMixin + +def _datetimelike_array_cmp(cls, op): + """ + Wrap comparison operations to convert Timestamp/Timedelta/Period-like to + boxed scalars/arrays. 
+ """ + opname = f"__{op.__name__}__" + nat_result = opname == "__ne__" + + @unpack_zerodim_and_defer(opname) + def wrapper(self, other): + + if isinstance(other, str): + try: + # GH#18435 strings get a pass from tzawareness compat + other = self._scalar_from_string(other) + except ValueError: + # failed to parse as Timestamp/Timedelta/Period + return invalid_comparison(self, other, op) + + if isinstance(other, self._recognized_scalars) or other is NaT: + other = self._scalar_type(other) + self._check_compatible_with(other) + + other_i8 = self._unbox_scalar(other) + + result = op(self.view("i8"), other_i8) + if isna(other): + result.fill(nat_result) + + elif not is_list_like(other): + return invalid_comparison(self, other, op) + + elif len(other) != len(self): + raise ValueError("Lengths must match") + + else: + if isinstance(other, list): + # TODO: could use pd.Index to do inference? + other = np.array(other) + + if not isinstance(other, (np.ndarray, type(self))): + return invalid_comparison(self, other, op) + + if is_object_dtype(other): + # We have to use comp_method_OBJECT_ARRAY instead of numpy + # comparison otherwise it would fail to raise when + # comparing tz-aware and tz-naive + with np.errstate(all="ignore"): + result = ops.comp_method_OBJECT_ARRAY( + op, self.astype(object), other + ) + o_mask = isna(other) + + elif not type(self)._is_recognized_dtype(other.dtype): + return invalid_comparison(self, other, op) + + else: + # For PeriodDType this casting is unnecessary + other = type(self)._from_sequence(other) + self._check_compatible_with(other) + + result = op(self.view("i8"), other.view("i8")) + o_mask = other._isnan + + if o_mask.any(): + result[o_mask] = nat_result + + if self._hasnans: + result[self._isnan] = nat_result + + return result + + return set_function_name(wrapper, opname, cls) class AttributesMixin: @@ -109,7 +184,7 @@ def _unbox_scalar(self, value: Union[Period, Timestamp, Timedelta, NaTType]) -> raise AbstractMethodError(self) def _check_compatible_with( - self, other: Union[Period, Timestamp, Timedelta, NaTType] + self, other: Union[Period, Timestamp, Timedelta, NaTType], setitem: bool = False ) -> None: """ Verify that `self` and `other` are compatible. @@ -123,6 +198,9 @@ def _check_compatible_with( Parameters ---------- other + setitem : bool, default False + For __setitem__ we may have stricter compatiblity resrictions than + for comparisons. 
Raises ------ @@ -289,16 +367,19 @@ class TimelikeOps: def _round(self, freq, mode, ambiguous, nonexistent): # round the local times - values = _ensure_datetimelike_to_i8(self) + if is_datetime64tz_dtype(self): + # operate on naive timestamps, then convert back to aware + naive = self.tz_localize(None) + result = naive._round(freq, mode, ambiguous, nonexistent) + aware = result.tz_localize( + self.tz, ambiguous=ambiguous, nonexistent=nonexistent + ) + return aware + + values = self.view("i8") result = round_nsint64(values, mode, freq) result = self._maybe_mask_results(result, fill_value=NaT) - - dtype = self.dtype - if is_datetime64tz_dtype(self): - dtype = None - return self._ensure_localized( - self._simple_new(result, dtype=dtype), ambiguous, nonexistent - ) + return self._simple_new(result, dtype=self.dtype) @Appender((_round_doc + _round_example).format(op="round")) def round(self, freq, ambiguous="raise", nonexistent="raise"): @@ -325,6 +406,24 @@ class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin, ExtensionArray) _generate_range """ + @property + def ndim(self) -> int: + return self._data.ndim + + @property + def shape(self): + return self._data.shape + + def reshape(self, *args, **kwargs): + # Note: we drop any freq + data = self._data.reshape(*args, **kwargs) + return type(self)(data, dtype=self.dtype) + + def ravel(self, *args, **kwargs): + # Note: we drop any freq + data = self._data.ravel(*args, **kwargs) + return type(self)(data, dtype=self.dtype) + @property def _box_func(self): """ @@ -382,7 +481,7 @@ def _formatter(self, boxed=False): def nbytes(self): return self._data.nbytes - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): return np.array(list(self), dtype=object) @@ -413,10 +512,13 @@ def __getitem__(self, key): getitem = self._data.__getitem__ if is_int: val = getitem(key) - return self._box_func(val) + if lib.is_scalar(val): + # i.e. self.ndim == 1 + return self._box_func(val) + return type(self)(val, dtype=self.dtype) if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + key = check_bool_array_indexer(self, key) if key.all(): key = slice(0, None, None) else: @@ -441,8 +543,6 @@ def __getitem__(self, key): if result.ndim > 1: # To support MPL which performs slicing with 2 dim # even though it only has 1 dim by definition - if is_period: - return self._simple_new(result, dtype=self.dtype, freq=freq) return result return self._simple_new(result, dtype=self.dtype, freq=freq) @@ -479,10 +579,10 @@ def __setitem__( return value = type(self)._from_sequence(value, dtype=self.dtype) - self._check_compatible_with(value) + self._check_compatible_with(value, setitem=True) value = value.asi8 elif isinstance(value, self._scalar_type): - self._check_compatible_with(value) + self._check_compatible_with(value, setitem=True) value = self._unbox_scalar(value) elif is_valid_nat_for_dtype(value, self.dtype): value = iNaT @@ -567,7 +667,17 @@ def _validate_fill_value(self, fill_value): ------ ValueError """ - raise AbstractMethodError(self) + if isna(fill_value): + fill_value = iNaT + elif isinstance(fill_value, self._recognized_scalars): + self._check_compatible_with(fill_value) + fill_value = self._scalar_type(fill_value) + fill_value = self._unbox_scalar(fill_value) + else: + raise ValueError( + f"'fill_value' should be a {self._scalar_type}. Got '{fill_value}'." 
+ ) + return fill_value def take(self, indices, allow_fill=False, fill_value=None): if allow_fill: @@ -633,17 +743,36 @@ def searchsorted(self, value, side="left", sorter=None): Array of insertion points with the same shape as `value`. """ if isinstance(value, str): - value = self._scalar_from_string(value) + try: + value = self._scalar_from_string(value) + except ValueError: + raise TypeError("searchsorted requires compatible dtype or scalar") + + elif is_valid_nat_for_dtype(value, self.dtype): + value = NaT - if not (isinstance(value, (self._scalar_type, type(self))) or isna(value)): - raise ValueError(f"Unexpected type for 'value': {type(value)}") + elif isinstance(value, self._recognized_scalars): + value = self._scalar_type(value) + + elif isinstance(value, np.ndarray): + if not type(self)._is_recognized_dtype(value): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + value = type(self)(value) + self._check_compatible_with(value) + + if not (isinstance(value, (self._scalar_type, type(self))) or (value is NaT)): + raise TypeError(f"Unexpected type for 'value': {type(value)}") - self._check_compatible_with(value) if isinstance(value, type(self)): + self._check_compatible_with(value) value = value.asi8 else: value = self._unbox_scalar(value) + # TODO: Use datetime64 semantics for sorting, xref GH#29844 return self.asi8.searchsorted(value, side=side, sorter=sorter) def repeat(self, repeats, *args, **kwargs): @@ -810,7 +939,7 @@ def freq(self, value): @property def freqstr(self): """ - Return the frequency object as a string if its set, otherwise None + Return the frequency object as a string if its set, otherwise None. """ if self.freq is None: return None @@ -823,6 +952,8 @@ def inferred_freq(self): generated by infer_freq. Returns None if it can't autodetect the frequency. 
""" + if self.ndim != 1: + return None try: return frequencies.infer_freq(self) except ValueError: @@ -898,6 +1029,7 @@ def _is_unique(self): # ------------------------------------------------------------------ # Arithmetic Methods + _create_comparison_method = classmethod(_datetimelike_array_cmp) # pow is invalid for all three subclasses; TimedeltaArray will override # the multiplication and division ops @@ -915,10 +1047,8 @@ def _is_unique(self): __rdivmod__ = make_invalid_op("__rdivmod__") def _add_datetimelike_scalar(self, other): - # Overriden by TimedeltaArray - raise TypeError( - f"cannot add {type(self).__name__} and " f"{type(other).__name__}" - ) + # Overridden by TimedeltaArray + raise TypeError(f"cannot add {type(self).__name__} and {type(other).__name__}") _add_datetime_arraylike = _add_datetimelike_scalar @@ -930,7 +1060,7 @@ def _sub_datetimelike_scalar(self, other): _sub_datetime_arraylike = _sub_datetimelike_scalar def _sub_period(self, other): - # Overriden by PeriodArray + # Overridden by PeriodArray raise TypeError(f"cannot subtract Period from a {type(self).__name__}") def _add_offset(self, offset): @@ -970,7 +1100,7 @@ def _add_timedeltalike_scalar(self, other): """ if isna(other): # i.e np.timedelta64("NaT"), not recognized by delta_to_nanoseconds - new_values = np.empty(len(self), dtype="i8") + new_values = np.empty(self.shape, dtype="i8") new_values[:] = iNaT return new_values @@ -1016,7 +1146,7 @@ def _add_nat(self): # GH#19124 pd.NaT is treated like a timedelta for both timedelta # and datetime dtypes - result = np.zeros(len(self), dtype=np.int64) + result = np.zeros(self.shape, dtype=np.int64) result.fill(iNaT) return type(self)(result, dtype=self.dtype, freq=None) @@ -1030,7 +1160,7 @@ def _sub_nat(self): # For datetime64 dtypes by convention we treat NaT as a datetime, so # this subtraction returns a timedelta64 dtype. # For period dtype, timedelta64 is a close-enough return dtype. - result = np.zeros(len(self), dtype=np.int64) + result = np.zeros(self.shape, dtype=np.int64) result.fill(iNaT) return result.view("timedelta64[ns]") @@ -1054,8 +1184,6 @@ def _sub_period_array(self, other): f"cannot subtract {other.dtype}-dtype from {type(self).__name__}" ) - if len(self) != len(other): - raise ValueError("cannot subtract arrays/indices of unequal length") if self.freq != other.freq: msg = DIFFERENT_FREQ.format( cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr @@ -1072,47 +1200,13 @@ def _sub_period_array(self, other): new_values[mask] = NaT return new_values - def _addsub_int_array(self, other, op): - """ - Add or subtract array-like of integers equivalent to applying - `_time_shift` pointwise. 
- - Parameters - ---------- - other : Index, ExtensionArray, np.ndarray - integer-dtype - op : {operator.add, operator.sub} - - Returns - ------- - result : same class as self - """ - # _addsub_int_array is overriden by PeriodArray - assert not is_period_dtype(self) - assert op in [operator.add, operator.sub] - - if self.freq is None: - # GH#19123 - raise NullFrequencyError("Cannot shift with no freq") - - elif isinstance(self.freq, Tick): - # easy case where we can convert to timedelta64 operation - td = Timedelta(self.freq) - return op(self, td * other) - - # We should only get here with DatetimeIndex; dispatch - # to _addsub_offset_array - assert not is_timedelta64_dtype(self) - return op(self, np.array(other) * self.freq) - - def _addsub_offset_array(self, other, op): + def _addsub_object_array(self, other: np.ndarray, op): """ Add or subtract array-like of DateOffset objects Parameters ---------- - other : Index, np.ndarray - object-dtype containing pd.DateOffset objects + other : np.ndarray[object] op : {operator.add, operator.sub} Returns @@ -1136,7 +1230,12 @@ def _addsub_offset_array(self, other, op): kwargs = {} if not is_period_dtype(self): kwargs["freq"] = "infer" - return self._from_sequence(res_values, **kwargs) + try: + res = type(self)._from_sequence(res_values, **kwargs) + except ValueError: + # e.g. we've passed a Timestamp to TimedeltaArray + res = res_values + return res def _time_shift(self, periods, freq=None): """ @@ -1199,9 +1298,9 @@ def __add__(self, other): elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(other) - elif is_offsetlike(other): - # Array/Index of DateOffset objects - result = self._addsub_offset_array(other, operator.add) + elif is_object_dtype(other): + # e.g. Array/Index of DateOffset objects + result = self._addsub_object_array(other, operator.add) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] return self._add_datetime_arraylike(other) @@ -1254,9 +1353,9 @@ def __sub__(self, other): elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) - elif is_offsetlike(other): - # Array/Index of DateOffset objects - result = self._addsub_offset_array(other, operator.sub) + elif is_object_dtype(other): + # e.g. Array/Index of DateOffset objects + result = self._addsub_object_array(other, operator.sub) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] result = self._sub_datetime_arraylike(other) @@ -1313,53 +1412,23 @@ def __rsub__(self, other): return -(self - other) - # FIXME: DTA/TDA/PA inplace methods should actually be inplace, GH#24115 def __iadd__(self, other): # type: ignore - # alias for __add__ - return self.__add__(other) - - def __isub__(self, other): # type: ignore - # alias for __sub__ - return self.__sub__(other) - - # -------------------------------------------------------------- - # Comparison Methods - - def _ensure_localized( - self, arg, ambiguous="raise", nonexistent="raise", from_utc=False - ): - """ - Ensure that we are re-localized. 
+ result = self + other + self[:] = result[:] - This is for compat as we can then call this on all datetimelike - arrays generally (ignored for Period/Timedelta) - - Parameters - ---------- - arg : Union[DatetimeLikeArray, DatetimeIndexOpsMixin, ndarray] - ambiguous : str, bool, or bool-ndarray, default 'raise' - nonexistent : str, default 'raise' - from_utc : bool, default False - If True, localize the i8 ndarray to UTC first before converting to - the appropriate tz. If False, localize directly to the tz. + if not is_period_dtype(self): + # restore freq, which is invalidated by setitem + self._freq = result._freq + return self - Returns - ------- - localized array - """ + def __isub__(self, other): # type: ignore + result = self - other + self[:] = result[:] - # reconvert to local tz - tz = getattr(self, "tz", None) - if tz is not None: - if not isinstance(arg, type(self)): - arg = self._simple_new(arg) - if from_utc: - arg = arg.tz_localize("UTC").tz_convert(self.tz) - else: - arg = arg.tz_localize( - self.tz, ambiguous=ambiguous, nonexistent=nonexistent - ) - return arg + if not is_period_dtype(self): + # restore freq, which is invalidated by setitem + self._freq = result._freq + return self # -------------------------------------------------------------- # Reductions @@ -1473,6 +1542,8 @@ def mean(self, skipna=True): return self._box_func(result) +DatetimeLikeArrayMixin._add_comparison_ops() + # ------------------------------------------------------------------- # Shared Constructor Helpers @@ -1596,38 +1667,3 @@ def maybe_infer_freq(freq): freq_infer = True freq = None return freq, freq_infer - - -def _ensure_datetimelike_to_i8(other, to_utc=False): - """ - Helper for coercing an input scalar or array to i8. - - Parameters - ---------- - other : 1d array - to_utc : bool, default False - If True, convert the values to UTC before extracting the i8 values - If False, extract the i8 values directly. 
- - Returns - ------- - i8 1d array - """ - from pandas import Index - - if lib.is_scalar(other) and isna(other): - return iNaT - elif isinstance(other, (ABCPeriodArray, ABCIndexClass, DatetimeLikeArrayMixin)): - # convert tz if needed - if getattr(other, "tz", None) is not None: - if to_utc: - other = other.tz_convert("UTC") - else: - other = other.tz_localize(None) - else: - try: - return np.array(other, copy=False).view("i8") - except TypeError: - # period array cannot be coerced to int - other = Index(other) - return other.asi8 diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 10669b09cefec..1988b2e9e33f2 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -18,14 +18,13 @@ timezones, tzconversion, ) -import pandas.compat as compat from pandas.errors import PerformanceWarning -from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( _INT64_DTYPE, _NS_DTYPE, is_categorical_dtype, + is_datetime64_any_dtype, is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, @@ -42,13 +41,10 @@ from pandas.core.dtypes.generic import ABCIndexClass, ABCPandasArray, ABCSeries from pandas.core.dtypes.missing import isna -from pandas.core import ops from pandas.core.algorithms import checked_add_with_arr from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com -from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import get_period_alias, to_offset from pandas.tseries.offsets import Day, Tick @@ -74,22 +70,6 @@ def tz_to_dtype(tz): return DatetimeTZDtype(tz=tz) -def _to_M8(key, tz=None): - """ - Timestamp-like => dt64 - """ - if not isinstance(key, Timestamp): - # this also converts strings - key = Timestamp(key) - if key.tzinfo is not None and tz is not None: - # Don't tz_localize(None) if key is already tz-aware - key = key.tz_convert(tz) - else: - key = key.tz_localize(tz) - - return np.int64(conversion.pydt_to_i8(key)).view(_NS_DTYPE) - - def _field_accessor(name, field, docstring=None): def f(self): values = self.asi8 @@ -130,87 +110,6 @@ def f(self): return property(f) -def _dt_array_cmp(cls, op): - """ - Wrap comparison operations to convert datetime-like to datetime64 - """ - opname = f"__{op.__name__}__" - nat_result = opname == "__ne__" - - @unpack_zerodim_and_defer(opname) - def wrapper(self, other): - - if isinstance(other, (datetime, np.datetime64, str)): - if isinstance(other, (datetime, np.datetime64)): - # GH#18435 strings get a pass from tzawareness compat - self._assert_tzawareness_compat(other) - - try: - other = _to_M8(other, tz=self.tz) - except ValueError: - # string that cannot be parsed to Timestamp - return invalid_comparison(self, other, op) - - result = op(self.asi8, other.view("i8")) - if isna(other): - result.fill(nat_result) - elif lib.is_scalar(other) or np.ndim(other) == 0: - return invalid_comparison(self, other, op) - elif len(other) != len(self): - raise ValueError("Lengths must match") - else: - if isinstance(other, list): - try: - other = type(self)._from_sequence(other) - except ValueError: - other = np.array(other, dtype=np.object_) - elif not isinstance( - other, (np.ndarray, ABCIndexClass, ABCSeries, DatetimeArray) - ): - # Following Timestamp convention, __eq__ is all-False - # and __ne__ is all True, others raise TypeError. 
- return invalid_comparison(self, other, op) - - if is_object_dtype(other): - # We have to use comp_method_OBJECT_ARRAY instead of numpy - # comparison otherwise it would fail to raise when - # comparing tz-aware and tz-naive - with np.errstate(all="ignore"): - result = ops.comp_method_OBJECT_ARRAY( - op, self.astype(object), other - ) - o_mask = isna(other) - elif not (is_datetime64_dtype(other) or is_datetime64tz_dtype(other)): - # e.g. is_timedelta64_dtype(other) - return invalid_comparison(self, other, op) - else: - self._assert_tzawareness_compat(other) - if isinstance(other, (ABCIndexClass, ABCSeries)): - other = other.array - - if ( - is_datetime64_dtype(other) - and not is_datetime64_ns_dtype(other) - or not hasattr(other, "asi8") - ): - # e.g. other.dtype == 'datetime64[s]' - # or an object-dtype ndarray - other = type(self)._from_sequence(other) - - result = op(self.view("i8"), other.view("i8")) - o_mask = other._isnan - - if o_mask.any(): - result[o_mask] = nat_result - - if self._hasnans: - result[self._isnan] = nat_result - - return result - - return compat.set_function_name(wrapper, opname, cls) - - class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps): """ Pandas ExtensionArray for tz-naive or tz-aware datetime data. @@ -230,12 +129,12 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps The datetime data. For DatetimeArray `values` (or a Series or Index boxing one), - `dtype` and `freq` will be extracted from `values`, with - precedence given to + `dtype` and `freq` will be extracted from `values`. dtype : numpy.dtype or DatetimeTZDtype Note that the only NumPy dtype allowed is 'datetime64[ns]'. freq : str or Offset, optional + The frequency. copy : bool, default False Whether to copy the underlying array of values. @@ -250,6 +149,8 @@ class DatetimeArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps _typ = "datetimearray" _scalar_type = Timestamp + _recognized_scalars = (datetime, np.datetime64) + _is_recognized_dtype = is_datetime64_any_dtype # define my properties & methods for delegation _bool_ops = [ @@ -327,19 +228,17 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): raise TypeError(msg) elif values.tz: dtype = values.dtype - # freq = validate_values_freq(values, freq) + if freq is None: freq = values.freq values = values._data if not isinstance(values, np.ndarray): - msg = ( + raise ValueError( f"Unexpected type '{type(values).__name__}'. 'values' must be " - "a DatetimeArray ndarray, or Series or Index containing one of" - " those." + "a DatetimeArray ndarray, or Series or Index containing one of those." ) - raise ValueError(msg) - if values.ndim != 1: + if values.ndim not in [1, 2]: raise ValueError("Only 1-dimensional input arrays are supported.") if values.dtype == "i8": @@ -349,20 +248,18 @@ def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): values = values.view(_NS_DTYPE) if values.dtype != _NS_DTYPE: - msg = ( - "The dtype of 'values' is incorrect. Must be 'datetime64[ns]'." - f" Got {values.dtype} instead." + raise ValueError( + "The dtype of 'values' is incorrect. Must be 'datetime64[ns]'. " + f"Got {values.dtype} instead." ) - raise ValueError(msg) dtype = _validate_dt64_dtype(dtype) if freq == "infer": - msg = ( + raise ValueError( "Frequency inference not allowed in DatetimeArray.__init__. " "Use 'pd.array()' instead." 
) - raise ValueError(msg) if copy: values = values.copy() @@ -556,11 +453,14 @@ def _unbox_scalar(self, value): def _scalar_from_string(self, value): return Timestamp(value, tz=self.tz) - def _check_compatible_with(self, other): + def _check_compatible_with(self, other, setitem: bool = False): if other is NaT: return - if not timezones.tz_compare(self.tz, other.tz): - raise ValueError(f"Timezones don't match. '{self.tz} != {other.tz}'") + self._assert_tzawareness_compat(other) + if setitem: + # Stricter check for setitem vs comparison methods + if not timezones.tz_compare(self.tz, other.tz): + raise ValueError(f"Timezones don't match. '{self.tz} != {other.tz}'") def _maybe_clear_freq(self): self._freq = None @@ -643,7 +543,7 @@ def _resolution(self): # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: if dtype is None and self.tz: # The default for tz-aware is object, to preserve tz info dtype = object @@ -700,20 +600,6 @@ def astype(self, dtype, copy=True): return self.to_period(freq=dtype.freq) return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy) - # ---------------------------------------------------------------- - # ExtensionArray Interface - - @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) - def _validate_fill_value(self, fill_value): - if isna(fill_value): - fill_value = iNaT - elif isinstance(fill_value, (datetime, np.datetime64)): - self._assert_tzawareness_compat(fill_value) - fill_value = Timestamp(fill_value).value - else: - raise ValueError(f"'fill_value' should be a Timestamp. Got '{fill_value}'.") - return fill_value - # ----------------------------------------------------------------- # Rendering Methods @@ -729,8 +615,6 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): # ----------------------------------------------------------------- # Comparison Methods - _create_comparison_method = classmethod(_dt_array_cmp) - def _has_same_tz(self, other): zzone = self._timezone @@ -788,6 +672,9 @@ def _sub_datetime_arraylike(self, other): return new_values.view("timedelta64[ns]") def _add_offset(self, offset): + if self.ndim == 2: + return self.ravel()._add_offset(offset).reshape(self.shape) + assert not isinstance(offset, Tick) try: if self.tz is not None: @@ -802,8 +689,8 @@ def _add_offset(self, offset): PerformanceWarning, ) result = self.astype("O") + offset - if len(self) == 0: - # _from_sequence won't be able to infer self.tz + if not len(self): + # GH#30336 _from_sequence won't be able to infer self.tz return type(self)._from_sequence(result).tz_localize(self.tz) return type(self)._from_sequence(result, freq="infer") @@ -1779,9 +1666,6 @@ def to_julian_date(self): ) -DatetimeArray._add_comparison_ops() - - # ------------------------------------------------------------------- # Constructor Helpers diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 3f5a4ca49702f..67036761bc62a 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,10 +1,10 @@ import numbers -from typing import Type +from typing import Any, Tuple, Type import warnings import numpy as np -from pandas._libs import lib +from pandas._libs import lib, missing as libmissing from pandas.compat import set_function_name from pandas.util._decorators import cache_readonly @@ -21,15 +21,15 @@ is_scalar, ) from pandas.core.dtypes.dtypes import register_extension_dtype -from 
pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import isna from pandas.core import nanops, ops -from pandas.core.algorithms import take -from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric +from .masked import BaseMaskedArray + class _IntegerDtype(ExtensionDtype): """ @@ -44,7 +44,7 @@ class _IntegerDtype(ExtensionDtype): name: str base = None type: Type - na_value = np.nan + na_value = libmissing.NA def __repr__(self) -> str: sign = "U" if self.is_unsigned_integer else "" @@ -90,6 +90,7 @@ def construct_array_type(cls): def __from_arrow__(self, array): """Construct IntegerArray from passed pyarrow Array/ChunkedArray""" import pyarrow + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask if isinstance(array, pyarrow.Array): chunks = [array] @@ -99,18 +100,7 @@ def __from_arrow__(self, array): results = [] for arr in chunks: - buflist = arr.buffers() - data = np.frombuffer(buflist[1], dtype=self.type)[ - arr.offset : arr.offset + len(arr) - ] - bitmask = buflist[0] - if bitmask is not None: - mask = pyarrow.BooleanArray.from_buffers( - pyarrow.bool_(), len(arr), [None, bitmask] - ) - mask = np.asarray(mask) - else: - mask = np.ones(len(arr), dtype=bool) + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type) int_arr = IntegerArray(data.copy(), ~mask, copy=False) results.append(int_arr) @@ -257,12 +247,17 @@ def coerce_to_array(values, dtype, mask=None, copy=False): return values, mask -class IntegerArray(ExtensionArray, ExtensionOpsMixin): +class IntegerArray(BaseMaskedArray): """ Array of integer (optional missing) values. .. versionadded:: 0.24.0 + .. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as the missing value rather + than :attr:`numpy.nan`. + .. warning:: IntegerArray is currently experimental, and its API or internal @@ -306,22 +301,25 @@ class IntegerArray(ExtensionArray, ExtensionOpsMixin): >>> int_array = pd.array([1, None, 3], dtype=pd.Int32Dtype()) >>> int_array - [1, NaN, 3] + [1, <NA>, 3] Length: 3, dtype: Int32 String aliases for the dtypes are also available. They are capitalized.
>>> pd.array([1, None, 3], dtype='Int32') - [1, NaN, 3] + [1, , 3] Length: 3, dtype: Int32 >>> pd.array([1, None, 3], dtype='UInt16') - [1, NaN, 3] + [1, , 3] Length: 3, dtype: UInt16 """ + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = 1 + @cache_readonly def dtype(self): return _dtypes[str(self._data.dtype)] @@ -358,48 +356,6 @@ def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): def _from_factorized(cls, values, original): return integer_array(values, dtype=original.dtype) - def _formatter(self, boxed=False): - def fmt(x): - if isna(x): - return "NaN" - return str(x) - - return fmt - - def __getitem__(self, item): - if is_integer(item): - if self._mask[item]: - return self.dtype.na_value - return self._data[item] - return type(self)(self._data[item], self._mask[item]) - - def _coerce_to_ndarray(self): - """ - coerce to an ndarary of object dtype - """ - - # TODO(jreback) make this better - data = self._data.astype(object) - data[self._mask] = self._na_value - return data - - __array_priority__ = 1000 # higher than ndarray so ops dispatch to us - - def __array__(self, dtype=None): - """ - the array interface, return my values - We return an object array here to preserve our scalar values - """ - return self._coerce_to_ndarray() - - def __arrow_array__(self, type=None): - """ - Convert myself into a pyarrow Array. - """ - import pyarrow as pa - - return pa.array(self._data, mask=self._mask, type=type) - _HANDLED_TYPES = (np.ndarray, numbers.Number) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): @@ -447,40 +403,6 @@ def reconstruct(x): else: return reconstruct(result) - def __iter__(self): - for i in range(len(self)): - if self._mask[i]: - yield self.dtype.na_value - else: - yield self._data[i] - - def take(self, indexer, allow_fill=False, fill_value=None): - # we always fill with 1 internally - # to avoid upcasting - data_fill_value = 1 if isna(fill_value) else fill_value - result = take( - self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill - ) - - mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill) - - # if we are filling - # we only fill where the indexer is null - # not existing missing values - # TODO(jreback) what if we have a non-na float as a fill value? - if allow_fill and notna(fill_value): - fill_mask = np.asarray(indexer) == -1 - result[fill_mask] = fill_value - mask = mask ^ fill_mask - - return type(self)(result, mask, copy=False) - - def copy(self): - data, mask = self._data, self._mask - data = data.copy() - mask = mask.copy() - return type(self)(data, mask, copy=False) - def __setitem__(self, key, value): _is_scalar = is_scalar(value) if _is_scalar: @@ -494,26 +416,6 @@ def __setitem__(self, key, value): self._data[key] = value self._mask[key] = mask - def __len__(self) -> int: - return len(self._data) - - @property - def nbytes(self): - return self._data.nbytes + self._mask.nbytes - - def isna(self): - return self._mask - - @property - def _na_value(self): - return np.nan - - @classmethod - def _concat_same_type(cls, to_concat): - data = np.concatenate([x._data for x in to_concat]) - mask = np.concatenate([x._mask for x in to_concat]) - return cls(data, mask) - def astype(self, dtype, copy=True): """ Cast to a NumPy array or IntegerArray with 'dtype'. 
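With `IntegerArray` now deriving from `BaseMaskedArray`, indexing and iteration come from the shared base class and missing entries surface as `pd.NA` rather than `numpy.nan`. A short usage sketch, assuming the 1.0.0 behavior described in the versionchanged note above:

import pandas as pd

arr = pd.array([1, None, 3], dtype="Int32")
print(arr[1])       # <NA> -- pd.NA is the nullable-integer missing value
print(arr.isna())   # [False  True False]
print(list(arr))    # [1, <NA>, 3] -- iteration also yields pd.NA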
@@ -545,8 +447,14 @@ def astype(self, dtype, copy=True): return type(self)(result, mask=self._mask, copy=False) # coerce - data = self._coerce_to_ndarray() - return astype_nansafe(data, dtype, copy=None) + if is_float_dtype(dtype): + # In astype, we consider dtype=float to also mean na_value=np.nan + kwargs = dict(na_value=np.nan) + else: + kwargs = {} + + data = self.to_numpy(dtype=dtype, **kwargs) + return astype_nansafe(data, dtype, copy=False) @property def _ndarray_values(self) -> np.ndarray: @@ -559,52 +467,10 @@ def _ndarray_values(self) -> np.ndarray: """ return self._data - def value_counts(self, dropna=True): - """ - Returns a Series containing counts of each category. - - Every category will have an entry, even those with a count of 0. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of NaN. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - - """ - - from pandas import Index, Series - - # compute counts on the data with no nans - data = self._data[~self._mask] - value_counts = Index(data).value_counts() - array = value_counts.values - - # TODO(extension) - # if we have allow Index to hold an ExtensionArray - # this is easier - index = value_counts.index.astype(object) - - # if we want nans, count the mask - if not dropna: - - # TODO(extension) - # appending to an Index *always* infers - # w/o passing the dtype - array = np.append(array, [self._mask.sum()]) - index = Index( - np.concatenate([index.values, np.array([np.nan], dtype=object)]), - dtype=object, - ) - - return Series(array, index=index) + def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: + # TODO: https://github.com/pandas-dev/pandas/issues/30037 + # use masked algorithms, rather than object-dtype / np.nan. + return self.to_numpy(na_value=np.nan), np.nan def _values_for_argsort(self) -> np.ndarray: """Return values for sorting. @@ -629,9 +495,11 @@ def _create_comparison_method(cls, op): @unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): + from pandas.arrays import BooleanArray + mask = None - if isinstance(other, IntegerArray): + if isinstance(other, (BooleanArray, IntegerArray)): other, mask = other._data, other._mask elif is_list_like(other): @@ -643,25 +511,35 @@ def cmp_method(self, other): if len(self) != len(other): raise ValueError("Lengths must match to compare") - # numpy will show a DeprecationWarning on invalid elementwise - # comparisons, this will raise in the future - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "elementwise", FutureWarning) - with np.errstate(all="ignore"): - method = getattr(self._data, f"__{op_name}__") - result = method(other) + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + # This may be fixed by NA.__array_ufunc__. Revisit this check + # once that's implemented. + result = np.zeros(self._data.shape, dtype="bool") + mask = np.ones(self._data.shape, dtype="bool") + else: + with warnings.catch_warnings(): + # numpy may show a FutureWarning: + # elementwise comparison failed; returning scalar instead, + # but in the future will perform elementwise comparison + # before returning NotImplemented. We fall back to the correct + # behavior today, so that should be fine to ignore. 
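The `astype` rewrite above delegates to `to_numpy`, so casting a nullable integer array with missing values to a float dtype fills the holes with `numpy.nan`, while casting to a plain integer dtype with missing values is expected to fail. A hedged sketch of that behavior:

import numpy as np
import pandas as pd

arr = pd.array([1, None, 3], dtype="Int64")

print(arr.astype("float64"))   # float ndarray; NA becomes np.nan for float targets

try:
    arr.astype("int64")        # no way to represent NA in an int64 ndarray
except ValueError as err:
    print(err)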
+ warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + method = getattr(self._data, f"__{op_name}__") + result = method(other) if result is NotImplemented: result = invalid_comparison(self._data, other, op) # nans propagate if mask is None: - mask = self._mask + mask = self._mask.copy() else: mask = self._mask | mask - result[mask] = op_name == "ne" - return result + return BooleanArray(result, mask) name = f"__{op.__name__}__" return set_function_name(cmp_method, name, cls) @@ -671,20 +549,23 @@ def _reduce(self, name, skipna=True, **kwargs): mask = self._mask # coerce to a nan-aware float if needed - if mask.any(): - data = self._data.astype("float64") - data[mask] = self._na_value + # (we explicitly use NaN within reductions) + if self._hasna: + data = self.to_numpy("float64", na_value=np.nan) op = getattr(nanops, "nan" + name) result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + if np.isnan(result): + return libmissing.NA + # if we have a boolean op, don't coerce if name in ["any", "all"]: pass # if we have a preservable numeric op, # provide coercion back to an integer type if possible - elif name in ["sum", "min", "max", "prod"] and notna(result): + elif name in ["sum", "min", "max", "prod"]: int_result = int(result) if int_result == result: result = int_result @@ -739,12 +620,13 @@ def integer_arithmetic_method(self, other): raise TypeError("can only perform ops with numeric values") else: - if not (is_float(other) or is_integer(other)): + if not (is_float(other) or is_integer(other) or other is libmissing.NA): raise TypeError("can only perform ops with numeric values") - # nans propagate if omask is None: mask = self._mask.copy() + if other is libmissing.NA: + mask |= True else: mask = self._mask | omask @@ -754,20 +636,23 @@ def integer_arithmetic_method(self, other): # x ** 0 is 1. if omask is not None: mask = np.where((other == 0) & ~omask, False, mask) - else: + elif other is not libmissing.NA: mask = np.where(other == 0, False, mask) elif op_name == "rpow": # 1 ** x is 1. if omask is not None: mask = np.where((other == 1) & ~omask, False, mask) - else: + elif other is not libmissing.NA: mask = np.where(other == 1, False, mask) # x ** 0 is 1. mask = np.where((self._data == 0) & ~self._mask, False, mask) - with np.errstate(all="ignore"): - result = op(self._data, other) + if other is libmissing.NA: + result = np.ones_like(self._data) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) # divmod returns a tuple if op_name == "divmod": @@ -790,6 +675,11 @@ def integer_arithmetic_method(self, other): _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. +.. versionchanged:: 1.0.0 + + Now uses :attr:`pandas.NA` as its missing value, + rather than :attr:`numpy.nan`. 
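Taken together, the comparison, reduction, and arithmetic hunks above make `pd.NA` propagate through nullable-integer operations: comparisons now return a nullable `BooleanArray`, arithmetic with `pd.NA` masks every element, and reductions that end up missing return `pd.NA`. A rough sketch of the expected behavior:

import pandas as pd

arr = pd.array([1, None, 3], dtype="Int64")

print(arr == 1)        # BooleanArray: [True, <NA>, False]
print(arr == pd.NA)    # all-<NA> BooleanArray; NA compares as unknown
print(arr + pd.NA)     # all-<NA> Int64 array

s = pd.Series(arr)
print(s.sum())                 # 4 -- skipna=True by default
print(s.min(skipna=False))     # <NA> when the reduction hits a missing value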
+ Attributes ---------- None diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index cea059fb22be1..37d2baed2c09e 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -17,6 +17,8 @@ is_integer_dtype, is_interval, is_interval_dtype, + is_list_like, + is_object_dtype, is_scalar, is_string_dtype, is_timedelta64_dtype, @@ -37,6 +39,7 @@ from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.arrays.categorical import Categorical import pandas.core.common as com +from pandas.core.construction import array from pandas.core.indexes.base import ensure_index _VALID_CLOSED = {"left", "right", "both", "neither"} @@ -105,7 +108,7 @@ Notes ----- See the `user guide -`_ +`_ for more. %(examples)s\ @@ -497,8 +500,11 @@ def __getitem__(self, value): # scalar if not isinstance(left, ABCIndexClass): - if isna(left): + if is_scalar(left) and isna(left): return self._fill_value + if np.ndim(left) > 1: + # GH#30588 multi-dimensional indexer disallowed + raise ValueError("multi-dimensional indexing not allowed") return Interval(left, right, self.closed) return self._shallow_copy(left, right) @@ -547,6 +553,58 @@ def __setitem__(self, key, value): right.values[key] = value_right self._right = right + def __eq__(self, other): + # ensure pandas array for list-like and eliminate non-interval scalars + if is_list_like(other): + if len(self) != len(other): + raise ValueError("Lengths must match to compare") + other = array(other) + elif not isinstance(other, Interval): + # non-interval scalar -> no matches + return np.zeros(len(self), dtype=bool) + + # determine the dtype of the elements we want to compare + if isinstance(other, Interval): + other_dtype = "interval" + elif not is_categorical_dtype(other): + other_dtype = other.dtype + else: + # for categorical defer to categories for dtype + other_dtype = other.categories.dtype + + # extract intervals if we have interval categories with matching closed + if is_interval_dtype(other_dtype): + if self.closed != other.categories.closed: + return np.zeros(len(self), dtype=bool) + other = other.categories.take(other.codes) + + # interval-like -> need same closed and matching endpoints + if is_interval_dtype(other_dtype): + if self.closed != other.closed: + return np.zeros(len(self), dtype=bool) + return (self.left == other.left) & (self.right == other.right) + + # non-interval/non-object dtype -> no matches + if not is_object_dtype(other_dtype): + return np.zeros(len(self), dtype=bool) + + # object dtype -> iteratively check for intervals + result = np.zeros(len(self), dtype=bool) + for i, obj in enumerate(other): + # need object to be an Interval with same closed and endpoints + if ( + isinstance(obj, Interval) + and self.closed == obj.closed + and self.left[i] == obj.left + and self.right[i] == obj.right + ): + result[i] = True + + return result + + def __ne__(self, other): + return ~self.__eq__(other) + def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. @@ -1008,7 +1066,7 @@ def is_non_overlapping_monotonic(self): ) # Conversion - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') @@ -1026,6 +1084,59 @@ def __array__(self, dtype=None): result[i] = Interval(left[i], right[i], closed) return result + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. 
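The new `__eq__`/`__ne__` above give `IntervalArray` element-wise comparisons against interval scalars, other interval-likes (matching `closed` required), and object sequences. A small sketch of the intended semantics, assuming that element-wise behavior:

import pandas as pd

arr = pd.arrays.IntervalArray.from_breaks([0, 1, 2, 3])   # (0, 1], (1, 2], (2, 3]

print(arr == pd.Interval(0, 1, closed="right"))   # [ True False False]
print(arr == pd.Interval(0, 1, closed="left"))    # all False -- 'closed' must match
print(arr != arr)                                 # [False False False]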
+ """ + import pyarrow + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + try: + subtype = pyarrow.from_numpy_dtype(self.dtype.subtype) + except TypeError: + raise TypeError( + "Conversion to arrow with subtype '{}' " + "is not supported".format(self.dtype.subtype) + ) + interval_type = ArrowIntervalType(subtype, self.closed) + storage_array = pyarrow.StructArray.from_arrays( + [ + pyarrow.array(self.left, type=subtype, from_pandas=True), + pyarrow.array(self.right, type=subtype, from_pandas=True), + ], + names=["left", "right"], + ) + mask = self.isna() + if mask.any(): + # if there are missing values, set validity bitmap also on the array level + null_bitmap = pyarrow.array(~mask).buffers()[1] + storage_array = pyarrow.StructArray.from_buffers( + storage_array.type, + len(storage_array), + [null_bitmap], + children=[storage_array.field(0), storage_array.field(1)], + ) + + if type is not None: + if type.equals(interval_type.storage_type): + return storage_array + elif isinstance(type, ArrowIntervalType): + # ensure we have the same subtype and closed attributes + if not type.equals(interval_type): + raise TypeError( + "Not supported to convert IntervalArray to type with " + "different 'subtype' ({0} vs {1}) and 'closed' ({2} vs {3}) " + "attributes".format( + self.dtype.subtype, type.subtype, self.closed, type.closed + ) + ) + else: + raise TypeError( + "Not supported to convert IntervalArray to '{0}' type".format(type) + ) + + return pyarrow.ExtensionArray.from_storage(interval_type, storage_array) + _interval_shared_docs[ "to_tuples" ] = """ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py new file mode 100644 index 0000000000000..47605413ff1a6 --- /dev/null +++ b/pandas/core/arrays/masked.py @@ -0,0 +1,250 @@ +from typing import TYPE_CHECKING + +import numpy as np + +from pandas._libs import lib, missing as libmissing + +from pandas.core.dtypes.common import is_integer, is_object_dtype, is_string_dtype +from pandas.core.dtypes.missing import isna, notna + +from pandas.core.algorithms import take +from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +import pandas.core.common as com +from pandas.core.indexers import check_bool_array_indexer + +if TYPE_CHECKING: + from pandas._typing import Scalar + + +class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): + """ + Base class for masked arrays (which use _data and _mask to store the data). + + numpy based + """ + + _data: np.ndarray + _mask: np.ndarray + + # The value used to fill '_data' to avoid upcasting + _internal_fill_value: "Scalar" + + def __getitem__(self, item): + if is_integer(item): + if self._mask[item]: + return self.dtype.na_value + return self._data[item] + + elif com.is_bool_indexer(item): + item = check_bool_array_indexer(self, item) + + return type(self)(self._data[item], self._mask[item]) + + def __iter__(self): + for i in range(len(self)): + if self._mask[i]: + yield self.dtype.na_value + else: + yield self._data[i] + + def __len__(self) -> int: + return len(self._data) + + def to_numpy( + self, dtype=None, copy=False, na_value: "Scalar" = lib.no_default, + ): + """ + Convert to a NumPy Array. + + By default converts to an object-dtype NumPy array. Specify the `dtype` and + `na_value` keywords to customize the conversion. + + Parameters + ---------- + dtype : dtype, default object + The numpy dtype to convert to. + copy : bool, default False + Whether to ensure that the returned value is a not a view on + the array. 
Note that ``copy=False`` does not *ensure* that + ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that + a copy is made, even if not strictly necessary. This is typically + only possible when no missing values are present and `dtype` + is the equivalent numpy dtype. + na_value : scalar, optional + Scalar missing value indicator to use in numpy array. Defaults + to the native missing value indicator of this array (pd.NA). + + Returns + ------- + numpy.ndarray + + Examples + -------- + An object-dtype is the default result + + >>> a = pd.array([True, False, pd.NA], dtype="boolean") + >>> a.to_numpy() + array([True, False, NA], dtype=object) + + When no missing values are present, an equivalent dtype can be used. + + >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool") + array([ True, False]) + >>> pd.array([1, 2], dtype="Int64").to_numpy("int64") + array([1, 2]) + + However, requesting such dtype will raise a ValueError if + missing values are present and the default missing value :attr:`NA` + is used. + + >>> a = pd.array([True, False, pd.NA], dtype="boolean") + >>> a + + [True, False, NA] + Length: 3, dtype: boolean + + >>> a.to_numpy(dtype="bool") + Traceback (most recent call last): + ... + ValueError: cannot convert to bool numpy array in presence of missing values + + Specify a valid `na_value` instead + + >>> a.to_numpy(dtype="bool", na_value=False) + array([ True, False, False]) + """ + if na_value is lib.no_default: + na_value = libmissing.NA + if dtype is None: + dtype = object + if self._hasna: + if ( + not (is_object_dtype(dtype) or is_string_dtype(dtype)) + and na_value is libmissing.NA + ): + raise ValueError( + f"cannot convert to '{dtype}'-dtype NumPy array " + "with missing values. Specify an appropriate 'na_value' " + "for this dtype." + ) + # don't pass copy to astype -> always need a copy since we are mutating + data = self._data.astype(dtype) + data[self._mask] = na_value + else: + data = self._data.astype(dtype, copy=copy) + return data + + __array_priority__ = 1000 # higher than ndarray so ops dispatch to us + + def __array__(self, dtype=None) -> np.ndarray: + """ + the array interface, return my values + We return an object array here to preserve our scalar values + """ + return self.to_numpy(dtype=dtype) + + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow as pa + + return pa.array(self._data, mask=self._mask, type=type) + + @property + def _hasna(self) -> bool: + # Note: this is expensive right now! The hope is that we can + # make this faster by having an optional mask, but not have to change + # source code using it.. 
+ return self._mask.any() + + def isna(self): + return self._mask + + @property + def _na_value(self): + return self.dtype.na_value + + @property + def nbytes(self): + return self._data.nbytes + self._mask.nbytes + + @classmethod + def _concat_same_type(cls, to_concat): + data = np.concatenate([x._data for x in to_concat]) + mask = np.concatenate([x._mask for x in to_concat]) + return cls(data, mask) + + def take(self, indexer, allow_fill=False, fill_value=None): + # we always fill with 1 internally + # to avoid upcasting + data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value + result = take( + self._data, indexer, fill_value=data_fill_value, allow_fill=allow_fill + ) + + mask = take(self._mask, indexer, fill_value=True, allow_fill=allow_fill) + + # if we are filling + # we only fill where the indexer is null + # not existing missing values + # TODO(jreback) what if we have a non-na float as a fill value? + if allow_fill and notna(fill_value): + fill_mask = np.asarray(indexer) == -1 + result[fill_mask] = fill_value + mask = mask ^ fill_mask + + return type(self)(result, mask, copy=False) + + def copy(self): + data, mask = self._data, self._mask + data = data.copy() + mask = mask.copy() + return type(self)(data, mask, copy=False) + + def value_counts(self, dropna=True): + """ + Returns a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import Index, Series + from pandas.arrays import IntegerArray + + # compute counts on the data with no nans + data = self._data[~self._mask] + value_counts = Index(data).value_counts() + + # TODO(extension) + # if we have allow Index to hold an ExtensionArray + # this is easier + index = value_counts.index.values.astype(object) + + # if we want nans, count the mask + if dropna: + counts = value_counts.values + else: + counts = np.empty(len(value_counts) + 1, dtype="int64") + counts[:-1] = value_counts + counts[-1] = self._mask.sum() + + index = Index( + np.concatenate([index, np.array([self.dtype.na_value], dtype=object)]), + dtype=object, + ) + + mask = np.zeros(len(counts), dtype="bool") + counts = IntegerArray(counts, mask) + + return Series(counts, index=index) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index deec30dfe34ff..4db3d3010adaf 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -17,11 +17,12 @@ from pandas import compat from pandas.core import nanops from pandas.core.algorithms import searchsorted, take, unique +from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin +import pandas.core.common as com from pandas.core.construction import extract_array +from pandas.core.indexers import check_bool_array_indexer from pandas.core.missing import backfill_1d, pad_1d -from .base import ExtensionArray, ExtensionOpsMixin - class PandasDtype(ExtensionDtype): """ @@ -181,7 +182,7 @@ def dtype(self): # ------------------------------------------------------------------------ # NumPy Array Interface - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: return np.asarray(self._ndarray, dtype=dtype) _HANDLED_TYPES = (np.ndarray, numbers.Number) @@ -234,6 +235,9 @@ def __getitem__(self, item): if isinstance(item, type(self)): item = item._ndarray + elif com.is_bool_indexer(item): + item = check_bool_array_indexer(self, item) + result = 
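`BaseMaskedArray.value_counts` above now returns nullable `Int64` counts and, with `dropna=False`, appends a bucket keyed by the array's NA value. A usage sketch under those assumptions:

import pandas as pd

arr = pd.array([1, 1, None, 3], dtype="Int64")

print(arr.value_counts())               # counts for 1 and 3, Int64 dtype
print(arr.value_counts(dropna=False))   # adds an <NA> row counting the masked entries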
self._ndarray[item] if not lib.is_scalar(item): result = type(self)(result) @@ -416,27 +420,15 @@ def skew(self, axis=None, dtype=None, out=None, keepdims=False, skipna=True): # ------------------------------------------------------------------------ # Additional Methods - def to_numpy(self, dtype=None, copy=False): - """ - Convert the PandasArray to a :class:`numpy.ndarray`. - - By default, this requires no coercion or copying of data. - - Parameters - ---------- - dtype : numpy.dtype - The NumPy dtype to pass to :func:`numpy.asarray`. - copy : bool, default False - Whether to copy the underlying data. - - Returns - ------- - ndarray - """ + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default): result = np.asarray(self._ndarray, dtype=dtype) - if copy and result is self._ndarray: + + if (copy or na_value is not lib.no_default) and result is self._ndarray: result = result.copy() + if na_value is not lib.no_default: + result[self.isna()] = na_value + return result @Appender(ExtensionArray.searchsorted.__doc__) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index df057ce5a0104..d9b53aa4a867c 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -20,15 +20,13 @@ period_asfreq_arr, ) from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds -import pandas.compat as compat -from pandas.util._decorators import Appender, cache_readonly +from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( _TD_DTYPE, ensure_object, is_datetime64_dtype, is_float_dtype, - is_list_like, is_period_dtype, pandas_dtype, ) @@ -44,7 +42,6 @@ import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com -from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick, _delta_to_tick @@ -61,49 +58,6 @@ def f(self): return property(f) -def _period_array_cmp(cls, op): - """ - Wrap comparison operations to convert Period-like to PeriodDtype - """ - opname = f"__{op.__name__}__" - nat_result = opname == "__ne__" - - @unpack_zerodim_and_defer(opname) - def wrapper(self, other): - ordinal_op = getattr(self.asi8, opname) - - if is_list_like(other) and len(other) != len(self): - raise ValueError("Lengths must match") - - if isinstance(other, Period): - self._check_compatible_with(other) - - result = ordinal_op(other.ordinal) - elif isinstance(other, cls): - self._check_compatible_with(other) - - result = ordinal_op(other.asi8) - - mask = self._isnan | other._isnan - if mask.any(): - result[mask] = nat_result - - return result - elif other is NaT: - result = np.empty(len(self.asi8), dtype=bool) - result.fill(nat_result) - else: - other = Period(other, freq=self.freq) - result = ordinal_op(other.ordinal) - - if self._hasnans: - result[self._isnan] = nat_result - - return result - - return compat.set_function_name(wrapper, opname, cls) - - class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): """ Pandas ExtensionArray for storing Period data. 
@@ -159,6 +113,8 @@ class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): __array_priority__ = 1000 _typ = "periodarray" # ABCPeriodArray _scalar_type = Period + _recognized_scalars = (Period,) + _is_recognized_dtype = is_period_dtype # Names others delegate to us _other_ops: List[str] = [] @@ -203,12 +159,7 @@ def __init__(self, values, freq=None, dtype=None, copy=False): if isinstance(values, type(self)): if freq is not None and freq != values.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, - own_freq=values.freq.freqstr, - other_freq=freq.freqstr, - ) - raise IncompatibleFrequency(msg) + raise raise_on_incompatible(values, freq) values, freq = values._data, values.freq values = np.array(values, dtype="int64", copy=copy) @@ -218,8 +169,9 @@ def __init__(self, values, freq=None, dtype=None, copy=False): self._dtype = PeriodDtype(freq) @classmethod - def _simple_new(cls, values, freq=None, **kwargs): + def _simple_new(cls, values: np.ndarray, freq=None, **kwargs): # alias for PeriodArray.__init__ + assert isinstance(values, np.ndarray) and values.dtype == "i8" return cls(values, freq=freq, **kwargs) @classmethod @@ -307,11 +259,11 @@ def _unbox_scalar(self, value: Union[Period, NaTType]) -> int: def _scalar_from_string(self, value: str) -> Period: return Period(value, freq=self.freq) - def _check_compatible_with(self, other): + def _check_compatible_with(self, other, setitem: bool = False): if other is NaT: return if self.freqstr != other.freqstr: - _raise_on_incompatible(self, other) + raise raise_on_incompatible(self, other) # -------------------------------------------------------------------- # Data / Attributes @@ -320,7 +272,7 @@ def _check_compatible_with(self, other): def dtype(self): return self._dtype - # read-only property overwriting read/write + # error: Read-only property cannot override read-write property [misc] @property # type: ignore def freq(self): """ @@ -328,10 +280,36 @@ def freq(self): """ return self.dtype.freq - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: # overriding DatetimelikeArray return np.array(list(self), dtype=object) + def __arrow_array__(self, type=None): + """ + Convert myself into a pyarrow Array. + """ + import pyarrow + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + if type is not None: + if pyarrow.types.is_integer(type): + return pyarrow.array(self._data, mask=self.isna(), type=type) + elif isinstance(type, ArrowPeriodType): + # ensure we have the same freq + if self.freqstr != type.freq: + raise TypeError( + "Not supported to convert PeriodArray to array with different " + f"'freq' ({self.freqstr} vs {type.freq})" + ) + else: + raise TypeError( + f"Not supported to convert PeriodArray to '{type}' type" + ) + + period_type = ArrowPeriodType(self.freqstr) + storage_array = pyarrow.array(self._data, mask=self.isna(), type="int64") + return pyarrow.ExtensionArray.from_storage(period_type, storage_array) + # -------------------------------------------------------------------- # Vectorized analogues of Period properties @@ -440,8 +418,9 @@ def to_timestamp(self, freq=None, how="start"): ---------- freq : str or DateOffset, optional Target frequency. The default is 'D' for week or longer, - 'S' otherwise + 'S' otherwise. how : {'s', 'e', 'start', 'end'} + Whether to use the start or end of the time period being converted. 
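With the `__arrow_array__` hook added above, a `PeriodArray` converts to pyarrow as an extension type carrying its frequency, with the ordinals stored as int64. A hedged sketch that assumes pyarrow is installed and supports extension types:

import pandas as pd
import pyarrow as pa

periods = pd.period_range("2020-01", periods=3, freq="M").array   # PeriodArray

pa_arr = pa.array(periods)                  # dispatches to PeriodArray.__arrow_array__
print(pa_arr.type)                          # period extension type with freq='M'
print(pa.array(periods, type=pa.int64()))   # raw ordinals when an integer type is requested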
Returns ------- @@ -476,21 +455,8 @@ def to_timestamp(self, freq=None, how="start"): # -------------------------------------------------------------------- # Array-like / EA-Interface Methods - def _formatter(self, boxed=False): - if boxed: - return str - return "'{}'".format - - @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) - def _validate_fill_value(self, fill_value): - if isna(fill_value): - fill_value = iNaT - elif isinstance(fill_value, Period): - self._check_compatible_with(fill_value) - fill_value = fill_value.ordinal - else: - raise ValueError(f"'fill_value' should be a Period. Got '{fill_value}'.") - return fill_value + def _values_for_argsort(self): + return self._data # -------------------------------------------------------------------- @@ -530,17 +496,20 @@ def asfreq(self, freq=None, how="E"): Parameters ---------- freq : str - a frequency + A frequency. how : str {'E', 'S'} - 'E', 'END', or 'FINISH' for end, - 'S', 'START', or 'BEGIN' for start. Whether the elements should be aligned to the end - or start within pa period. January 31st ('END') vs. - January 1st ('START') for example. + or start within pa period. + + * 'E', 'END', or 'FINISH' for end, + * 'S', 'START', or 'BEGIN' for start. + + January 31st ('END') vs. January 1st ('START') for example. Returns ------- - new : Period Array/Index with the new frequency + Period Array/Index + Constructed with the new frequency. Examples -------- @@ -582,6 +551,11 @@ def asfreq(self, freq=None, how="E"): # ------------------------------------------------------------------ # Rendering Methods + def _formatter(self, boxed=False): + if boxed: + return str + return "'{}'".format + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): """ actually format my specific types @@ -615,7 +589,6 @@ def astype(self, dtype, copy=True): # ------------------------------------------------------------------ # Arithmetic Methods - _create_comparison_method = classmethod(_period_array_cmp) def _sub_datelike(self, other): assert other is not NaT @@ -634,12 +607,23 @@ def _sub_period(self, other): return new_data - @Appender(dtl.DatetimeLikeArrayMixin._addsub_int_array.__doc__) def _addsub_int_array( - self, - other: Union[ABCPeriodArray, ABCSeries, ABCPeriodIndex, np.ndarray], - op: Callable[[Any], Any], - ) -> ABCPeriodArray: + self, other: np.ndarray, op: Callable[[Any, Any], Any], + ) -> "PeriodArray": + """ + Add or subtract array of integers; equivalent to applying + `_time_shift` pointwise. + + Parameters + ---------- + other : np.ndarray[integer-dtype] + op : {operator.add, operator.sub} + + Returns + ------- + result : PeriodArray + """ + assert op in [operator.add, operator.sub] if op is operator.sub: other = -other @@ -652,7 +636,7 @@ def _add_offset(self, other): assert not isinstance(other, Tick) base = libfrequencies.get_base_alias(other.rule_code) if base != self.freq.rule_code: - _raise_on_incompatible(self, other) + raise raise_on_incompatible(self, other) # Note: when calling parent class's _add_timedeltalike_scalar, # it will call delta_to_nanoseconds(delta). Because delta here @@ -720,7 +704,7 @@ def _add_delta(self, other): """ if not isinstance(self.freq, Tick): # We cannot add timedelta-like to non-tick PeriodArray - _raise_on_incompatible(self, other) + raise raise_on_incompatible(self, other) new_ordinals = super()._add_delta(other) return type(self)(new_ordinals, freq=self.freq) @@ -772,16 +756,10 @@ def _check_timedeltalike_freq_compat(self, other): # by which will be added to self. 
return delta - _raise_on_incompatible(self, other) - - def _values_for_argsort(self): - return self._data - + raise raise_on_incompatible(self, other) -PeriodArray._add_comparison_ops() - -def _raise_on_incompatible(left, right): +def raise_on_incompatible(left, right): """ Helper function to render a consistent error message when raising IncompatibleFrequency. @@ -789,14 +767,15 @@ def _raise_on_incompatible(left, right): Parameters ---------- left : PeriodArray - right : DateOffset, Period, ndarray, or timedelta-like + right : None, DateOffset, Period, ndarray, or timedelta-like - Raises - ------ + Returns + ------- IncompatibleFrequency + Exception to be raised by the caller. """ # GH#24283 error message format depends on whether right is scalar - if isinstance(right, np.ndarray): + if isinstance(right, np.ndarray) or right is None: other_freq = None elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period, DateOffset)): other_freq = right.freqstr @@ -806,7 +785,7 @@ def _raise_on_incompatible(left, right): msg = DIFFERENT_FREQ.format( cls=type(left).__name__, own_freq=left.freqstr, other_freq=other_freq ) - raise IncompatibleFrequency(msg) + return IncompatibleFrequency(msg) # ------------------------------------------------------------------- diff --git a/pandas/core/arrays/sparse/__init__.py b/pandas/core/arrays/sparse/__init__.py index 75f3819fb19fd..e928db499a771 100644 --- a/pandas/core/arrays/sparse/__init__.py +++ b/pandas/core/arrays/sparse/__init__.py @@ -1,5 +1,10 @@ # flake8: noqa: F401 -from .accessor import SparseAccessor, SparseFrameAccessor -from .array import BlockIndex, IntIndex, SparseArray, _make_index -from .dtype import SparseDtype +from pandas.core.arrays.sparse.accessor import SparseAccessor, SparseFrameAccessor +from pandas.core.arrays.sparse.array import ( + BlockIndex, + IntIndex, + SparseArray, + _make_index, +) +from pandas.core.arrays.sparse.dtype import SparseDtype diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index c207b96a8d308..92c05f44d677c 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -7,9 +7,8 @@ from pandas.core.dtypes.cast import find_common_type from pandas.core.accessor import PandasDelegate, delegate_names - -from .array import SparseArray -from .dtype import SparseDtype +from pandas.core.arrays.sparse.array import SparseArray +from pandas.core.arrays.sparse.dtype import SparseDtype class BaseAccessor: @@ -163,7 +162,7 @@ def to_dense(self): Examples -------- - >>> series = pd.Series(pd.SparseArray([0, 1, 0])) + >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0])) >>> series 0 0 1 1 @@ -216,7 +215,7 @@ def from_spmatrix(cls, data, index=None, columns=None): ------- DataFrame Each column of the DataFrame is stored as a - :class:`SparseArray`. + :class:`arrays.SparseArray`. 
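The helper above now returns the `IncompatibleFrequency` exception instead of raising it, so call sites read `raise raise_on_incompatible(left, right)`. The user-visible effect is unchanged; roughly, assuming pandas 1.0 behavior:

import pandas as pd

pi = pd.period_range("2020-01", periods=3, freq="M")

try:
    pi + pd.offsets.Day()      # offset frequency does not match the Period freq
except Exception as err:       # IncompatibleFrequency, built by raise_on_incompatible
    print(type(err).__name__, err)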
Examples -------- @@ -251,7 +250,7 @@ def to_dense(self): Examples -------- - >>> df = pd.DataFrame({"A": pd.SparseArray([0, 1, 0])}) + >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])}) >>> df.sparse.to_dense() A 0 0 diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 548f2bf702e60..e2562a375515d 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -39,6 +39,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin +from pandas.core.arrays.sparse.dtype import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.construction import sanitize_array @@ -48,8 +49,6 @@ import pandas.io.formats.printing as printing -from .dtype import SparseDtype - # ---------------------------------------------------------------------------- # Array @@ -403,7 +402,7 @@ def from_spmatrix(cls, data): -------- >>> import scipy.sparse >>> mat = scipy.sparse.coo_matrix((4, 1)) - >>> pd.SparseArray.from_spmatrix(mat) + >>> pd.arrays.SparseArray.from_spmatrix(mat) [0.0, 0.0, 0.0, 0.0] Fill: 0.0 IntIndex @@ -428,7 +427,7 @@ def from_spmatrix(cls, data): return cls._simple_new(arr, index, dtype) - def __array__(self, dtype=None, copy=True): + def __array__(self, dtype=None, copy=True) -> np.ndarray: fill_value = self.fill_value if self.sp_index.ngaps == 0: @@ -738,6 +737,9 @@ def value_counts(self, dropna=True): # -------- def __getitem__(self, key): + # avoid mypy issues when importing at the top-level + from pandas.core.indexing import check_bool_indexer + if isinstance(key, tuple): if len(key) > 1: raise IndexError("too many indices for array.") @@ -766,7 +768,9 @@ def __getitem__(self, key): else: key = np.asarray(key) - if com.is_bool_indexer(key) and len(self) == len(key): + if com.is_bool_indexer(key): + key = check_bool_indexer(self, key) + return self.take(np.arange(len(key), dtype=np.int32)[key]) elif hasattr(key, "__len__"): return self.take(key) @@ -1074,7 +1078,7 @@ def map(self, mapper): Examples -------- - >>> arr = pd.SparseArray([0, 1, 2]) + >>> arr = pd.arrays.SparseArray([0, 1, 2]) >>> arr.apply(lambda x: x + 10) [10, 11, 12] Fill: 10 diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index e3e0064c84da3..6f15681cab87e 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -64,7 +64,7 @@ class SparseDtype(ExtensionDtype): # hash(nan) is (sometimes?) 0. _metadata = ("_dtype", "_fill_value", "_is_na_fill_value") - def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None: + def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None): if isinstance(dtype, type(self)): if fill_value is None: @@ -175,7 +175,7 @@ def construct_array_type(cls): ------- type """ - from .array import SparseArray + from pandas.core.arrays.sparse.array import SparseArray return SparseArray diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index 278ad1027d489..17a953fce9ec0 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -3,8 +3,6 @@ Currently only includes to_coo helpers. 
""" -from collections import OrderedDict - from pandas.core.indexes.api import Index, MultiIndex from pandas.core.series import Series @@ -19,14 +17,14 @@ def _check_is_partition(parts, whole): def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): - """ For arbitrary (MultiIndexed) SparseSeries return + """ For arbitrary (MultiIndexed) sparse Series return (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo constructor. """ # index and column levels must be a partition of the index _check_is_partition([row_levels, column_levels], range(ss.index.nlevels)) - # from the SparseSeries: get the labels and data for non-null entries - values = ss._data.internal_values()._valid_sp_values + # from the sparse Series: get the labels and data for non-null entries + values = ss.array._valid_sp_values nonnull_labels = ss.dropna() @@ -46,14 +44,13 @@ def get_indexers(levels): # labels_to_i[:] = np.arange(labels_to_i.shape[0]) def _get_label_to_i_dict(labels, sort_labels=False): - """ Return OrderedDict of unique labels to number. + """ Return dict of unique labels to number. Optionally sort by label. """ labels = Index(map(tuple, labels)).unique().tolist() # squish if sort_labels: labels = sorted(labels) - d = OrderedDict((k, i) for i, k in enumerate(labels)) - return d + return {k: i for i, k in enumerate(labels)} def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): ilabels = list(zip(*[index._get_level_values(i) for i in subset])) @@ -88,7 +85,7 @@ def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): """ - Convert a SparseSeries to a scipy.sparse.coo_matrix using index + Convert a sparse Series to a scipy.sparse.coo_matrix using index levels row_levels, column_levels as the row and column labels respectively. Returns the sparse_matrix, row and column labels. """ diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index de254f662bb32..c485d1f50dc9d 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -47,6 +47,8 @@ class StringDtype(ExtensionDtype): StringDtype """ + name = "string" + #: StringDtype.na_value uses pandas.NA na_value = libmissing.NA @@ -54,19 +56,6 @@ class StringDtype(ExtensionDtype): def type(self) -> Type: return str - @property - def name(self) -> str: - """ - The alias for StringDtype is ``'string'``. - """ - return "string" - - @classmethod - def construct_from_string(cls, string: str) -> ExtensionDtype: - if string == "string": - return cls() - return super().construct_from_string(string) - @classmethod def construct_array_type(cls) -> "Type[StringArray]": return StringArray @@ -104,9 +93,6 @@ class StringArray(PandasArray): StringArray is considered experimental. The implementation and parts of the API may change without warning. - In particular, the NA value used may change to no longer be - ``numpy.nan``. - Parameters ---------- values : array-like @@ -115,8 +101,11 @@ class StringArray(PandasArray): .. warning:: Currently, this expects an object-dtype ndarray - where the elements are Python strings. This may - change without warning in the future. + where the elements are Python strings or :attr:`pandas.NA`. + This may change without warning in the future. Use + :meth:`pandas.array` with ``dtype="string"`` for a stable way of + creating a `StringArray` from any sequence. + copy : bool, default False Whether to copy the array of data. 
@@ -130,6 +119,8 @@ class StringArray(PandasArray): See Also -------- + array + The recommended function for creating a StringArray. Series.str The string methods are available on Series backed by a StringArray. @@ -142,7 +133,7 @@ class StringArray(PandasArray): -------- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") - ['This is', 'some text', NA, 'data.'] + ['This is', 'some text', , 'data.'] Length: 4, dtype: string Unlike ``object`` dtype arrays, ``StringArray`` doesn't allow non-string @@ -157,7 +148,7 @@ class StringArray(PandasArray): >>> pd.array(["a", None, "c"], dtype="string") == "a" - [True, NA, False] + [True, , False] Length: 3, dtype: boolean """ @@ -176,12 +167,10 @@ def __init__(self, values, copy=False): def _validate(self): """Validate that we only store NA or strings.""" if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): - raise ValueError( - "StringArray requires a sequence of strings or missing values." - ) + raise ValueError("StringArray requires a sequence of strings or pandas.NA") if self._ndarray.dtype != "object": raise ValueError( - "StringArray requires a sequence of strings. Got " + "StringArray requires a sequence of strings or pandas.NA. Got " f"'{self._ndarray.dtype}' dtype instead." ) @@ -189,12 +178,22 @@ def _validate(self): def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" - result = super()._from_sequence(scalars, dtype=object, copy=copy) + + result = np.asarray(scalars, dtype="object") + if copy and result is scalars: + result = result.copy() + # Standardize all missing-like values to NA # TODO: it would be nice to do this in _validate / lib.is_string_array # We are already doing a scan over the values there. - result[result.isna()] = StringDtype.na_value - return result + na_values = isna(result) + if na_values.any(): + if result is scalars: + # force a copy now, if we haven't already + result = result.copy() + result[na_values] = StringDtype.na_value + + return cls(result) @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): @@ -264,7 +263,7 @@ def _reduce(self, name, skipna=True, **kwargs): def value_counts(self, dropna=False): from pandas import value_counts - return value_counts(self._ndarray, dropna=dropna) + return value_counts(self._ndarray, dropna=dropna).astype("Int64") # Overrride parent because we have different return types. @classmethod diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index db4effa608582..516a271042c9b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -11,9 +11,7 @@ parse_timedelta_unit, precision_from_unit, ) -import pandas.compat as compat from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( _NS_DTYPE, @@ -21,7 +19,6 @@ is_dtype_equal, is_float_dtype, is_integer_dtype, - is_list_like, is_object_dtype, is_scalar, is_string_dtype, @@ -40,17 +37,12 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr +from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com -from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.core.ops.invalid import invalid_comparison from pandas.tseries.frequencies import to_offset from pandas.tseries.offsets import Tick -from . 
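The `StringArray` hunks above standardize all missing-like inputs to `pd.NA` at construction time and make `value_counts` return nullable `Int64` counts; comparisons yield the nullable boolean dtype as shown in the docstring. A brief sketch assuming that 1.0 behavior:

import numpy as np
import pandas as pd

arr = pd.array(["a", None, np.nan, "a"], dtype="string")
print(arr)                        # ['a', <NA>, <NA>, 'a'] -- all missing markers become pd.NA
print(arr == "a")                 # nullable boolean: [True, <NA>, <NA>, True]
print(arr.value_counts().dtype)   # Int64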
import datetimelike as dtl - -_BAD_DTYPE = "dtype {dtype} cannot be converted to timedelta64[ns]" - def _is_convertible_to_td(key): return isinstance(key, (Tick, timedelta, np.timedelta64, str)) @@ -72,54 +64,6 @@ def f(self): return property(f) -def _td_array_cmp(cls, op): - """ - Wrap comparison operations to convert timedelta-like to timedelta64 - """ - opname = f"__{op.__name__}__" - nat_result = opname == "__ne__" - - @unpack_zerodim_and_defer(opname) - def wrapper(self, other): - - if _is_convertible_to_td(other) or other is NaT: - try: - other = Timedelta(other) - except ValueError: - # failed to parse as timedelta - return invalid_comparison(self, other, op) - - result = op(self.view("i8"), other.value) - if isna(other): - result.fill(nat_result) - - elif not is_list_like(other): - return invalid_comparison(self, other, op) - - elif len(other) != len(self): - raise ValueError("Lengths must match") - - else: - try: - other = type(self)._from_sequence(other)._data - except (ValueError, TypeError): - return invalid_comparison(self, other, op) - - result = op(self.view("i8"), other.view("i8")) - result = com.values_from_object(result) - - o_mask = np.array(isna(other)) - if o_mask.any(): - result[o_mask] = nat_result - - if self._hasnans: - result[self._isnan] = nat_result - - return result - - return compat.set_function_name(wrapper, opname, cls) - - class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): """ Pandas ExtensionArray for timedelta data. @@ -155,6 +99,9 @@ class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): _typ = "timedeltaarray" _scalar_type = Timedelta + _recognized_scalars = (timedelta, np.timedelta64, Tick) + _is_recognized_dtype = is_timedelta64_dtype + __array_priority__ = 1000 # define my properties & methods for delegation _other_ops: List[str] = [] @@ -213,11 +160,11 @@ def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): if not isinstance(values, np.ndarray): msg = ( - f"Unexpected type '{type(values).__name__}'. 'values' must be a" - " TimedeltaArray ndarray, or Series or Index containing one of those." + f"Unexpected type '{type(values).__name__}'. 'values' must be a " + "TimedeltaArray ndarray, or Series or Index containing one of those." ) raise ValueError(msg) - if values.ndim != 1: + if values.ndim not in [1, 2]: raise ValueError("Only 1-dimensional input arrays are supported.") if values.dtype == "i8": @@ -332,7 +279,7 @@ def _unbox_scalar(self, value): def _scalar_from_string(self, value): return Timedelta(value) - def _check_compatible_with(self, other): + def _check_compatible_with(self, other, setitem: bool = False): # we don't have anything to validate. pass @@ -342,16 +289,6 @@ def _maybe_clear_freq(self): # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) - def _validate_fill_value(self, fill_value): - if isna(fill_value): - fill_value = iNaT - elif isinstance(fill_value, (timedelta, np.timedelta64, Tick)): - fill_value = Timedelta(fill_value).value - else: - raise ValueError(f"'fill_value' should be a Timedelta. 
Got '{fill_value}'.") - return fill_value - def astype(self, dtype, copy=True): # We handle # --> timedelta64[ns] @@ -378,6 +315,9 @@ def astype(self, dtype, copy=True): return self return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy) + # ---------------------------------------------------------------- + # Reductions + def sum( self, axis=None, @@ -442,7 +382,7 @@ def _formatter(self, boxed=False): return _get_format_timedelta64(self, box=True) - def _format_native_types(self, na_rep="NaT", date_format=None): + def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): from pandas.io.formats.format import _get_format_timedelta64 formatter = _get_format_timedelta64(self._data, na_rep) @@ -451,8 +391,6 @@ def _format_native_types(self, na_rep="NaT", date_format=None): # ---------------------------------------------------------------- # Arithmetic Methods - _create_comparison_method = classmethod(_td_array_cmp) - def _add_offset(self, other): assert not isinstance(other, Tick) raise TypeError( @@ -507,13 +445,13 @@ def _add_datetimelike_scalar(self, other): dtype = DatetimeTZDtype(tz=other.tz) if other.tz else _NS_DTYPE return DatetimeArray(result, dtype=dtype, freq=self.freq) - def _addsub_offset_array(self, other, op): - # Add or subtract Array-like of DateOffset objects + def _addsub_object_array(self, other, op): + # Add or subtract Array-like of objects try: # TimedeltaIndex can only operate with a subset of DateOffset # subclasses. Incompatible classes will raise AttributeError, # which we re-raise as TypeError - return super()._addsub_offset_array(other, op) + return super()._addsub_object_array(other, op) except AttributeError: raise TypeError( f"Cannot add/subtract non-tick DateOffset to {type(self).__name__}" @@ -948,9 +886,6 @@ def f(x): return result -TimedeltaArray._add_comparison_ops() - - # --------------------------------------------------------------------- # Constructor Helpers @@ -1036,8 +971,6 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]") data = np.array(data, copy=copy) - if data.ndim != 1: - raise ValueError("Only 1-dimensional input arrays are supported.") assert data.dtype == "m8[ns]", data return data, inferred_freq @@ -1129,7 +1062,7 @@ def _validate_td64_dtype(dtype): raise ValueError(msg) if not is_dtype_equal(dtype, _TD_DTYPE): - raise ValueError(_BAD_DTYPE.format(dtype=dtype)) + raise ValueError(f"dtype {dtype} cannot be converted to timedelta64[ns]") return dtype diff --git a/pandas/core/base.py b/pandas/core/base.py index 381d45d829e62..c6800d282700f 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -2,13 +2,13 @@ Base and utility classes for pandas objects. """ import builtins -from collections import OrderedDict import textwrap from typing import Dict, FrozenSet, List, Optional import numpy as np import pandas._libs.lib as lib +from pandas._typing import T from pandas.compat import PYPY from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -19,7 +19,7 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, + is_dict_like, is_extension_array_dtype, is_list_like, is_object_dtype, @@ -87,6 +87,14 @@ def __sizeof__(self): # object's 'sizeof' return super().__sizeof__() + def _ensure_type(self: T, obj) -> T: + """Ensure that an object has same type as self. + + Used by type checkers. 
+ """ + assert isinstance(obj, type(self)), type(obj) + return obj + class NoNewAttributesMixin: """Mixin which prevents adding new attributes. @@ -141,39 +149,35 @@ class SelectionMixin: _internal_names = ["_cache", "__setstate__"] _internal_names_set = set(_internal_names) - _builtin_table = OrderedDict( - ((builtins.sum, np.sum), (builtins.max, np.max), (builtins.min, np.min)) - ) - - _cython_table = OrderedDict( - ( - (builtins.sum, "sum"), - (builtins.max, "max"), - (builtins.min, "min"), - (np.all, "all"), - (np.any, "any"), - (np.sum, "sum"), - (np.nansum, "sum"), - (np.mean, "mean"), - (np.nanmean, "mean"), - (np.prod, "prod"), - (np.nanprod, "prod"), - (np.std, "std"), - (np.nanstd, "std"), - (np.var, "var"), - (np.nanvar, "var"), - (np.median, "median"), - (np.nanmedian, "median"), - (np.max, "max"), - (np.nanmax, "max"), - (np.min, "min"), - (np.nanmin, "min"), - (np.cumprod, "cumprod"), - (np.nancumprod, "cumprod"), - (np.cumsum, "cumsum"), - (np.nancumsum, "cumsum"), - ) - ) + _builtin_table = {builtins.sum: np.sum, builtins.max: np.max, builtins.min: np.min} + + _cython_table = { + builtins.sum: "sum", + builtins.max: "max", + builtins.min: "min", + np.all: "all", + np.any: "any", + np.sum: "sum", + np.nansum: "sum", + np.mean: "mean", + np.nanmean: "mean", + np.prod: "prod", + np.nanprod: "prod", + np.std: "std", + np.nanstd: "std", + np.var: "var", + np.nanvar: "var", + np.median: "median", + np.nanmedian: "median", + np.max: "max", + np.nanmax: "max", + np.min: "min", + np.nanmin: "min", + np.cumprod: "cumprod", + np.nancumprod: "cumprod", + np.cumsum: "cumsum", + np.nancumsum: "cumsum", + } @property def _selection_name(self): @@ -328,7 +332,7 @@ def _aggregate(self, arg, *args, **kwargs): # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in arg.values()): - new_arg = OrderedDict() + new_arg = {} for k, v in arg.items(): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] @@ -386,16 +390,16 @@ def _agg_2dim(name, how): def _agg(arg, func): """ run the aggregations over the arg with func - return an OrderedDict + return a dict """ - result = OrderedDict() + result = {} for fname, agg_how in arg.items(): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(arg.keys()) - result = OrderedDict() + result = {} if self._selection is not None: @@ -579,12 +583,10 @@ def _is_builtin_func(self, arg): class ShallowMixin: _attributes: List[str] = [] - def _shallow_copy(self, obj=None, **kwargs): + def _shallow_copy(self, obj, **kwargs): """ return a new object with the replacement attributes """ - if obj is None: - obj = self._selected_obj.copy() if isinstance(obj, self._constructor): obj = obj.obj @@ -602,7 +604,7 @@ class IndexOpsMixin: # ndarray compatibility __array_priority__ = 1000 _deprecations: FrozenSet[str] = frozenset( - ["tolist", "item"] # tolist is not deprecated, just suppressed in the __dir__ + ["tolist"] # tolist is not deprecated, just suppressed in the __dir__ ) def transpose(self, *args, **kwargs): @@ -623,24 +625,6 @@ def transpose(self, *args, **kwargs): """, ) - @property - def _is_homogeneous_type(self) -> bool: - """ - Whether the object has a single dtype. - - By definition, Series and Index are always considered homogeneous. - A MultiIndex may or may not be homogeneous, depending on the - dtypes of the levels. - - See Also - -------- - DataFrame._is_homogeneous_type : Whether all the columns in a - DataFrame have the same dtype. 
- MultiIndex._is_homogeneous_type : Whether all the levels of a - MultiIndex have the same dtype. - """ - return True - @property def shape(self): """ @@ -729,6 +713,8 @@ def array(self) -> ExtensionArray: period PeriodArray interval IntervalArray IntegerNA IntegerArray + string StringArray + boolean BooleanArray datetime64[ns, tz] DatetimeArray ================== ============================= @@ -780,7 +766,7 @@ def array(self) -> ExtensionArray: return result - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False, na_value=lib.no_default, **kwargs): """ A NumPy ndarray representing the values in this Series or Index. @@ -795,6 +781,17 @@ def to_numpy(self, dtype=None, copy=False): another array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. + na_value : Any, optional + The value to use for missing values. The default value depends + on `dtype` and the type of the array. + + .. versionadded:: 1.0.0 + + **kwargs + Additional keywords passed through to the ``to_numpy`` method + of the underlying array (for extension arrays). + + .. versionadded:: 1.0.0 Returns ------- @@ -864,16 +861,21 @@ def to_numpy(self, dtype=None, copy=False): array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], dtype='datetime64[ns]') """ - if is_datetime64tz_dtype(self.dtype) and dtype is None: - # note: this is going to change very soon. - # I have a WIP PR making this unnecessary, but it's - # a bit out of scope for the DatetimeArray PR. - dtype = "object" + if is_extension_array_dtype(self.dtype): + return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs) + else: + if kwargs: + msg = "to_numpy() got an unexpected keyword argument '{}'".format( + list(kwargs.keys())[0] + ) + raise TypeError(msg) result = np.asarray(self._values, dtype=dtype) # TODO(GH-24345): Avoid potential double copy - if copy: + if copy or na_value is not lib.no_default: result = result.copy() + if na_value is not lib.no_default: + result[self.isna()] = na_value return result @property @@ -1112,8 +1114,8 @@ def _map_values(self, mapper, na_action=None): # we can fastpath dict/Series to an efficient map # as we know that we are not going to have to yield # python types - if isinstance(mapper, dict): - if hasattr(mapper, "__missing__"): + if is_dict_like(mapper): + if isinstance(mapper, dict) and hasattr(mapper, "__missing__"): # If a dictionary subclass defines a default value method, # convert mapper to a lookup function (GH #15999). dict_with_default = mapper diff --git a/pandas/core/common.py b/pandas/core/common.py index 9017584171850..f0fcb736586d6 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -9,11 +9,12 @@ from datetime import datetime, timedelta from functools import partial import inspect -from typing import Any, Iterable, Union +from typing import Any, Collection, Iterable, Union import numpy as np from pandas._libs import lib, tslibs +from pandas._typing import T from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -110,14 +111,20 @@ def is_bool_indexer(key: Any) -> bool: Returns ------- bool + Whether `key` is a valid boolean indexer. Raises ------ ValueError When the array is an object-dtype ndarray or ExtensionArray and contains missing values. + + See Also + -------- + check_bool_array_indexer : Check that `key` + is a valid mask for an array, and convert to an ndarray. 
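With the dispatch above, `Series.to_numpy` forwards `dtype`, `na_value`, and extra keywords to the extension array's own `to_numpy`, which makes it possible to materialize nullable data into a concrete NumPy dtype. A sketch, assuming the nullable-array behavior introduced earlier in this diff:

import numpy as np
import pandas as pd

s = pd.Series([1, None, 3], dtype="Int64")

print(s.to_numpy())                                   # object ndarray: [1, <NA>, 3]
print(s.to_numpy(dtype="float64", na_value=np.nan))   # array([ 1., nan,  3.])
print(s.to_numpy(dtype="int64", na_value=0))          # array([1, 0, 3])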
""" - na_msg = "cannot index with vector containing NA / NaN values" + na_msg = "cannot mask with array containing NA / NaN values" if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or ( is_array_like(key) and is_extension_array_dtype(key.dtype) ): @@ -270,7 +277,7 @@ def maybe_make_list(obj): return obj -def maybe_iterable_to_list(obj: Union[Iterable, Any]) -> Union[list, Any]: +def maybe_iterable_to_list(obj: Union[Iterable[T], T]) -> Union[Collection[T], T]: """ If obj is Iterable but not list-like, consume into list. """ diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 57348ad3b81a0..a1b1cffdd1d76 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -2,10 +2,12 @@ """ from functools import partial, wraps +from typing import Dict, Optional, Sequence, Tuple, Type, Union import warnings import numpy as np +from pandas._typing import FrameOrSeries from pandas.errors import PerformanceWarning from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -15,22 +17,27 @@ from pandas.core.computation.common import result_type_many -def _align_core_single_unary_op(term): +def _align_core_single_unary_op( + term, +) -> Tuple[Union[partial, Type[FrameOrSeries]], Optional[Dict[str, int]]]: + + typ: Union[partial, Type[FrameOrSeries]] + axes: Optional[Dict[str, int]] = None + if isinstance(term.value, np.ndarray): typ = partial(np.asanyarray, dtype=term.value.dtype) else: typ = type(term.value) - ret = (typ,) + if hasattr(term.value, "axes"): + axes = _zip_axes_from_type(typ, term.value.axes) - if not hasattr(term.value, "axes"): - ret += (None,) - else: - ret += (_zip_axes_from_type(typ, term.value.axes),) - return ret + return typ, axes -def _zip_axes_from_type(typ, new_axes): - axes = {ax_name: new_axes[ax_ind] for ax_ind, ax_name in typ._AXIS_NAMES.items()} +def _zip_axes_from_type( + typ: Type[FrameOrSeries], new_axes: Sequence[int] +) -> Dict[str, int]: + axes = {name: new_axes[i] for i, name in typ._AXIS_NAMES.items()} return axes diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 994f470942cd1..19a8898a2987c 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -4,9 +4,6 @@ from pandas._config import get_option -# A token value Python's tokenizer probably will never use. -_BACKTICK_QUOTED_STRING = 100 - def _ensure_decoded(s): """ @@ -29,16 +26,5 @@ def result_type_many(*arrays_and_dtypes): return reduce(np.result_type, arrays_and_dtypes) -def _remove_spaces_column_name(name): - """ - Check if name contains any spaces, if it contains any spaces - the spaces will be removed and an underscore suffix is added. - """ - if not isinstance(name, str) or " " not in name: - return name - - return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING" - - class NameResolutionError(NameError): pass diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index dbfd6c04eee32..9c5388faae1bd 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -3,6 +3,7 @@ """ import abc +from typing import Dict, Type from pandas.core.computation.align import align_terms, reconstruct_object from pandas.core.computation.ops import _mathops, _reductions @@ -53,7 +54,7 @@ def convert(self) -> str: """ return printing.pprint_thing(self.expr) - def evaluate(self): + def evaluate(self) -> object: """ Run the engine on the expression. 
@@ -62,7 +63,7 @@ def evaluate(self): Returns ------- - obj : object + object The result of the passed expression. """ if not self._is_aligned: @@ -101,12 +102,6 @@ class NumExprEngine(AbstractEngine): has_neg_frac = True - def __init__(self, expr): - super().__init__(expr) - - def convert(self) -> str: - return str(super().convert()) - def _evaluate(self): import numexpr as ne @@ -128,14 +123,14 @@ class PythonEngine(AbstractEngine): has_neg_frac = False - def __init__(self, expr): - super().__init__(expr) - def evaluate(self): return self.expr() - def _evaluate(self): + def _evaluate(self) -> None: pass -_engines = {"numexpr": NumExprEngine, "python": PythonEngine} +_engines: Dict[str, Type[AbstractEngine]] = { + "numexpr": NumExprEngine, + "python": PythonEngine, +} diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 2e5a563b815b3..71e1b6c2a08a9 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -1,23 +1,25 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Top level ``eval`` module. """ import tokenize +from typing import Optional import warnings -from pandas._libs.lib import _no_default +from pandas._libs.lib import no_default from pandas.util._validators import validate_bool_kwarg from pandas.core.computation.engines import _engines -from pandas.core.computation.expr import Expr, _parsers, tokenize_string +from pandas.core.computation.expr import Expr, _parsers +from pandas.core.computation.parsing import tokenize_string from pandas.core.computation.scope import ensure_scope from pandas.io.formats.printing import pprint_thing -def _check_engine(engine): +def _check_engine(engine: Optional[str]) -> str: """ Make sure a valid engine is passed. @@ -168,8 +170,8 @@ def _check_for_locals(expr: str, stack_level: int, parser: str): def eval( expr, parser="pandas", - engine=None, - truediv=_no_default, + engine: Optional[str] = None, + truediv=no_default, local_dict=None, global_dict=None, resolvers=(), @@ -286,7 +288,7 @@ def eval( inplace = validate_bool_kwarg(inplace, "inplace") - if truediv is not _no_default: + if truediv is not no_default: warnings.warn( "The `truediv` parameter in pd.eval is deprecated and will be " "removed in a future version.", @@ -337,8 +339,8 @@ def eval( if parsed_expr.assigner is None: if multi_line: raise ValueError( - "Multi-line expressions are only valid" - " if all expressions contain an assignment" + "Multi-line expressions are only valid " + "if all expressions contain an assignment" ) elif inplace: raise ValueError("Cannot operate inplace if there is no assignment") diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 9b422b28c3c27..d91586e6c9b81 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -3,19 +3,13 @@ import ast from functools import partial, reduce -from io import StringIO -import itertools as it -import operator +from keyword import iskeyword import tokenize from typing import Optional, Type import numpy as np import pandas.core.common as com -from pandas.core.computation.common import ( - _BACKTICK_QUOTED_STRING, - _remove_spaces_column_name, -) from pandas.core.computation.ops import ( _LOCAL_TAG, BinOp, @@ -34,38 +28,12 @@ _unary_ops_syms, is_term, ) +from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string from pandas.core.computation.scope import Scope import pandas.io.formats.printing as printing -def tokenize_string(source: str): - """ - Tokenize a Python source code 
string. - - Parameters - ---------- - source : str - A Python source code string - """ - line_reader = StringIO(source).readline - token_generator = tokenize.generate_tokens(line_reader) - - # Loop over all tokens till a backtick (`) is found. - # Then, take all tokens till the next backtick to form a backtick quoted - # string. - for toknum, tokval, _, _, _ in token_generator: - if tokval == "`": - tokval = " ".join( - it.takewhile( - lambda tokval: tokval != "`", - map(operator.itemgetter(1), token_generator), - ) - ) - toknum = _BACKTICK_QUOTED_STRING - yield toknum, tokval - - def _rewrite_assign(tok): """Rewrite the assignment operator for PyTables expressions that use ``=`` as a substitute for ``==``. @@ -133,31 +101,6 @@ def _replace_locals(tok): return toknum, tokval -def _clean_spaces_backtick_quoted_names(tok): - """Clean up a column name if surrounded by backticks. - - Backtick quoted string are indicated by a certain tokval value. If a string - is a backtick quoted token it will processed by - :func:`_remove_spaces_column_name` so that the parser can find this - string when the query is executed. - See also :meth:`NDFrame._get_space_character_free_column_resolver`. - - Parameters - ---------- - tok : tuple of int, str - ints correspond to the all caps constants in the tokenize module - - Returns - ------- - t : tuple of int, str - Either the input or token or the replacement values - """ - toknum, tokval = tok - if toknum == _BACKTICK_QUOTED_STRING: - return tokenize.NAME, _remove_spaces_column_name(tokval) - return toknum, tokval - - def _compose2(f, g): """Compose 2 callables""" return lambda *args, **kwargs: f(g(*args, **kwargs)) @@ -172,10 +115,7 @@ def _compose(*funcs): def _preparse( source: str, f=_compose( - _replace_locals, - _replace_booleans, - _rewrite_assign, - _clean_spaces_backtick_quoted_names, + _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks ), ): """Compose a collection of tokenization functions @@ -426,8 +366,6 @@ def visit(self, node, **kwargs): try: node = ast.fix_missing_locations(ast.parse(clean)) except SyntaxError as e: - from keyword import iskeyword - if any(iskeyword(x) for x in clean.split()): e.msg = "Python keyword not valid identifier in numexpr query" raise e @@ -528,8 +466,8 @@ def _maybe_evaluate_binop( if res.has_invalid_return_type: raise TypeError( - f"unsupported operand type(s) for {res.op}:" - f" '{lhs.type}' and '{rhs.type}'" + f"unsupported operand type(s) for {res.op}: " + f"'{lhs.type}' and '{rhs.type}'" ) if self.engine != "pytables": @@ -781,9 +719,7 @@ def __init__( parser, preparser=partial( _preparse, - f=_compose( - _replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names - ), + f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks), ), ): super().__init__(env, engine, parser, preparser) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py new file mode 100644 index 0000000000000..ce213c8532834 --- /dev/null +++ b/pandas/core/computation/parsing.py @@ -0,0 +1,190 @@ +""":func:`~pandas.eval` source string parsing functions +""" + +from io import StringIO +from keyword import iskeyword +import token +import tokenize +from typing import Iterator, Tuple + +# A token value Python's tokenizer probably will never use. +BACKTICK_QUOTED_STRING = 100 + + +def create_valid_python_identifier(name: str) -> str: + """ + Create valid Python identifiers from any string. + + Check if name contains any special characters. 
If it contains any
+    special characters, the special characters will be replaced by
+    a special string and a prefix is added.
+
+    Raises
+    ------
+    SyntaxError
+        If the returned name is not a valid Python identifier, an exception is raised.
+        This can happen if there is a hashtag in the name, as the tokenizer will
+        then terminate and not find the backtick.
+        It can also happen for characters that fall outside the range (U+0001..U+007F).
+    """
+    if name.isidentifier() and not iskeyword(name):
+        return name
+
+    # Create a dict with the special characters and their replacement string.
+    # EXACT_TOKEN_TYPES contains these special characters and
+    # token.tok_name contains a readable description of the replacement string.
+    special_characters_replacements = {
+        char: f"_{token.tok_name[tokval]}_"
+        # The ignore here is because of a bug in mypy that is resolved in 0.740
+        for char, tokval in tokenize.EXACT_TOKEN_TYPES.items()  # type: ignore
+    }
+    special_characters_replacements.update(
+        {
+            " ": "_",
+            "?": "_QUESTIONMARK_",
+            "!": "_EXCLAMATIONMARK_",
+            "$": "_DOLLARSIGN_",
+            "€": "_EUROSIGN_",
+            # Including quotes works, but there are exceptions.
+            "'": "_SINGLEQUOTE_",
+            '"': "_DOUBLEQUOTE_",
+            # Currently not possible. Terminates parser and won't find backtick.
+            # "#": "_HASH_",
+        }
+    )
+
+    name = "".join(special_characters_replacements.get(char, char) for char in name)
+    name = "BACKTICK_QUOTED_STRING_" + name
+
+    if not name.isidentifier():
+        raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")
+
+    return name
+
+
+def clean_backtick_quoted_toks(tok: Tuple[int, str]) -> Tuple[int, str]:
+    """
+    Clean up a column name if surrounded by backticks.
+
+    Backtick quoted strings are indicated by a certain tokval value. If a string
+    is a backtick quoted token it will be processed by
+    :func:`create_valid_python_identifier` so that the parser can find this
+    string when the query is executed.
+    In this case the tok will get the NAME toknum.
+
+    Parameters
+    ----------
+    tok : tuple of int, str
+        ints correspond to the all caps constants in the tokenize module
+
+    Returns
+    -------
+    tok : Tuple[int, str]
+        Either the input token or the replacement values
+    """
+    toknum, tokval = tok
+    if toknum == BACKTICK_QUOTED_STRING:
+        return tokenize.NAME, create_valid_python_identifier(tokval)
+    return toknum, tokval
+
+
+def clean_column_name(name: str) -> str:
+    """
+    Function to emulate the cleaning of a backtick quoted name.
+
+    The purpose of this function is to see what happens to the name of an
+    identifier if it goes through the process of being parsed as Python code
+    inside a backtick quoted string and then being cleaned
+    (i.e. stripped of any special characters).
+
+    Parameters
+    ----------
+    name : str
+        Name to be cleaned.
+
+    Returns
+    -------
+    name : str
+        Returns the name after tokenizing and cleaning.
+
+    Notes
+    -----
+    In some cases, a name cannot be converted to a valid Python identifier.
+    In that case :func:`tokenize_string` raises a SyntaxError and
+    we just return the name unmodified.
+
+    If this name is used in the query string (which would make the query call fail),
+    the error will instead be raised by :func:`tokenize_backtick_quoted_string`,
+    which is not caught and propagates to the user level.
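+
+    Examples
+    --------
+    A rough illustration, assuming the replacements and prefix defined in
+    :func:`create_valid_python_identifier` above:
+
+    >>> clean_column_name("Column A")
+    'BACKTICK_QUOTED_STRING_Column_A'
+    >>> clean_column_name("total")
+    'total'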
+ """ + try: + tokenized = tokenize_string(f"`{name}`") + tokval = next(tokenized)[1] + return create_valid_python_identifier(tokval) + except SyntaxError: + return name + + +def tokenize_backtick_quoted_string( + token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int +) -> Tuple[int, str]: + """ + Creates a token from a backtick quoted string. + + Moves the token_generator forwards till right after the next backtick. + + Parameters + ---------- + token_generator : Iterator[tokenize.TokenInfo] + The generator that yields the tokens of the source string (Tuple[int, str]). + The generator is at the first token after the backtick (`) + + source : str + The Python source code string. + + string_start : int + This is the start of backtick quoted string inside the source string. + + Returns + ------- + tok: Tuple[int, str] + The token that represents the backtick quoted string. + The integer is equal to BACKTICK_QUOTED_STRING (100). + """ + for _, tokval, start, _, _ in token_generator: + if tokval == "`": + string_end = start[1] + break + + return BACKTICK_QUOTED_STRING, source[string_start:string_end] + + +def tokenize_string(source: str) -> Iterator[Tuple[int, str]]: + """ + Tokenize a Python source code string. + + Parameters + ---------- + source : str + The Python source code string. + + Returns + ------- + tok_generator : Iterator[Tuple[int, str]] + An iterator yielding all tokens with only toknum and tokval (Tuple[ing, str]). + """ + line_reader = StringIO(source).readline + token_generator = tokenize.generate_tokens(line_reader) + + # Loop over all tokens till a backtick (`) is found. + # Then, take all tokens till the next backtick to form a backtick quoted string + for toknum, tokval, start, _, _ in token_generator: + if tokval == "`": + try: + yield tokenize_backtick_quoted_string( + token_generator, source, string_start=start[1] + 1 + ) + except Exception: + raise SyntaxError(f"Failed to parse backticks in '{source}'.") + else: + yield toknum, tokval diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 4d27bcf2845f1..be652ca0e6a36 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -533,7 +533,7 @@ def __init__( self._visitor = None # capture the environment if needed - local_dict = DeepChainMap() + local_dict: DeepChainMap[Any, Any] = DeepChainMap() if isinstance(where, PyTablesExpr): local_dict = where.env.scope diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index ba0a4d81a88d3..afdd8a01ee003 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -300,14 +300,15 @@ def table_schema_cb(key): _enable_data_resource_formatter(cf.get_option(key)) -def is_terminal(): +def is_terminal() -> bool: """ Detect if Python is running in a terminal. Returns True if Python is running in a terminal or False if not. """ try: - ip = get_ipython() + # error: Name 'get_ipython' is not defined + ip = get_ipython() # type: ignore except NameError: # assume standard Python interpreter in a terminal return True else: diff --git a/pandas/core/construction.py b/pandas/core/construction.py index cc8311cf3e21d..f947a1fda49f1 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -334,7 +334,7 @@ def array( return result -def extract_array(obj, extract_numpy=False): +def extract_array(obj, extract_numpy: bool = False): """ Extract the ndarray or ExtensionArray from a Series or Index. 
@@ -497,13 +497,8 @@ def sanitize_array( if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype): inferred = lib.infer_dtype(subarr, skipna=False) - if inferred == "period": - from pandas.core.arrays import period_array - - try: - subarr = period_array(subarr) - except IncompatibleFrequency: - pass + if inferred in {"interval", "period"}: + subarr = array(subarr) return subarr diff --git a/pandas/core/dtypes/api.py b/pandas/core/dtypes/api.py index cb0912cbcf880..051affd0af1f9 100644 --- a/pandas/core/dtypes/api.py +++ b/pandas/core/dtypes/api.py @@ -1,6 +1,6 @@ # flake8: noqa -from .common import ( +from pandas.core.dtypes.common import ( is_array_like, is_bool, is_bool_dtype, diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 1dda51da49ffb..1b4e7062b38e5 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -236,6 +236,10 @@ def construct_from_string(cls, string: str): """ if not isinstance(string, str): raise TypeError(f"Expects a string, got {type(string).__name__}") + + # error: Non-overlapping equality check (left operand type: "str", right + # operand type: "Callable[[ExtensionDtype], str]") [comparison-overlap] + assert isinstance(cls.name, str), (cls, type(cls.name)) if string != cls.name: raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") return cls() @@ -276,10 +280,12 @@ def is_dtype(cls, dtype) -> bool: return False elif isinstance(dtype, cls): return True - try: - return cls.construct_from_string(dtype) is not None - except TypeError: - return False + if isinstance(dtype, str): + try: + return cls.construct_from_string(dtype) is not None + except TypeError: + return False + return False @property def _is_numeric(self) -> bool: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1ab21f18f3bdc..2a09bd7e54a8e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -6,9 +6,10 @@ from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import NaT, OutOfBoundsDatetime, Period, iNaT +from pandas._libs.tslibs.timezones import tz_compare from pandas.util._validators import validate_bool_kwarg -from .common import ( +from pandas.core.dtypes.common import ( _INT64_DTYPE, _NS_DTYPE, _POSSIBLY_CAST_DTYPES, @@ -41,8 +42,13 @@ is_unsigned_integer_dtype, pandas_dtype, ) -from .dtypes import DatetimeTZDtype, ExtensionDtype, IntervalDtype, PeriodDtype -from .generic import ( +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, + IntervalDtype, + PeriodDtype, +) +from pandas.core.dtypes.generic import ( ABCDataFrame, ABCDatetimeArray, ABCDatetimeIndex, @@ -50,8 +56,8 @@ ABCPeriodIndex, ABCSeries, ) -from .inference import is_list_like -from .missing import isna, notna +from pandas.core.dtypes.inference import is_list_like +from pandas.core.dtypes.missing import isna, notna _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max @@ -409,6 +415,14 @@ def maybe_promote(dtype, fill_value=np.nan): elif is_datetime64tz_dtype(dtype): if isna(fill_value): fill_value = NaT + elif not isinstance(fill_value, datetime): + dtype = np.dtype(np.object_) + elif fill_value.tzinfo is None: + dtype = np.dtype(np.object_) + elif not tz_compare(fill_value.tzinfo, dtype.tz): + # TODO: sure we want to cast here? 
+ dtype = np.dtype(np.object_) + elif is_extension_array_dtype(dtype) and isna(fill_value): fill_value = dtype.na_value @@ -656,7 +670,7 @@ def infer_dtype_from_array(arr, pandas_dtype: bool = False): # don't force numpy coerce with nan's inferred = lib.infer_dtype(arr, skipna=False) - if inferred in ["string", "bytes", "unicode", "mixed", "mixed-integer"]: + if inferred in ["string", "bytes", "mixed", "mixed-integer"]: return (np.object_, arr) arr = np.asarray(arr) @@ -814,20 +828,22 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): if is_object_dtype(dtype): return tslib.ints_to_pydatetime(arr.view(np.int64)) elif dtype == np.int64: + if isna(arr).any(): + raise ValueError("Cannot convert NaT values to integer") return arr.view(dtype) # allow frequency conversions if dtype.kind == "M": return arr.astype(dtype) - raise TypeError( - f"cannot astype a datetimelike from [{arr.dtype}] " f"to [{dtype}]" - ) + raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]") elif is_timedelta64_dtype(arr): if is_object_dtype(dtype): return tslibs.ints_to_pytimedelta(arr.view(np.int64)) elif dtype == np.int64: + if isna(arr).any(): + raise ValueError("Cannot convert NaT values to integer") return arr.view(dtype) if dtype not in [_INT64_DTYPE, _TD_DTYPE]: @@ -842,9 +858,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): elif dtype == _TD_DTYPE: return arr.astype(_TD_DTYPE, copy=copy) - raise TypeError( - f"cannot astype a timedelta from [{arr.dtype}] " f"to [{dtype}]" - ) + raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]") elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index dc22a79a2f3fe..f62f03be9b732 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -18,7 +18,6 @@ ) from pandas.core.dtypes.generic import ( ABCCategorical, - ABCDateOffset, ABCDatetimeIndex, ABCIndexClass, ABCPeriodArray, @@ -172,6 +171,8 @@ def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array: try: return arr.astype("uint64", copy=copy, casting="safe") # type: ignore except TypeError: + if is_extension_array_dtype(arr.dtype): + return arr.to_numpy(dtype="float64", na_value=np.nan) return arr.astype("float64", copy=copy) @@ -193,12 +194,11 @@ def ensure_python_int(value: Union[int, np.integer]) -> int: """ if not is_scalar(value): raise TypeError(f"Value needs to be a scalar value, was type {type(value)}") - msg = "Wrong type {} for value {}" try: new_value = int(value) assert new_value == value except (TypeError, ValueError, AssertionError): - raise TypeError(msg.format(type(value), value)) + raise TypeError(f"Wrong type {type(value)} for value {value}") return new_value @@ -270,9 +270,9 @@ def is_sparse(arr) -> bool: -------- Returns `True` if the parameter is a 1-D pandas sparse array. - >>> is_sparse(pd.SparseArray([0, 0, 1, 0])) + >>> is_sparse(pd.arrays.SparseArray([0, 0, 1, 0])) True - >>> is_sparse(pd.Series(pd.SparseArray([0, 0, 1, 0]))) + >>> is_sparse(pd.Series(pd.arrays.SparseArray([0, 0, 1, 0]))) True Returns `False` if the parameter is not sparse. 
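A minimal, illustrative sketch of the behaviour change made in ``astype_nansafe`` above: casting datetime64 data that contains NaT to int64 now raises instead of silently exposing the NaT sentinel value. The error message is the one added in the diff; the setup below is an assumption for illustration only and is not part of the patch.

import numpy as np
import pandas as pd

# datetime64 data with a missing value (NaT)
ser = pd.Series(np.array(["2020-01-01", "NaT"], dtype="datetime64[ns]"))
try:
    ser.astype("int64")
except ValueError as err:
    print(err)  # Cannot convert NaT values to integer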
@@ -319,7 +319,7 @@ def is_scipy_sparse(arr) -> bool: >>> from scipy.sparse import bsr_matrix >>> is_scipy_sparse(bsr_matrix([1, 2, 3])) True - >>> is_scipy_sparse(pd.SparseArray([1, 2, 3])) + >>> is_scipy_sparse(pd.arrays.SparseArray([1, 2, 3])) False """ @@ -368,37 +368,6 @@ def is_categorical(arr) -> bool: return isinstance(arr, ABCCategorical) or is_categorical_dtype(arr) -def is_offsetlike(arr_or_obj) -> bool: - """ - Check if obj or all elements of list-like is DateOffset - - Parameters - ---------- - arr_or_obj : object - - Returns - ------- - boolean - Whether the object is a DateOffset or listlike of DatetOffsets - - Examples - -------- - >>> is_offsetlike(pd.DateOffset(days=1)) - True - >>> is_offsetlike('offset') - False - >>> is_offsetlike([pd.offsets.Minute(4), pd.offsets.MonthEnd()]) - True - >>> is_offsetlike(np.array([pd.DateOffset(months=3), pd.Timestamp.now()])) - False - """ - if isinstance(arr_or_obj, ABCDateOffset): - return True - elif is_list_like(arr_or_obj) and len(arr_or_obj) and is_object_dtype(arr_or_obj): - return all(isinstance(x, ABCDateOffset) for x in arr_or_obj) - return False - - def is_datetime64_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the datetime64 dtype. @@ -633,7 +602,14 @@ def is_string_dtype(arr_or_dtype) -> bool: # TODO: gh-15585: consider making the checks stricter. def condition(dtype) -> bool: - return dtype.kind in ("O", "S", "U") and not is_period_dtype(dtype) + return dtype.kind in ("O", "S", "U") and not is_excluded_dtype(dtype) + + def is_excluded_dtype(dtype) -> bool: + """ + These have kind = "O" but aren't string dtypes so need to be explicitly excluded + """ + is_excluded_checks = (is_period_dtype, is_interval_dtype) + return any(is_excluded(dtype) for is_excluded in is_excluded_checks) return _is_dtype(arr_or_dtype, condition) @@ -1492,7 +1468,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: True >>> is_bool_dtype(pd.Categorical([True, False])) True - >>> is_bool_dtype(pd.SparseArray([True, False])) + >>> is_bool_dtype(pd.arrays.SparseArray([True, False])) True """ if arr_or_dtype is None: @@ -1554,7 +1530,7 @@ def is_extension_type(arr) -> bool: True >>> is_extension_type(pd.Series(cat)) True - >>> is_extension_type(pd.SparseArray([1, 2, 3])) + >>> is_extension_type(pd.arrays.SparseArray([1, 2, 3])) True >>> from scipy.sparse import bsr_matrix >>> is_extension_type(bsr_matrix([1, 2, 3])) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 7b3e7d4f42121..cd4b5af4588e5 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -220,7 +220,7 @@ def union_categoricals( ----- To learn more about categories, see `link - `__ + `__ Examples -------- diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index b77cd34700f10..93522abc3a48f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -9,10 +9,9 @@ from pandas._libs.tslibs import NaT, Period, Timestamp, timezones from pandas._typing import Ordered +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCDateOffset, ABCIndexClass - -from .base import ExtensionDtype -from .inference import is_bool, is_list_like +from pandas.core.dtypes.inference import is_bool, is_list_like str_type = str @@ -436,12 +435,11 @@ def __eq__(self, other: Any) -> bool: return hash(self) == hash(other) def __repr__(self) -> str_type: - tpl = "CategoricalDtype(categories={data}ordered={ordered})" if self.categories is None: data = 
"None, " else: data = self.categories._format_data(name=type(self).__name__) - return tpl.format(data=data, ordered=self.ordered) + return f"CategoricalDtype(categories={data}ordered={self.ordered})" @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: @@ -882,7 +880,11 @@ def construct_from_string(cls, string): return cls(freq=string) except ValueError: pass - raise TypeError(f"Cannot construct a 'PeriodDtype' from '{string}'") + if isinstance(string, str): + msg = f"Cannot construct a 'PeriodDtype' from '{string}'" + else: + msg = f"'construct_from_string' expects a string, got {type(string)}" + raise TypeError(msg) def __str__(self) -> str_type: return self.name @@ -946,6 +948,26 @@ def construct_array_type(cls): return PeriodArray + def __from_arrow__(self, array): + """Construct PeriodArray from pyarrow Array/ChunkedArray.""" + import pyarrow + from pandas.core.arrays import PeriodArray + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype="int64") + parr = PeriodArray(data.copy(), freq=self.freq, copy=False) + parr[~mask] = NaT + results.append(parr) + + return PeriodArray._concat_same_type(results) + @register_extension_dtype class IntervalDtype(PandasExtensionDtype): @@ -974,7 +996,7 @@ class IntervalDtype(PandasExtensionDtype): """ name = "interval" - kind: Optional[str_type] = None + kind: str_type = "O" str = "|O08" base = np.dtype("O") num = 103 @@ -1117,3 +1139,22 @@ def is_dtype(cls, dtype) -> bool: else: return False return super().is_dtype(dtype) + + def __from_arrow__(self, array): + """Construct IntervalArray from pyarrow Array/ChunkedArray.""" + import pyarrow + from pandas.core.arrays import IntervalArray + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + chunks = array.chunks + + results = [] + for arr in chunks: + left = np.asarray(arr.storage.field("left"), dtype=self.subtype) + right = np.asarray(arr.storage.field("right"), dtype=self.subtype) + iarr = IntervalArray.from_arrays(left, right, closed=array.type.closed) + results.append(iarr) + + return IntervalArray._concat_same_type(results) diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index aa0f7d2aba1fc..4c3f8b7374465 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -4,7 +4,10 @@ # define abstract base classes to enable isinstance type checking on our # objects def create_pandas_abc_type(name, attr, comp): - @classmethod + + # https://github.com/python/mypy/issues/1006 + # error: 'classmethod' used with a non-method + @classmethod # type: ignore def _check(cls, inst) -> bool: return getattr(inst, attr, "_typ") in comp diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index fc22d5be1ca69..fb579f2f58a57 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -9,7 +9,7 @@ import pandas._libs.missing as libmissing from pandas._libs.tslibs import NaT, iNaT -from .common import ( +from pandas.core.dtypes.common import ( _NS_DTYPE, _TD_DTYPE, ensure_object, @@ -31,7 +31,7 @@ needs_i8_conversion, pandas_dtype, ) -from .generic import ( +from pandas.core.dtypes.generic import ( ABCDatetimeArray, ABCExtensionArray, ABCGeneric, @@ -40,7 +40,7 @@ ABCSeries, ABCTimedeltaArray, ) -from .inference import is_list_like +from pandas.core.dtypes.inference import 
is_list_like isposinf_scalar = libmissing.isposinf_scalar isneginf_scalar = libmissing.isneginf_scalar @@ -212,7 +212,7 @@ def _use_inf_as_na(key): This approach to setting global module values is discussed and approved here: - * http://stackoverflow.com/questions/4859217/ + * https://stackoverflow.com/questions/4859217/ programmatically-creating-variables-in-python/4859312#4859312 """ flag = get_option(key) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dfda1470413b7..fa9a951d6849c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -15,6 +15,8 @@ import sys from textwrap import dedent from typing import ( + IO, + TYPE_CHECKING, Any, FrozenSet, Hashable, @@ -36,7 +38,9 @@ from pandas._config import get_option from pandas._libs import algos as libalgos, lib -from pandas._typing import Axes, Dtype, FilePathOrBuffer +from pandas._typing import Axes, Axis, Dtype, FilePathOrBuffer, Level, Renamer +from pandas.compat import PY37 +from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import ( Appender, @@ -118,10 +122,15 @@ from pandas.core.ops.missing import dispatch_fill_zeros from pandas.core.series import Series +from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt from pandas.io.formats.printing import pprint_thing import pandas.plotting +if TYPE_CHECKING: + from pandas.core.groupby.generic import DataFrameGroupBy + from pandas.io.formats.style import Styler + # --------------------------------------------------------------------- # Docstring templates @@ -137,11 +146,12 @@ Name or list of names to sort by. - if `axis` is 0 or `'index'` then `by` may contain index - levels and/or column labels + levels and/or column labels. - if `axis` is 1 or `'columns'` then `by` may contain column - levels and/or index labels + levels and/or index labels. .. versionchanged:: 0.23.0 + Allow specifying index or column level names.""", versionadded_to_excel="", optional_labels="""labels : array-like, optional @@ -392,7 +402,7 @@ def _constructor(self) -> Type["DataFrame"]: _constructor_sliced: Type[Series] = Series _deprecations: FrozenSet[str] = NDFrame._deprecations | frozenset([]) - _accessors: Set[str] = set() + _accessors: Set[str] = {"sparse"} @property def _constructor_expanddim(self): @@ -812,7 +822,7 @@ def to_string( # ---------------------------------------------------------------------- @property - def style(self): + def style(self) -> "Styler": """ Returns a Styler object. @@ -887,10 +897,10 @@ def items(self) -> Iterable[Tuple[Optional[Hashable], Series]]: yield k, self._ixs(i, axis=1) @Appender(_shared_docs["items"]) - def iteritems(self): + def iteritems(self) -> Iterable[Tuple[Optional[Hashable], Series]]: yield from self.items() - def iterrows(self): + def iterrows(self) -> Iterable[Tuple[Optional[Hashable], Series]]: """ Iterate over DataFrame rows as (index, Series) pairs. @@ -971,7 +981,8 @@ def itertuples(self, index=True, name="Pandas"): ----- The column names will be renamed to positional names if they are invalid Python identifiers, repeated, or start with an underscore. - With a large number of columns (>255), regular tuples are returned. + On python versions < 3.7 regular tuples are returned for DataFrames + with a large number of columns (>254). 
Examples -------- @@ -1014,8 +1025,9 @@ def itertuples(self, index=True, name="Pandas"): # use integer indexing because of possible duplicate column names arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) - # Python 3 supports at most 255 arguments to constructor - if name is not None and len(self.columns) + index < 256: + # Python versions before 3.7 support at most 255 arguments to constructors + can_return_named_tuples = PY37 or len(self.columns) + index < 255 + if name is not None and can_return_named_tuples: itertuple = collections.namedtuple(name, fields, rename=True) return map(itertuple._make, zip(*arrays)) @@ -1154,7 +1166,7 @@ def __rmatmul__(self, other): # IO methods (to / from other formats) @classmethod - def from_dict(cls, data, orient="columns", dtype=None, columns=None): + def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFrame": """ Construct DataFrame from dict of array-like or dicts. @@ -1234,7 +1246,7 @@ def from_dict(cls, data, orient="columns", dtype=None, columns=None): return cls(data, index=index, columns=columns, dtype=dtype) - def to_numpy(self, dtype=None, copy=False): + def to_numpy(self, dtype=None, copy=False) -> np.ndarray: """ Convert the DataFrame to a NumPy array. @@ -1438,7 +1450,7 @@ def to_gbq( location=None, progress_bar=True, credentials=None, - ): + ) -> None: """ Write a DataFrame to a Google BigQuery table. @@ -1466,7 +1478,7 @@ def to_gbq( Behavior when the destination table exists. Value can be one of: ``'fail'`` - If table exists, do nothing. + If table exists raise pandas_gbq.gbq.TableCreationError. ``'replace'`` If table exists, drop it, recreate it, and insert data. ``'append'`` @@ -1543,7 +1555,7 @@ def from_records( columns=None, coerce_float=False, nrows=None, - ): + ) -> "DataFrame": """ Convert structured or record ndarray to DataFrame. @@ -1665,7 +1677,9 @@ def from_records( return cls(mgr) - def to_records(self, index=True, column_dtypes=None, index_dtypes=None): + def to_records( + self, index=True, column_dtypes=None, index_dtypes=None + ) -> np.recarray: """ Convert DataFrame to a NumPy record array. @@ -1830,7 +1844,7 @@ def to_records(self, index=True, column_dtypes=None, index_dtypes=None): return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) @classmethod - def _from_arrays(cls, arrays, columns, index, dtype=None): + def _from_arrays(cls, arrays, columns, index, dtype=None) -> "DataFrame": mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) return cls(mgr) @@ -1884,14 +1898,22 @@ def to_stata( variable_labels : dict Dictionary containing columns as keys and variable labels as values. Each label must be 80 characters or smaller. - version : {114, 117}, default 114 - Version to use in the output dta file. Version 114 can be used - read by Stata 10 and later. Version 117 can be read by Stata 13 - or later. Version 114 limits string variables to 244 characters or - fewer while 117 allows strings with lengths up to 2,000,000 - characters. + version : {114, 117, 118, 119, None}, default 114 + Version to use in the output dta file. Set to None to let pandas + decide between 118 or 119 formats depending on the number of + columns in the frame. Version 114 can be read by Stata 10 and + later. Version 117 can be read by Stata 13 or later. Version 118 + is supported in Stata 14 and later. Version 119 is supported in + Stata 15 and later. 
Version 114 limits string variables to 244 + characters or fewer while versions 117 and later allow strings + with lengths up to 2,000,000 characters. Versions 118 and 119 + support Unicode characters, and version 119 supports more than + 32,767 variables. .. versionadded:: 0.23.0 + .. versionchanged:: 1.0.0 + + Added support for formats 118 and 119. convert_strl : list, optional List of column names to convert to string columns to Stata StrL @@ -1925,17 +1947,24 @@ def to_stata( ... 'speed': [350, 18, 361, 15]}) >>> df.to_stata('animals.dta') # doctest: +SKIP """ - kwargs = {} - if version not in (114, 117): - raise ValueError("Only formats 114 and 117 supported.") + if version not in (114, 117, 118, 119, None): + raise ValueError("Only formats 114, 117, 118 and 119 are supported.") if version == 114: if convert_strl is not None: - raise ValueError("strl support is only available when using format 117") + raise ValueError("strl is not supported in format 114") from pandas.io.stata import StataWriter as statawriter - else: + elif version == 117: from pandas.io.stata import StataWriter117 as statawriter + else: # versions 118 and 119 + from pandas.io.stata import StataWriterUTF8 as statawriter + kwargs = {} + if version is None or version >= 117: + # strl conversion is only supported >= 117 kwargs["convert_strl"] = convert_strl + if version is None or version >= 118: + # Specifying the version is only supported for UTF8 (118 or 119) + kwargs["version"] = version writer = statawriter( path, @@ -1951,7 +1980,7 @@ def to_stata( writer.write_file() @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") - def to_feather(self, path): + def to_feather(self, path) -> None: """ Write out the binary feather-format for DataFrames. @@ -1964,6 +1993,36 @@ def to_feather(self, path): to_feather(self, path) + @Appender( + """ + Examples + -------- + >>> df = pd.DataFrame( + ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} + ... ) + >>> print(df.to_markdown()) + | | animal_1 | animal_2 | + |---:|:-----------|:-----------| + | 0 | elk | dog | + | 1 | pig | quetzal | + """ + ) + @Substitution(klass="DataFrame") + @Appender(_shared_docs["to_markdown"]) + def to_markdown( + self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs + ) -> Optional[str]: + kwargs.setdefault("headers", "keys") + kwargs.setdefault("tablefmt", "pipe") + tabulate = import_optional_dependency("tabulate") + result = tabulate.tabulate(self, **kwargs) + if buf is None: + return result + buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode) + assert buf is not None # Help mypy. + buf.writelines(result) + return None + @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_parquet( self, @@ -1973,7 +2032,7 @@ def to_parquet( index=None, partition_cols=None, **kwargs, - ): + ) -> None: """ Write a DataFrame to the binary parquet format. @@ -2108,9 +2167,10 @@ def to_html( A ``border=border`` attribute is included in the opening `` tag. Default ``pd.options.display.html.border``. encoding : str, default "utf-8" - Set character encoding + Set character encoding. .. versionadded:: 1.0 + table_id : str, optional A css id is included in the opening `
` tag if specified. @@ -2163,7 +2223,7 @@ def to_html( def info( self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None - ): + ) -> None: """ Print a concise summary of a DataFrame. @@ -2236,9 +2296,11 @@ def info( RangeIndex: 5 entries, 0 to 4 Data columns (total 3 columns): - int_col 5 non-null int64 - text_col 5 non-null object - float_col 5 non-null float64 + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 dtypes: float64(1), int64(1), object(1) memory usage: 248.0+ bytes @@ -2277,9 +2339,11 @@ def info( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - column_1 1000000 non-null object - column_2 1000000 non-null object - column_3 1000000 non-null object + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 22.9+ MB @@ -2287,9 +2351,11 @@ def info( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - column_1 1000000 non-null object - column_2 1000000 non-null object - column_3 1000000 non-null object + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 188.8 MB """ @@ -2308,6 +2374,7 @@ def info( return cols = self.columns + col_count = len(self.columns) # hack if max_cols is None: @@ -2316,36 +2383,76 @@ def info( max_rows = get_option("display.max_info_rows", len(self) + 1) if null_counts is None: - show_counts = (len(self.columns) <= max_cols) and (len(self) < max_rows) + show_counts = (col_count <= max_cols) and (len(self) < max_rows) else: show_counts = null_counts - exceeds_info_cols = len(self.columns) > max_cols + exceeds_info_cols = col_count > max_cols def _verbose_repr(): lines.append(f"Data columns (total {len(self.columns)} columns):") - space = max(len(pprint_thing(k)) for k in self.columns) + 4 + + id_head = " # " + column_head = "Column" + col_space = 2 + + max_col = max(len(pprint_thing(k)) for k in cols) + len_column = len(pprint_thing(column_head)) + space = max(max_col, len_column) + col_space + + max_id = len(pprint_thing(col_count)) + len_id = len(pprint_thing(id_head)) + space_num = max(max_id, len_id) + col_space counts = None - tmpl = "{count}{dtype}" + header = _put_str(id_head, space_num) + _put_str(column_head, space) if show_counts: counts = self.count() if len(cols) != len(counts): # pragma: no cover raise AssertionError( f"Columns must equal counts ({len(cols)} != {len(counts)})" ) - tmpl = "{count} non-null {dtype}" + count_header = "Non-Null Count" + len_count = len(count_header) + non_null = " non-null" + max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) + space_count = max(len_count, max_count) + col_space + count_temp = "{count}" + non_null + else: + count_header = "" + space_count = len(count_header) + len_count = space_count + count_temp = "{count}" + + dtype_header = "Dtype" + len_dtype = len(dtype_header) + max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes) + space_dtype = max(len_dtype, max_dtypes) + header += _put_str(count_header, space_count) + _put_str( + dtype_header, space_dtype + ) + + lines.append(header) + lines.append( + _put_str("-" * len_id, space_num) + + _put_str("-" * len_column, space) + + _put_str("-" * len_count, 
space_count) + + _put_str("-" * len_dtype, space_dtype) + ) - dtypes = self.dtypes for i, col in enumerate(self.columns): - dtype = dtypes.iloc[i] + dtype = self.dtypes.iloc[i] col = pprint_thing(col) + line_no = _put_str(f" {i}", space_num) count = "" if show_counts: count = counts.iloc[i] lines.append( - _put_str(col, space) + tmpl.format(count=count, dtype=dtype) + line_no + + _put_str(col, space) + + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtype, space_dtype) ) def _non_verbose_repr(): @@ -2391,7 +2498,7 @@ def _sizeof_fmt(num, size_qualifier): lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") fmt.buffer_put_lines(buf, lines) - def memory_usage(self, index=True, deep=False): + def memory_usage(self, index=True, deep=False) -> Series: """ Return the memory usage of each column in bytes. @@ -2485,7 +2592,7 @@ def memory_usage(self, index=True, deep=False): ) return result - def transpose(self, *args, **kwargs): + def transpose(self, *args, copy: bool = False) -> "DataFrame": """ Transpose index and columns. @@ -2495,9 +2602,14 @@ def transpose(self, *args, **kwargs): Parameters ---------- - *args, **kwargs - Additional arguments and keywords have no effect but might be - accepted for compatibility with numpy. + *args : tuple, optional + Accepted for compatibility with NumPy. + copy : bool, default False + Whether to copy the data after transposing, even for DataFrames + with a single dtype. + + Note that a copy is always required for mixed dtype DataFrames, + or for DataFrames with any extension types. Returns ------- @@ -2578,7 +2690,29 @@ def transpose(self, *args, **kwargs): dtype: object """ nv.validate_transpose(args, dict()) - return super().transpose(1, 0, **kwargs) + # construct the args + + dtypes = list(self.dtypes) + if self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]): + # We have EAs with the same dtype. We can preserve that dtype in transpose. + dtype = dtypes[0] + arr_type = dtype.construct_array_type() + values = self.values + + new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] + result = self._constructor( + dict(zip(self.index, new_values)), index=self.columns + ) + + else: + new_values = self.values.T + if copy: + new_values = new_values.copy() + result = self._constructor( + new_values, index=self.columns, columns=self.index + ) + + return result.__finalize__(self) T = property(transpose) @@ -2615,14 +2749,7 @@ def _ixs(self, i: int, axis: int = 0): else: label = self.columns[i] - # if the values returned are not the same length - # as the index (iow a not found value), iget returns - # a 0-len ndarray. This is effectively catching - # a numpy error (as numpy should really raise) values = self._data.iget(i) - - if len(self.index) and not len(values): - values = np.array([np.nan] * len(self.index), dtype=object) result = self._box_col_values(values, label) # this is a cached value, mark it so @@ -2950,18 +3077,27 @@ def query(self, expr, inplace=False, **kwargs): Parameters ---------- expr : str - The query string to evaluate. You can refer to variables + The query string to evaluate. + + You can refer to variables in the environment by prefixing them with an '@' character like ``@a + b``. - .. versionadded:: 0.25.0 - - You can refer to column names that contain spaces by surrounding - them in backticks. + You can refer to column names that contain spaces or operators by + surrounding them in backticks. 
This way you can also escape + names that start with a digit, or those that are a Python keyword. + Basically when it is not valid Python identifier. See notes down + for more details. For example, if one of your columns is called ``a a`` and you want to sum it with ``b``, your query should be ```a a` + b``. + .. versionadded:: 0.25.0 + Backtick quoting introduced. + + .. versionadded:: 1.0.0 + Expanding functionality of backtick quoting for more than only spaces. + inplace : bool Whether the query should modify the data in place or return a modified copy. @@ -3016,6 +3152,32 @@ def query(self, expr, inplace=False, **kwargs): For further details and examples see the ``query`` documentation in :ref:`indexing `. + *Backtick quoted variables* + + Backtick quoted variables are parsed as literal Python code and + are converted internally to a Python valid identifier. + This can lead to the following problems. + + During parsing a number of disallowed characters inside the backtick + quoted string are replaced by strings that are allowed as a Python identifier. + These characters include all operators in Python, the space character, the + question mark, the exclamation mark, the dollar sign, and the euro sign. + For other characters that fall outside the ASCII range (U+0001..U+007F) + and those that are not further specified in PEP 3131, + the query parser will raise an error. + This excludes whitespace different than the space character, + but also the hashtag (as it is used for comments) and the backtick + itself (backtick can also not be escaped). + + In a special case, quotes that make a pair around a backtick can + confuse the parser. + For example, ```it's` > `that's``` will raise an error, + as it forms a quoted string (``'s > `that'``) with a backtick inside. + + See also the Python documentation about lexical analysis + (https://docs.python.org/3/reference/lexical_analysis.html) + in combination with the source code in :mod:`pandas.core.computation.parsing`. + Examples -------- >>> df = pd.DataFrame({'A': range(1, 6), @@ -3165,14 +3327,15 @@ def eval(self, expr, inplace=False, **kwargs): kwargs["level"] = kwargs.pop("level", 0) + 1 if resolvers is None: index_resolvers = self._get_index_resolvers() - column_resolvers = self._get_space_character_free_column_resolvers() + column_resolvers = self._get_cleaned_column_resolvers() resolvers = column_resolvers, index_resolvers if "target" not in kwargs: kwargs["target"] = self kwargs["resolvers"] = kwargs.get("resolvers", ()) + tuple(resolvers) + return _eval(expr, inplace=inplace, **kwargs) - def select_dtypes(self, include=None, exclude=None): + def select_dtypes(self, include=None, exclude=None) -> "DataFrame": """ Return a subset of the DataFrame's columns based on the column dtypes. @@ -3302,7 +3465,7 @@ def extract_unique_dtypes_from_dtypes_set( return self.iloc[:, keep_these.values] - def insert(self, loc, column, value, allow_duplicates=False): + def insert(self, loc, column, value, allow_duplicates=False) -> None: """ Insert column into DataFrame at specified location. @@ -3322,7 +3485,7 @@ def insert(self, loc, column, value, allow_duplicates=False): value = self._sanitize_column(column, value, broadcast=False) self._data.insert(loc, column, value, allow_duplicates=allow_duplicates) - def assign(self, **kwargs): + def assign(self, **kwargs) -> "DataFrame": r""" Assign new columns to a DataFrame. 
@@ -3505,7 +3668,7 @@ def _series(self): for idx, item in enumerate(self.columns) } - def lookup(self, row_labels, col_labels): + def lookup(self, row_labels, col_labels) -> np.ndarray: """ Label-based "fancy indexing" function for DataFrame. @@ -3613,7 +3776,7 @@ def _reindex_columns( allow_dups=False, ) - def _reindex_multi(self, axes, copy, fill_value): + def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame": """ We are guaranteed non-Nones in the axes. """ @@ -3647,7 +3810,7 @@ def align( limit=None, fill_axis=0, broadcast_axis=None, - ): + ) -> "DataFrame": return super().align( other, join=join, @@ -3661,6 +3824,46 @@ def align( broadcast_axis=broadcast_axis, ) + @Appender( + """ + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + + Change the row labels. + + >>> df.set_axis(['a', 'b', 'c'], axis='index') + A B + a 1 4 + b 2 5 + c 3 6 + + Change the column labels. + + >>> df.set_axis(['I', 'II'], axis='columns') + I II + 0 1 4 + 1 2 5 + 2 3 6 + + Now, update the labels inplace. + + >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True) + >>> df + i ii + 0 1 4 + 1 2 5 + 2 3 6 + """ + ) + @Substitution( + **_shared_doc_kwargs, + extended_summary_sub=" column or", + axis_description_sub=", and 1 identifies the columns", + see_also_sub=" or columns", + ) + @Appender(NDFrame.set_axis.__doc__) + def set_axis(self, labels, axis=0, inplace=False): + return super().set_axis(labels, axis=axis, inplace=inplace) + @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.reindex.__doc__) @rewrite_axis_style_signature( @@ -3674,13 +3877,13 @@ def align( ("tolerance", None), ], ) - def reindex(self, *args, **kwargs): + def reindex(self, *args, **kwargs) -> "DataFrame": axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") kwargs.update(axes) # Pop these, since the values are in `kwargs` under different names kwargs.pop("axis", None) kwargs.pop("labels", None) - return super().reindex(**kwargs) + return self._ensure_type(super().reindex(**kwargs)) def drop( self, @@ -3828,7 +4031,19 @@ def drop( "mapper", [("copy", True), ("inplace", False), ("level", None), ("errors", "ignore")], ) - def rename(self, *args, **kwargs): + def rename( + self, + mapper: Optional[Renamer] = None, + *, + index: Optional[Renamer] = None, + columns: Optional[Renamer] = None, + axis: Optional[Axis] = None, + copy: bool = True, + inplace: bool = False, + level: Optional[Level] = None, + errors: str = "ignore", + ) -> Optional["DataFrame"]: + """ Alter axes labels. 
@@ -3937,12 +4152,16 @@ def rename(self, *args, **kwargs): 2 2 5 4 3 6 """ - axes = validate_axis_style_args(self, args, kwargs, "mapper", "rename") - kwargs.update(axes) - # Pop these, since the values are in `kwargs` under different names - kwargs.pop("axis", None) - kwargs.pop("mapper", None) - return super().rename(**kwargs) + return super().rename( + mapper=mapper, + index=index, + columns=columns, + axis=axis, + copy=copy, + inplace=inplace, + level=level, + errors=errors, + ) @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.fillna.__doc__) @@ -3954,8 +4173,7 @@ def fillna( inplace=False, limit=None, downcast=None, - **kwargs, - ): + ) -> Optional["DataFrame"]: return super().fillna( value=value, method=method, @@ -3963,7 +4181,6 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, - **kwargs, ) @Appender(_shared_docs["replace"] % _shared_doc_kwargs) @@ -3986,9 +4203,9 @@ def replace( ) @Appender(_shared_docs["shift"] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0, fill_value=None): - return super().shift( - periods=periods, freq=freq, axis=axis, fill_value=fill_value + def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame": + return self._ensure_type( + super().shift(periods=periods, freq=freq, axis=axis, fill_value=fill_value) ) def set_index( @@ -4093,7 +4310,7 @@ def set_index( "one-dimensional arrays." ) - missing = [] + missing: List[Optional[Hashable]] = [] for col in keys: if isinstance( col, (ABCIndexClass, ABCSeries, np.ndarray, list, abc.Iterator) @@ -4130,7 +4347,7 @@ def set_index( else: arrays.append(self.index) - to_remove = [] + to_remove: List[Optional[Hashable]] = [] for col in keys: if isinstance(col, ABCMultiIndex): for n in range(col.nlevels): @@ -4426,19 +4643,19 @@ def _maybe_casted_values(index, labels=None): # Reindex-based selection methods @Appender(_shared_docs["isna"] % _shared_doc_kwargs) - def isna(self): + def isna(self) -> "DataFrame": return super().isna() @Appender(_shared_docs["isna"] % _shared_doc_kwargs) - def isnull(self): + def isnull(self) -> "DataFrame": return super().isnull() @Appender(_shared_docs["notna"] % _shared_doc_kwargs) - def notna(self): + def notna(self) -> "DataFrame": return super().notna() @Appender(_shared_docs["notna"] % _shared_doc_kwargs) - def notnull(self): + def notnull(self) -> "DataFrame": return super().notnull() def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): @@ -4587,6 +4804,7 @@ def drop_duplicates( subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, keep: Union[str, bool] = "first", inplace: bool = False, + ignore_index: bool = False, ) -> Optional["DataFrame"]: """ Return DataFrame with duplicate rows removed. @@ -4606,6 +4824,10 @@ def drop_duplicates( - False : Drop all duplicates. inplace : bool, default False Whether to drop duplicates in place or to return a copy. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. 
versionadded:: 1.0.0 Returns ------- @@ -4621,9 +4843,16 @@ def drop_duplicates( if inplace: (inds,) = (-duplicated)._ndarray_values.nonzero() new_data = self._data.take(inds) + + if ignore_index: + new_data.axes[1] = ibase.default_index(len(inds)) self._update_inplace(new_data) else: - return self[-duplicated] + result = self[-duplicated] + + if ignore_index: + result.index = ibase.default_index(len(result)) + return result return None @@ -4704,6 +4933,7 @@ def sort_values( inplace=False, kind="quicksort", na_position="last", + ignore_index=False, ): inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) @@ -4737,23 +4967,60 @@ def sort_values( indexer, axis=self._get_block_manager_axis(axis), verify=False ) + if ignore_index: + new_data.axes[1] = ibase.default_index(len(indexer)) + if inplace: return self._update_inplace(new_data) else: return self._constructor(new_data).__finalize__(self) - @Substitution(**_shared_doc_kwargs) - @Appender(NDFrame.sort_index.__doc__) def sort_index( self, axis=0, level=None, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - sort_remaining=True, + ascending: bool = True, + inplace: bool = False, + kind: str = "quicksort", + na_position: str = "last", + sort_remaining: bool = True, + ignore_index: bool = False, ): + """ + Sort object by labels (along an axis). + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis along which to sort. The value 0 identifies the rows, + and 1 identifies the columns. + level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + ascending : bool, default True + Sort ascending vs. descending. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See also ndarray.np.sort for more + information. `mergesort` is the only stable algorithm. For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. + Not implemented for MultiIndex. + sort_remaining : bool, default True + If True and sorting by level and index is multilevel, sort by other + levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 + + Returns + ------- + sorted_obj : DataFrame or None + DataFrame with sorted index if inplace=False, None otherwise. + """ # TODO: this can be combined with Series.sort_index impl as # almost identical @@ -4803,12 +5070,15 @@ def sort_index( # reconstruct axis if needed new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() + if ignore_index: + new_data.axes[1] = ibase.default_index(len(indexer)) + if inplace: return self._update_inplace(new_data) else: return self._constructor(new_data).__finalize__(self) - def nlargest(self, n, columns, keep="first"): + def nlargest(self, n, columns, keep="first") -> "DataFrame": """ Return the first `n` rows ordered by `columns` in descending order. 
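A small sketch of the ``ignore_index`` option added above to ``drop_duplicates``, ``sort_values`` and ``sort_index``; the toy frame ``df`` is assumed:

    import pandas as pd

    df = pd.DataFrame({"a": [2, 1, 1]}, index=[10, 20, 30])

    # without ignore_index the original labels are kept
    df.sort_values("a")                     # index: 20, 30, 10

    # with ignore_index the result is relabelled 0, 1, ..., n - 1
    df.sort_values("a", ignore_index=True)  # index: 0, 1, 2
    df.drop_duplicates("a", ignore_index=True)
    df.sort_index(ignore_index=True)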
@@ -4917,7 +5187,7 @@ def nlargest(self, n, columns, keep="first"): """ return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() - def nsmallest(self, n, columns, keep="first"): + def nsmallest(self, n, columns, keep="first") -> "DataFrame": """ Return the first `n` rows ordered by `columns` in ascending order. @@ -5018,7 +5288,7 @@ def nsmallest(self, n, columns, keep="first"): self, n=n, keep=keep, columns=columns ).nsmallest() - def swaplevel(self, i=-2, j=-1, axis=0): + def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": """ Swap levels i and j in a MultiIndex on a particular axis. @@ -5040,7 +5310,7 @@ def swaplevel(self, i=-2, j=-1, axis=0): result.columns = result.columns.swaplevel(i, j) return result - def reorder_levels(self, order, axis=0): + def reorder_levels(self, order, axis=0) -> "DataFrame": """ Rearrange index levels using input order. May not drop or duplicate levels. @@ -5054,7 +5324,7 @@ def reorder_levels(self, order, axis=0): Returns ------- - type of caller (new object) + DataFrame """ axis = self._get_axis_number(axis) if not isinstance(self._get_axis(axis), ABCMultiIndex): # pragma: no cover @@ -5072,7 +5342,7 @@ def reorder_levels(self, order, axis=0): # Arithmetic / combination related def _combine_frame(self, other, func, fill_value=None, level=None): - this, other = self.align(other, join="outer", level=level, copy=False) + # at this point we have `self._indexed_same(other)` if fill_value is None: # since _arith_op may be called in a loop, avoid function call @@ -5088,14 +5358,15 @@ def _arith_op(left, right): left, right = ops.fill_binop(left, right, fill_value) return func(left, right) - if ops.should_series_dispatch(this, other, func): + if ops.should_series_dispatch(self, other, func): # iterate over columns - new_data = ops.dispatch_to_series(this, other, _arith_op) + new_data = ops.dispatch_to_series(self, other, _arith_op) else: with np.errstate(all="ignore"): - res_values = _arith_op(this.values, other.values) - new_data = dispatch_fill_zeros(func, this.values, other.values, res_values) - return this._construct_result(new_data) + res_values = _arith_op(self.values, other.values) + new_data = dispatch_fill_zeros(func, self.values, other.values, res_values) + + return new_data def _combine_match_index(self, other, func): # at this point we have `self.index.equals(other.index)` @@ -5127,7 +5398,9 @@ def _construct_result(self, result) -> "DataFrame": out.columns = self.columns return out - def combine(self, other, func, fill_value=None, overwrite=True): + def combine( + self, other: "DataFrame", func, fill_value=None, overwrite=True + ) -> "DataFrame": """ Perform column-wise combine with another DataFrame. @@ -5294,7 +5567,7 @@ def combine(self, other, func, fill_value=None, overwrite=True): # convert_objects just in case return self._constructor(result, index=new_index, columns=new_columns) - def combine_first(self, other): + def combine_first(self, other: "DataFrame") -> "DataFrame": """ Update null elements with value in the same location in `other`. @@ -5372,7 +5645,7 @@ def combiner(x, y): def update( self, other, join="left", overwrite=True, filter_func=None, errors="ignore" - ): + ) -> None: """ Modify in place using non-NA values from another DataFrame. @@ -5523,6 +5796,83 @@ def update( # ---------------------------------------------------------------------- # Data reshaping + @Appender( + """ +Examples +-------- +>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', +... 'Parrot', 'Parrot'], +... 
'Max Speed': [380., 370., 24., 26.]}) +>>> df + Animal Max Speed +0 Falcon 380.0 +1 Falcon 370.0 +2 Parrot 24.0 +3 Parrot 26.0 +>>> df.groupby(['Animal']).mean() + Max Speed +Animal +Falcon 375.0 +Parrot 25.0 + +**Hierarchical Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... ['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, +... index=index) +>>> df + Max Speed +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +>>> df.groupby(level=0).mean() + Max Speed +Animal +Falcon 370.0 +Parrot 25.0 +>>> df.groupby(level="Type").mean() + Max Speed +Type +Captive 210.0 +Wild 185.0 +""" + ) + @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "DataFrameGroupBy": + from pandas.core.groupby.generic import DataFrameGroupBy + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return DataFrameGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) _shared_docs[ "pivot" @@ -5631,7 +5981,7 @@ def update( @Substitution("") @Appender(_shared_docs["pivot"]) - def pivot(self, index=None, columns=None, values=None): + def pivot(self, index=None, columns=None, values=None) -> "DataFrame": from pandas.core.reshape.pivot import pivot return pivot(self, index=index, columns=columns, values=values) @@ -5778,7 +6128,7 @@ def pivot_table( dropna=True, margins_name="All", observed=False, - ): + ) -> "DataFrame": from pandas.core.reshape.pivot import pivot_table return pivot_table( @@ -6208,7 +6558,7 @@ def melt( var_name=None, value_name="value", col_level=None, - ): + ) -> "DataFrame": from pandas.core.reshape.melt import melt return melt( @@ -6223,7 +6573,7 @@ def melt( # ---------------------------------------------------------------------- # Time series-related - def diff(self, periods=1, axis=0): + def diff(self, periods=1, axis=0) -> "DataFrame": """ First discrete difference of element. @@ -6431,7 +6781,7 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): agg = aggregate @Appender(_shared_docs["transform"] % _shared_doc_kwargs) - def transform(self, func, axis=0, *args, **kwargs): + def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame": axis = self._get_axis_number(axis) if axis == 1: return self.T.transform(func, *args, **kwargs).T @@ -6586,7 +6936,7 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): ) return op.get_result() - def applymap(self, func): + def applymap(self, func) -> "DataFrame": """ Apply a function to a Dataframe elementwise. @@ -6655,7 +7005,9 @@ def infer(x): # ---------------------------------------------------------------------- # Merging / joining methods - def append(self, other, ignore_index=False, verify_integrity=False, sort=False): + def append( + self, other, ignore_index=False, verify_integrity=False, sort=False + ) -> "DataFrame": """ Append rows of `other` to the end of caller, returning a new object. 
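An illustrative sketch of the ``DataFrame.groupby`` override above, which returns a ``DataFrameGroupBy`` directly and still requires either ``by`` or ``level``; the data is assumed:

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})

    gb = df.groupby("key")   # DataFrameGroupBy
    gb.mean()                # val: a -> 1.5, b -> 3.0

    # neither `by` nor `level` supplied -> TypeError, as raised above
    # df.groupby()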
@@ -6750,11 +7102,13 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=False): """ if isinstance(other, (Series, dict)): if isinstance(other, dict): + if not ignore_index: + raise TypeError("Can only append a dict if ignore_index=True") other = Series(other) if other.name is None and not ignore_index: raise TypeError( - "Can only append a Series if ignore_index=True" - " or if the Series has a name" + "Can only append a Series if ignore_index=True " + "or if the Series has a name" ) index = Index([other.name], name=self.index.name) @@ -6782,7 +7136,7 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=False): from pandas.core.reshape.concat import concat if isinstance(other, (list, tuple)): - to_concat = [self] + other + to_concat = [self, *other] else: to_concat = [self, other] return concat( @@ -6792,7 +7146,9 @@ def append(self, other, ignore_index=False, verify_integrity=False, sort=False): sort=sort, ) - def join(self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False): + def join( + self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False + ) -> "DataFrame": """ Join columns of another DataFrame. @@ -6983,7 +7339,7 @@ def merge( copy=True, indicator=False, validate=None, - ): + ) -> "DataFrame": from pandas.core.reshape.merge import merge return merge( @@ -7002,7 +7358,7 @@ def merge( validate=validate, ) - def round(self, decimals=0, *args, **kwargs): + def round(self, decimals=0, *args, **kwargs) -> "DataFrame": """ Round a DataFrame to a variable number of decimal places. @@ -7116,7 +7472,7 @@ def _series_round(s, decimals): # ---------------------------------------------------------------------- # Statistical methods, etc. - def corr(self, method="pearson", min_periods=1): + def corr(self, method="pearson", min_periods=1) -> "DataFrame": """ Compute pairwise correlation of columns, excluding NA/null values. @@ -7204,7 +7560,7 @@ def corr(self, method="pearson", min_periods=1): return self._constructor(correl, index=idx, columns=cols) - def cov(self, min_periods=None): + def cov(self, min_periods=None) -> "DataFrame": """ Compute pairwise covariance of columns, excluding NA/null values. @@ -7314,7 +7670,7 @@ def cov(self, min_periods=None): return self._constructor(baseCov, index=idx, columns=cols) - def corrwith(self, other, axis=0, drop=False, method="pearson"): + def corrwith(self, other, axis=0, drop=False, method="pearson") -> Series: """ Compute pairwise correlation. @@ -7586,6 +7942,26 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data + if numeric_only is not None and axis in [0, 1]: + df = self + if numeric_only is True: + df = _get_data(axis_matters=True) + if axis == 1: + df = df.T + axis = 0 + + out_dtype = "bool" if filter_type == "bool" else None + + # After possibly _get_data and transposing, we are now in the + # simple case where we can use BlockManager._reduce + res = df._data.reduce(op, axis=1, skipna=skipna, **kwds) + assert isinstance(res, dict) + if len(res): + assert len(res) == max(list(res.keys())) + 1, res.keys() + out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) + out.index = df.columns + return out + if numeric_only is None: values = self.values try: @@ -7650,7 +8026,7 @@ def _get_data(axis_matters): result = Series(result, index=labels) return result - def nunique(self, axis=0, dropna=True): + def nunique(self, axis=0, dropna=True) -> Series: """ Count distinct observations over requested axis. 
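A sketch of the tightened ``DataFrame.append`` behaviour above: appending a plain dict is only allowed together with ``ignore_index=True``; the frame and row are assumed examples:

    import pandas as pd

    df = pd.DataFrame({"A": [1], "B": [2]})

    df.append({"A": 3, "B": 4}, ignore_index=True)   # ok, new row labelled 1

    # df.append({"A": 3, "B": 4})
    # TypeError: Can only append a dict if ignore_index=True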
@@ -7690,7 +8066,7 @@ def nunique(self, axis=0, dropna=True): """ return self.apply(Series.nunique, axis=axis, dropna=dropna) - def idxmin(self, axis=0, skipna=True): + def idxmin(self, axis=0, skipna=True) -> Series: """ Return index of first occurrence of minimum over requested axis. @@ -7699,7 +8075,7 @@ def idxmin(self, axis=0, skipna=True): Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -7728,7 +8104,7 @@ def idxmin(self, axis=0, skipna=True): result = [index[i] if i >= 0 else np.nan for i in indices] return Series(result, index=self._get_agg_axis(axis)) - def idxmax(self, axis=0, skipna=True): + def idxmax(self, axis=0, skipna=True) -> Series: """ Return index of first occurrence of maximum over requested axis. @@ -7737,7 +8113,7 @@ def idxmax(self, axis=0, skipna=True): Parameters ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -7777,7 +8153,7 @@ def _get_agg_axis(self, axis_num): else: raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") - def mode(self, axis=0, numeric_only=False, dropna=True): + def mode(self, axis=0, numeric_only=False, dropna=True) -> "DataFrame": """ Get the mode(s) of each element along the selected axis. @@ -7960,7 +8336,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): return result - def to_timestamp(self, freq=None, how="start", axis=0, copy=True): + def to_timestamp(self, freq=None, how="start", axis=0, copy=True) -> "DataFrame": """ Cast to DatetimeIndex of timestamps, at *beginning* of period. @@ -7994,7 +8370,7 @@ def to_timestamp(self, freq=None, how="start", axis=0, copy=True): return self._constructor(new_data) - def to_period(self, freq=None, axis=0, copy=True): + def to_period(self, freq=None, axis=0, copy=True) -> "DataFrame": """ Convert DataFrame from DatetimeIndex to PeriodIndex. @@ -8028,7 +8404,7 @@ def to_period(self, freq=None, axis=0, copy=True): return self._constructor(new_data) - def isin(self, values): + def isin(self, values) -> "DataFrame": """ Whether each element in the DataFrame is contained in values. 
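For the return-type annotations above (``nunique``, ``idxmin`` and ``idxmax`` now declared to return ``Series``), a small assumed example of the shapes involved:

    import pandas as pd

    df = pd.DataFrame({"x": [1, 3, 2], "y": [9, 1, 1]})

    df.nunique()        # Series: x -> 3, y -> 2
    df.idxmax()         # Series: x -> 1, y -> 0 (row labels of the maxima)
    df.idxmin(axis=1)   # Series with one column label per row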
@@ -8095,12 +8471,14 @@ def isin(self, values): from pandas.core.reshape.concat import concat values = collections.defaultdict(list, values) - return concat( - ( - self.iloc[:, [i]].isin(values[col]) - for i, col in enumerate(self.columns) - ), - axis=1, + return self._ensure_type( + concat( + ( + self.iloc[:, [i]].isin(values[col]) + for i, col in enumerate(self.columns) + ), + axis=1, + ) ) elif isinstance(values, Series): if not values.index.is_unique: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c24f09e338b6c..7b216c53c68cf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8,6 +8,7 @@ import re from textwrap import dedent from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -19,6 +20,7 @@ Sequence, Set, Tuple, + Type, Union, ) import warnings @@ -28,8 +30,17 @@ from pandas._config import config -from pandas._libs import Timestamp, iNaT, properties -from pandas._typing import Dtype, FilePathOrBuffer, FrameOrSeries, JSONSerializable +from pandas._libs import Timestamp, iNaT, lib, properties +from pandas._typing import ( + Axis, + Dtype, + FilePathOrBuffer, + FrameOrSeries, + JSONSerializable, + Label, + Level, + Renamer, +) from pandas.compat import set_function_name from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv @@ -92,6 +103,9 @@ from pandas.io.formats.printing import pprint_thing from pandas.tseries.frequencies import to_offset +if TYPE_CHECKING: + from pandas.core.resample import Resampler + # goal is to be able to define the docs close to function, while still being # able to share _shared_docs: Dict[str, str] = dict() @@ -105,10 +119,6 @@ Name or list of names to sort by""", ) -# sentinel value to use as kwarg in place of None when None has special meaning -# and needs to be distinguished from a user explicitly passing None. -sentinel = object() - def _single_replace(self, to_replace, method, inplace, limit): """ @@ -143,7 +153,7 @@ def _single_replace(self, to_replace, method, inplace, limit): bool_t = bool # Need alias because NDFrame has def bool: -class NDFrame(PandasObject, SelectionMixin): +class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): """ N-dimensional analogue of DataFrame. Store multi-dimensional in a size-mutable, labeled data structure @@ -172,7 +182,7 @@ class NDFrame(PandasObject, SelectionMixin): ] _internal_names_set: Set[str] = set(_internal_names) _accessors: Set[str] = set() - _deprecations: FrozenSet[str] = frozenset(["get_values", "ix"]) + _deprecations: FrozenSet[str] = frozenset(["get_values"]) _metadata: List[str] = [] _is_copy = None _data: BlockManager @@ -234,6 +244,10 @@ def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): def attrs(self) -> Dict[Optional[Hashable], Any]: """ Dictionary of global attributes on this object. + + .. warning:: + + attrs is experimental and may change without warning. """ if self._attrs is None: self._attrs = {} @@ -252,8 +266,8 @@ def _validate_dtype(self, dtype): # a compound dtype if dtype.kind == "V": raise NotImplementedError( - "compound dtypes are not implemented" - f" in the {type(self).__name__} constructor" + "compound dtypes are not implemented " + f"in the {type(self).__name__} constructor" ) return dtype @@ -262,7 +276,7 @@ def _validate_dtype(self, dtype): # Construction @property - def _constructor(self): + def _constructor(self: FrameOrSeries) -> Type[FrameOrSeries]: """Used when a manipulation result has the same dimensions as the original. 
""" @@ -298,7 +312,7 @@ def _constructor_expanddim(self): _AXIS_LEN: int @classmethod - def _setup_axes(cls, axes: List[str], docs: Dict[str, str]): + def _setup_axes(cls, axes: List[str], docs: Dict[str, str]) -> None: """ Provide axes setup for the major PandasObjects. @@ -337,13 +351,6 @@ def _construct_axes_dict(self, axes=None, **kwargs): d.update(kwargs) return d - @staticmethod - def _construct_axes_dict_from(self, axes, **kwargs): - """Return an axes dictionary for the passed axes.""" - d = {a: ax for a, ax in zip(self._AXIS_ORDERS, axes)} - d.update(kwargs) - return d - def _construct_axes_from_arguments( self, args, kwargs, require_all: bool = False, sentinel=None ): @@ -372,18 +379,6 @@ def _construct_axes_from_arguments( axes = {a: kwargs.pop(a, sentinel) for a in self._AXIS_ORDERS} return axes, kwargs - @classmethod - def _from_axes(cls, data, axes, **kwargs): - # for construction from BlockManager - if isinstance(data, BlockManager): - return cls(data, **kwargs) - else: - if cls._AXIS_REVERSED: - axes = axes[::-1] - d = cls._construct_axes_dict_from(cls, axes, copy=False) - d.update(kwargs) - return cls(data, **d) - @classmethod def _get_axis_number(cls, axis): axis = cls._AXIS_ALIASES.get(axis, axis) @@ -423,7 +418,7 @@ def _get_block_manager_axis(cls, axis): return m - axis return axis - def _get_axis_resolvers(self, axis): + def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]: # index or columns axis_index = getattr(self, axis) d = dict() @@ -453,22 +448,31 @@ def _get_axis_resolvers(self, axis): d[axis] = dindex return d - def _get_index_resolvers(self): - d = {} + def _get_index_resolvers(self) -> Dict[str, ABCSeries]: + from pandas.core.computation.parsing import clean_column_name + + d: Dict[str, ABCSeries] = {} for axis_name in self._AXIS_ORDERS: d.update(self._get_axis_resolvers(axis_name)) - return d - def _get_space_character_free_column_resolvers(self): - """Return the space character free column resolvers of a dataframe. + return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)} - Column names with spaces are 'cleaned up' so that they can be referred - to by backtick quoting. + def _get_cleaned_column_resolvers(self) -> Dict[str, ABCSeries]: + """ + Return the special character free column resolvers of a dataframe. + + Column names with special characters are 'cleaned up' so that they can + be referred to by backtick quoting. Used in :meth:`DataFrame.eval`. """ - from pandas.core.computation.common import _remove_spaces_column_name + from pandas.core.computation.parsing import clean_column_name - return {_remove_spaces_column_name(k): v for k, v in self.items()} + if isinstance(self, ABCSeries): + return {clean_column_name(self.name): self} + + return { + clean_column_name(k): v for k, v in self.items() if not isinstance(k, int) + } @property def _info_axis(self): @@ -486,7 +490,7 @@ def shape(self) -> Tuple[int, ...]: return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS) @property - def axes(self): + def axes(self) -> List[Index]: """ Return index label(s) of the internal NDFrame """ @@ -555,7 +559,7 @@ def set_axis(self, labels, axis=0, inplace=False): """ Assign desired index to given axis. - Indexes for column or row labels can be changed by assigning + Indexes for%(extended_summary_sub)s row labels can be changed by assigning a list-like or Index. .. versionchanged:: 0.21.0 @@ -570,9 +574,8 @@ def set_axis(self, labels, axis=0, inplace=False): labels : list-like, Index The values for the new index. 
- axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to update. The value 0 identifies the rows, and 1 - identifies the columns. + axis : %(axes_single_arg)s, default 0 + The axis to update. The value 0 identifies the rows%(axis_description_sub)s. inplace : bool, default False Whether to return a new %(klass)s instance. @@ -580,57 +583,14 @@ def set_axis(self, labels, axis=0, inplace=False): Returns ------- renamed : %(klass)s or None - An object of same type as caller if inplace=False, None otherwise. + An object of type %(klass)s if inplace=False, None otherwise. See Also -------- - DataFrame.rename_axis : Alter the name of the index or columns. + %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s. Examples -------- - **Series** - - >>> s = pd.Series([1, 2, 3]) - >>> s - 0 1 - 1 2 - 2 3 - dtype: int64 - - >>> s.set_axis(['a', 'b', 'c'], axis=0) - a 1 - b 2 - c 3 - dtype: int64 - - **DataFrame** - - >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - - Change the row labels. - - >>> df.set_axis(['a', 'b', 'c'], axis='index') - A B - a 1 4 - b 2 5 - c 3 6 - - Change the column labels. - - >>> df.set_axis(['I', 'II'], axis='columns') - I II - 0 1 4 - 1 2 5 - 2 3 6 - - Now, update the labels inplace. - - >>> df.set_axis(['i', 'ii'], axis='columns', inplace=True) - >>> df - i ii - 0 1 4 - 1 2 5 - 2 3 6 """ if inplace: setattr(self, self._get_axis_name(axis), labels) @@ -639,55 +599,11 @@ def set_axis(self, labels, axis=0, inplace=False): obj.set_axis(labels, axis=axis, inplace=True) return obj - def _set_axis(self, axis, labels): + def _set_axis(self, axis, labels) -> None: self._data.set_axis(axis, labels) self._clear_item_cache() - def transpose(self, *args, **kwargs): - """ - Permute the dimensions of the %(klass)s - - Parameters - ---------- - args : %(args_transpose)s - copy : bool, default False - Make a copy of the underlying data. Mixed-dtype data will - always result in a copy - **kwargs - Additional keyword arguments will be passed to the function. - - Returns - ------- - y : same as input - - Examples - -------- - >>> p.transpose(2, 0, 1) - >>> p.transpose(2, 0, 1, copy=True) - """ - - # construct the args - axes, kwargs = self._construct_axes_from_arguments( - args, kwargs, require_all=True - ) - axes_names = tuple(self._get_axis_name(axes[a]) for a in self._AXIS_ORDERS) - axes_numbers = tuple(self._get_axis_number(axes[a]) for a in self._AXIS_ORDERS) - - # we must have unique axes - if len(axes) != len(set(axes)): - raise ValueError(f"Must specify {self._AXIS_LEN} unique axes") - - new_axes = self._construct_axes_dict_from( - self, [self._get_axis(x) for x in axes_names] - ) - new_values = self.values.transpose(axes_numbers) - if kwargs.pop("copy", None) or (len(args) and args[-1]): - new_values = new_values.copy() - - nv.validate_transpose(tuple(), kwargs) - return self._constructor(new_values, **new_axes).__finalize__(self) - - def swapaxes(self, axis1, axis2, copy=True): + def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries: """ Interchange axes and swap values axes appropriately. @@ -712,7 +628,7 @@ def swapaxes(self, axis1, axis2, copy=True): return self._constructor(new_values, *new_axes).__finalize__(self) - def droplevel(self, level, axis=0): + def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries: """ Return DataFrame with requested index / column level(s) removed. 
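The shared ``set_axis`` docstring above is now specialised per class through the substitution placeholders; as an assumed illustration, the Series form of the examples that were moved out of ``generic.py``:

    import pandas as pd

    s = pd.Series([1, 2, 3])
    s.set_axis(["a", "b", "c"], axis=0)                 # relabelled copy
    s.set_axis(["a", "b", "c"], axis=0, inplace=True)   # modifies s in place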
@@ -772,7 +688,7 @@ def droplevel(self, level, axis=0): result = self.set_axis(new_labels, axis=axis, inplace=False) return result - def pop(self, item): + def pop(self: FrameOrSeries, item) -> FrameOrSeries: """ Return item and drop from frame. Raise KeyError if not found. @@ -933,29 +849,21 @@ def squeeze(self, axis=None): ) ] - def swaplevel(self, i=-2, j=-1, axis=0): - """ - Swap levels i and j in a MultiIndex on a particular axis - - Parameters - ---------- - i, j : int, str (can be mixed) - Level of index to be swapped. Can pass level name as string. - - Returns - ------- - swapped : same type as caller (new object) - """ - axis = self._get_axis_number(axis) - result = self.copy() - labels = result._data.axes[axis] - result._data.set_axis(axis, labels.swaplevel(i, j)) - return result - # ---------------------------------------------------------------------- # Rename - def rename(self, *args, **kwargs): + def rename( + self: FrameOrSeries, + mapper: Optional[Renamer] = None, + *, + index: Optional[Renamer] = None, + columns: Optional[Renamer] = None, + axis: Optional[Axis] = None, + copy: bool = True, + inplace: bool = False, + level: Optional[Level] = None, + errors: str = "ignore", + ) -> Optional[FrameOrSeries]: """ Alter axes input function or functions. Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left @@ -1068,44 +976,46 @@ def rename(self, *args, **kwargs): See the :ref:`user guide ` for more. """ - axes, kwargs = self._construct_axes_from_arguments(args, kwargs) - copy = kwargs.pop("copy", True) - inplace = kwargs.pop("inplace", False) - level = kwargs.pop("level", None) - axis = kwargs.pop("axis", None) - errors = kwargs.pop("errors", "ignore") - if axis is not None: - # Validate the axis - self._get_axis_number(axis) - - if kwargs: - raise TypeError( - "rename() got an unexpected keyword " - f'argument "{list(kwargs.keys())[0]}"' - ) - - if com.count_not_none(*axes.values()) == 0: + if mapper is None and index is None and columns is None: raise TypeError("must pass an index to rename") - self._consolidate_inplace() + if index is not None or columns is not None: + if axis is not None: + raise TypeError( + "Cannot specify both 'axis' and any of 'index' or 'columns'" + ) + elif mapper is not None: + raise TypeError( + "Cannot specify both 'mapper' and any of 'index' or 'columns'" + ) + else: + # use the mapper argument + if axis and self._get_axis_number(axis) == 1: + columns = mapper + else: + index = mapper + result = self if inplace else self.copy(deep=copy) - # start in the axis order to eliminate too many copies - for axis in range(self._AXIS_LEN): - v = axes.get(self._AXIS_NAMES[axis]) - if v is None: + for axis_no, replacements in enumerate((index, columns)): + if replacements is None: continue - f = com.get_rename_function(v) - baxis = self._get_block_manager_axis(axis) + + ax = self._get_axis(axis_no) + baxis = self._get_block_manager_axis(axis_no) + f = com.get_rename_function(replacements) + if level is not None: - level = self.axes[axis]._get_level_number(level) + level = ax._get_level_number(level) # GH 13473 - if not callable(v): - indexer = self.axes[axis].get_indexer_for(v) + if not callable(replacements): + indexer = ax.get_indexer_for(replacements) if errors == "raise" and len(indexer[indexer == -1]): missing_labels = [ - label for index, label in enumerate(v) if indexer[index] == -1 + label + for index, label in enumerate(replacements) + if indexer[index] == -1 ] raise KeyError(f"{missing_labels} not found in 
axis") @@ -1116,11 +1026,12 @@ def rename(self, *args, **kwargs): if inplace: self._update_inplace(result._data) + return None else: return result.__finalize__(self) @rewrite_axis_style_signature("mapper", [("copy", True), ("inplace", False)]) - def rename_axis(self, mapper=sentinel, **kwargs): + def rename_axis(self, mapper=lib.no_default, **kwargs): """ Set the name of the axis for the index or columns. @@ -1245,7 +1156,7 @@ class name monkey 2 2 """ axes, kwargs = self._construct_axes_from_arguments( - (), kwargs, sentinel=sentinel + (), kwargs, sentinel=lib.no_default ) copy = kwargs.pop("copy", True) inplace = kwargs.pop("inplace", False) @@ -1261,7 +1172,7 @@ class name inplace = validate_bool_kwarg(inplace, "inplace") - if mapper is not sentinel: + if mapper is not lib.no_default: # Use v0.23 behavior if a scalar or list non_mapper = is_scalar(mapper) or ( is_list_like(mapper) and not is_dict_like(mapper) @@ -1277,7 +1188,7 @@ class name for axis in range(self._AXIS_LEN): v = axes.get(self._AXIS_NAMES[axis]) - if v is sentinel: + if v is lib.no_default: continue non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v)) if non_mapper: @@ -1517,10 +1428,10 @@ def bool(self): self.__nonzero__() - def __abs__(self): + def __abs__(self: FrameOrSeries) -> FrameOrSeries: return self.abs() - def __round__(self, decimals=0): + def __round__(self: FrameOrSeries, decimals: int = 0) -> FrameOrSeries: return self.round(decimals) # ------------------------------------------------------------------------- @@ -1709,8 +1620,7 @@ def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: multi_message = ( "\n" "For a multi-index, the label must be a " - "tuple with elements corresponding to " - "each level." + "tuple with elements corresponding to each level." ) else: multi_message = "" @@ -1916,10 +1826,15 @@ def empty(self) -> bool_t: # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented __array_priority__ = 1000 - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: return com.values_from_object(self) def __array_wrap__(self, result, context=None): + result = lib.item_from_zerodim(result) + if is_scalar(result): + # e.g. we get here with np.ptp(series) + # ptp also requires the item_from_zerodim + return result d = self._construct_axes_dict(self._AXIS_ORDERS, copy=False) return self._constructor(result, **d).__finalize__(self) @@ -1969,9 +1884,9 @@ def __setstate__(self, state): object.__setattr__(self, k, v) else: - self._unpickle_series_compat(state) + raise NotImplementedError("Pre-0.12 pickles are no longer supported") elif len(state) == 2: - self._unpickle_series_compat(state) + raise NotImplementedError("Pre-0.12 pickles are no longer supported") self._item_cache = {} @@ -2009,6 +1924,30 @@ def _repr_data_resource_(self): # ---------------------------------------------------------------------- # I/O Methods + _shared_docs[ + "to_markdown" + ] = """ + Print %(klass)s in Markdown-friendly format. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + buf : writable buffer, defaults to sys.stdout + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + mode : str, optional + Mode in which file is opened. + **kwargs + These parameters will be passed to `tabulate`. + + Returns + ------- + str + %(klass)s in Markdown-friendly format. 
+ """ + _shared_docs[ "to_excel" ] = """ @@ -2138,7 +2077,7 @@ def to_excel( inf_rep="inf", verbose=True, freeze_panes=None, - ): + ) -> None: df = self if isinstance(self, ABCDataFrame) else self.to_frame() from pandas.io.formats.excel import ExcelFormatter @@ -2362,7 +2301,7 @@ def to_hdf( data_columns: Optional[List[str]] = None, errors: str = "strict", encoding: str = "UTF-8", - ): + ) -> None: """ Write the contained data to an HDF5 file using HDFStore. @@ -2706,7 +2645,9 @@ def to_pickle( to_pickle(self, path, compression=compression, protocol=protocol) - def to_clipboard(self, excel: bool_t = True, sep: Optional[str] = None, **kwargs): + def to_clipboard( + self, excel: bool_t = True, sep: Optional[str] = None, **kwargs + ) -> None: r""" Copy object to the system clipboard. @@ -3025,10 +2966,10 @@ def to_csv( sep: str = ",", na_rep: str = "", float_format: Optional[str] = None, - columns: Optional[Sequence[Optional[Hashable]]] = None, + columns: Optional[Sequence[Label]] = None, header: Union[bool_t, List[str]] = True, index: bool_t = True, - index_label: Optional[Union[bool_t, str, Sequence[Optional[Hashable]]]] = None, + index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None, mode: str = "w", encoding: Optional[str] = None, compression: Optional[Union[str, Mapping[str, str]]] = "infer", @@ -3188,16 +3129,6 @@ def to_csv( return None - # ---------------------------------------------------------------------- - # Fancy Indexing - - @classmethod - def _create_indexer(cls, name: str, indexer) -> None: - """Create an indexer like _name in the class.""" - if getattr(cls, name, None) is None: - _indexer = functools.partial(indexer, name) - setattr(cls, name, property(_indexer, doc=indexer.__doc__)) - # ---------------------------------------------------------------------- # Lookup Caching @@ -3274,7 +3205,9 @@ def _clear_item_cache(self) -> None: # ---------------------------------------------------------------------- # Indexing Methods - def take(self, indices, axis=0, is_copy: bool_t = True, **kwargs): + def take( + self: FrameOrSeries, indices, axis=0, is_copy: Optional[bool_t] = None, **kwargs + ) -> FrameOrSeries: """ Return the elements in the given *positional* indices along an axis. @@ -3291,6 +3224,8 @@ def take(self, indices, axis=0, is_copy: bool_t = True, **kwargs): selecting rows, ``1`` means that we are selecting columns. is_copy : bool, default True Whether to return a copy of the original object or not. + + .. deprecated:: 1.0.0 **kwargs For compatibility with :meth:`numpy.take`. Has no effect on the output. @@ -3349,6 +3284,16 @@ class max_speed 1 monkey mammal NaN 3 lion mammal 80.5 """ + if is_copy is not None: + warnings.warn( + "is_copy is deprecated and will be removed in a future version. " + "take will always return a copy in the future.", + FutureWarning, + stacklevel=2, + ) + else: + is_copy = True + nv.validate_take(tuple(), kwargs) self._consolidate_inplace() @@ -3557,7 +3502,7 @@ def _iget_item_cache(self, item): def _box_item_values(self, key, values): raise AbstractMethodError(self) - def _slice(self, slobj: slice, axis=0, kind=None): + def _slice(self: FrameOrSeries, slobj: slice, axis=0, kind=None) -> FrameOrSeries: """ Construct a slice of this container. 
@@ -3577,14 +3522,12 @@ def _set_item(self, key, value) -> None: self._data.set(key, value) self._clear_item_cache() - def _set_is_copy(self, ref=None, copy: bool_t = True) -> None: + def _set_is_copy(self, ref, copy: bool_t = True) -> None: if not copy: self._is_copy = None else: - if ref is not None: - self._is_copy = weakref.ref(ref) - else: - self._is_copy = None + assert ref is not None + self._is_copy = weakref.ref(ref) def _check_is_chained_assignment_possible(self) -> bool_t: """ @@ -3663,7 +3606,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): "A value is trying to be set on a copy of a slice from a " "DataFrame\n\n" "See the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "https://pandas.pydata.org/pandas-docs/stable/user_guide/" "indexing.html#returning-a-view-versus-a-copy" ) @@ -3674,7 +3617,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): "DataFrame.\n" "Try using .loc[row_indexer,col_indexer] = value " "instead\n\nSee the caveats in the documentation: " - "http://pandas.pydata.org/pandas-docs/stable/user_guide/" + "https://pandas.pydata.org/pandas-docs/stable/user_guide/" "indexing.html#returning-a-view-versus-a-copy" ) @@ -3683,7 +3626,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): elif value == "warn": warnings.warn(t, com.SettingWithCopyWarning, stacklevel=stacklevel) - def __delitem__(self, key): + def __delitem__(self, key) -> None: """ Delete item """ @@ -3745,13 +3688,13 @@ def _is_view(self): return self._data.is_view def reindex_like( - self, + self: FrameOrSeries, other, method: Optional[str] = None, copy: bool_t = True, limit=None, tolerance=None, - ): + ) -> FrameOrSeries: """ Return an object with matching indices as other object. @@ -3893,7 +3836,9 @@ def drop( else: return obj - def _drop_axis(self, labels, axis, level=None, errors: str = "raise"): + def _drop_axis( + self: FrameOrSeries, labels, axis, level=None, errors: str = "raise" + ) -> FrameOrSeries: """ Drop labels from specified axis. Used in the ``drop`` method internally. @@ -3963,7 +3908,7 @@ def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None: self._data = getattr(result, "_data", result) self._maybe_update_cacher(verify_is_copy=verify_is_copy) - def add_prefix(self, prefix: str): + def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries: """ Prefix labels with string `prefix`. @@ -4020,9 +3965,9 @@ def add_prefix(self, prefix: str): f = functools.partial("{prefix}{}".format, prefix=prefix) mapper = {self._info_axis_name: f} - return self.rename(**mapper) + return self.rename(**mapper) # type: ignore - def add_suffix(self, suffix: str): + def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: """ Suffix labels with string `suffix`. @@ -4079,16 +4024,16 @@ def add_suffix(self, suffix: str): f = functools.partial("{}{suffix}".format, suffix=suffix) mapper = {self._info_axis_name: f} - return self.rename(**mapper) + return self.rename(**mapper) # type: ignore def sort_values( self, - by=None, axis=0, ascending=True, inplace: bool_t = False, kind: str = "quicksort", na_position: str = "last", + ignore_index: bool_t = False, ): """ Sort by the values along either axis. @@ -4111,6 +4056,10 @@ def sort_values( na_position : {'first', 'last'}, default 'last' Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. 
versionadded:: 1.0.0 Returns ------- @@ -4179,65 +4128,7 @@ def sort_values( """ raise AbstractMethodError(self) - def sort_index( - self, - axis=0, - level=None, - ascending: bool_t = True, - inplace: bool_t = False, - kind: str = "quicksort", - na_position: str = "last", - sort_remaining: bool_t = True, - ): - """ - Sort object by labels (along an axis). - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis along which to sort. The value 0 identifies the rows, - and 1 identifies the columns. - level : int or level name or list of ints or list of level names - If not None, sort on values in specified index level(s). - ascending : bool, default True - Sort ascending vs. descending. - inplace : bool, default False - If True, perform operation in-place. - kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' - Choice of sorting algorithm. See also ndarray.np.sort for more - information. `mergesort` is the only stable algorithm. For - DataFrames, this option is only applied when sorting on a single - column or label. - na_position : {'first', 'last'}, default 'last' - Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. - Not implemented for MultiIndex. - sort_remaining : bool, default True - If True and sorting by level and index is multilevel, sort by other - levels too (in order) after sorting by specified level. - - Returns - ------- - sorted_obj : DataFrame or None - DataFrame with sorted index if inplace=False, None otherwise. - """ - inplace = validate_bool_kwarg(inplace, "inplace") - axis = self._get_axis_number(axis) - axis_name = self._get_axis_name(axis) - labels = self._get_axis(axis) - - if level is not None: - raise NotImplementedError("level is not implemented") - if inplace: - raise NotImplementedError("inplace is not implemented") - - sort_index = labels.argsort() - if not ascending: - sort_index = sort_index[::-1] - - new_axis = labels.take(sort_index) - return self.reindex(**{axis_name: new_axis}) - - def reindex(self, *args, **kwargs): + def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: """ Conform %(klass)s to new index with optional filling logic. @@ -4485,7 +4376,9 @@ def reindex(self, *args, **kwargs): axes, level, limit, tolerance, method, fill_value, copy ).__finalize__(self) - def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): + def _reindex_axes( + self: FrameOrSeries, axes, level, limit, tolerance, method, fill_value, copy + ) -> FrameOrSeries: """Perform the reindex for all the axes.""" obj = self for a in self._AXIS_ORDERS: @@ -4521,12 +4414,12 @@ def _reindex_multi(self, axes, copy, fill_value): raise AbstractMethodError(self) def _reindex_with_indexers( - self, + self: FrameOrSeries, reindexers, fill_value=None, copy: bool_t = False, allow_dups: bool_t = False, - ): + ) -> FrameOrSeries: """allow_dups indicates an internal call here """ # reindex doing multiple operations on different axes if indicated @@ -4558,12 +4451,12 @@ def _reindex_with_indexers( return self._constructor(new_data).__finalize__(self) def filter( - self, + self: FrameOrSeries, items=None, like: Optional[str] = None, regex: Optional[str] = None, axis=None, - ): + ) -> FrameOrSeries: """ Subset the dataframe rows or columns according to the specified index labels. @@ -4662,6 +4555,9 @@ def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: on position. It is useful for quickly testing if your object has the right type of data in it. 
+ For negative values of `n`, this function returns all rows except + the last `n` rows, equivalent to ``df[:-n]``. + Parameters ---------- n : int, default 5 @@ -4669,7 +4565,7 @@ def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: Returns ------- - obj_head : same type as caller + same type as caller The first `n` rows of the caller object. See Also @@ -4709,6 +4605,17 @@ def head(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: 0 alligator 1 bee 2 falcon + + For negative values of `n` + + >>> df.head(-3) + animal + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot """ return self.iloc[:n] @@ -4721,6 +4628,9 @@ def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: position. It is useful for quickly verifying data, for example, after sorting or appending rows. + For negative values of `n`, this function returns all rows except + the first `n` rows, equivalent to ``df[n:]``. + Parameters ---------- n : int, default 5 @@ -4768,6 +4678,17 @@ def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: 6 shark 7 whale 8 zebra + + For negative values of `n` + + >>> df.tail(-3) + animal + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra """ if n == 0: @@ -4775,14 +4696,14 @@ def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: return self.iloc[-n:] def sample( - self, + self: FrameOrSeries, n=None, frac=None, replace=False, weights=None, random_state=None, axis=None, - ): + ) -> FrameOrSeries: """ Return a random sample of items from an axis of object. @@ -4970,7 +4891,7 @@ def sample( ) locs = rs.choice(axis_length, size=n, replace=replace, p=weights) - return self.take(locs, axis=axis, is_copy=False) + return self.take(locs, axis=axis) _shared_docs[ "pipe" @@ -5289,11 +5210,6 @@ def _is_numeric_mixed_type(self): f = lambda: self._data.is_numeric_mixed_type return self._protect_consolidate(f) - @property - def _is_datelike_mixed_type(self): - f = lambda: self._data.is_datelike_mixed_type - return self._protect_consolidate(f) - def _check_inplace_setting(self, value) -> bool_t: """ check whether we allow in-place setting with this type of value """ @@ -5321,7 +5237,7 @@ def _get_bool_data(self): # Internal Interface Methods @property - def values(self): + def values(self) -> np.ndarray: """ Return a Numpy representation of the DataFrame. @@ -5398,16 +5314,11 @@ def values(self): return self._data.as_array(transpose=self._AXIS_REVERSED) @property - def _values(self): + def _values(self) -> np.ndarray: """internal implementation""" return self.values - @property - def _get_values(self): - # compat - return self.values - - def _internal_get_values(self): + def _internal_get_values(self) -> np.ndarray: """ Return an ndarray after converting sparse values to dense. @@ -5471,7 +5382,9 @@ def _to_dict_of_blocks(self, copy: bool_t = True): for k, v, in self._data.to_dict(copy=copy).items() } - def astype(self, dtype, copy: bool_t = True, errors: str = "raise"): + def astype( + self: FrameOrSeries, dtype, copy: bool_t = True, errors: str = "raise" + ) -> FrameOrSeries: """ Cast a pandas object to a specified dtype ``dtype``. @@ -5779,7 +5692,7 @@ def _convert( ) ).__finalize__(self) - def infer_objects(self): + def infer_objects(self: FrameOrSeries) -> FrameOrSeries: """ Attempt to infer better dtypes for object columns. 
@@ -6382,8 +6295,8 @@ def replace( if not is_dict_like(to_replace): if not is_dict_like(regex): raise TypeError( - 'If "to_replace" and "value" are both None' - ' and "to_replace" is not a list, then ' + 'If "to_replace" and "value" are both None ' + 'and "to_replace" is not a list, then ' "regex must be a mapping" ) to_replace = regex @@ -6397,9 +6310,8 @@ def replace( if any(are_mappings): if not all(are_mappings): raise TypeError( - "If a nested mapping is passed, all values" - " of the top level mapping must be " - "mappings" + "If a nested mapping is passed, all values " + "of the top level mapping must be mappings" ) # passed a nested dict/Series to_rep_dict = {} @@ -6919,8 +6831,7 @@ def asof(self, where, subset=None): if not is_list: start = self.index[0] if isinstance(self.index, PeriodIndex): - where = Period(where, freq=self.index.freq).ordinal - start = start.ordinal + where = Period(where, freq=self.index.freq) if where < start: if not is_series: @@ -6965,7 +6876,8 @@ def asof(self, where, subset=None): # mask the missing missing = locs == -1 - data = self.take(locs, is_copy=False) + d = self.take(locs) + data = d.copy() data.index = where data.loc[missing] = np.nan return data if is_list else data.iloc[-1] @@ -7036,11 +6948,11 @@ def asof(self, where, subset=None): """ @Appender(_shared_docs["isna"] % _shared_doc_kwargs) - def isna(self): + def isna(self: FrameOrSeries) -> FrameOrSeries: return isna(self).__finalize__(self) @Appender(_shared_docs["isna"] % _shared_doc_kwargs) - def isnull(self): + def isnull(self: FrameOrSeries) -> FrameOrSeries: return isna(self).__finalize__(self) _shared_docs[ @@ -7106,11 +7018,11 @@ def isnull(self): """ @Appender(_shared_docs["notna"] % _shared_doc_kwargs) - def notna(self): + def notna(self: FrameOrSeries) -> FrameOrSeries: return notna(self).__finalize__(self) @Appender(_shared_docs["notna"] % _shared_doc_kwargs) - def notnull(self): + def notnull(self: FrameOrSeries) -> FrameOrSeries: return notna(self).__finalize__(self) def _clip_with_scalar(self, lower, upper, inplace: bool_t = False): @@ -7162,14 +7074,14 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): return self.where(subset, threshold, axis=axis, inplace=inplace) def clip( - self, + self: FrameOrSeries, lower=None, upper=None, axis=None, inplace: bool_t = False, *args, **kwargs, - ): + ) -> FrameOrSeries: """ Trim values at input threshold(s). @@ -7283,19 +7195,10 @@ def clip( return result - def groupby( - self, - by=None, - axis=0, - level=None, - as_index: bool_t = True, - sort: bool_t = True, - group_keys: bool_t = True, - squeeze: bool_t = False, - observed: bool_t = False, - ): - """ - Group DataFrame or Series using a mapper or by a Series of columns. + _shared_docs[ + "groupby" + ] = """ + Group %(klass)s using a mapper or by a Series of columns. A groupby operation involves some combination of splitting the object, applying a function, and combining the results. This can be @@ -7340,9 +7243,8 @@ def groupby( Returns ------- - DataFrameGroupBy or SeriesGroupBy - Depends on the calling object and returns groupby object that - contains information about the groups. + %(klass)sGroupBy + Returns a groupby object that contains information about the groups. See Also -------- @@ -7352,79 +7254,17 @@ def groupby( Notes ----- See the `user guide - `_ for more. - - Examples - -------- - >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 
'Max Speed': [380., 370., 24., 26.]}) - >>> df - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 - >>> df.groupby(['Animal']).mean() - Max Speed - Animal - Falcon 375.0 - Parrot 25.0 - - **Hierarchical Indexes** - - We can groupby different levels of a hierarchical index - using the `level` parameter: - - >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) - >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - ... index=index) - >>> df - Max Speed - Animal Type - Falcon Captive 390.0 - Wild 350.0 - Parrot Captive 30.0 - Wild 20.0 - >>> df.groupby(level=0).mean() - Max Speed - Animal - Falcon 370.0 - Parrot 25.0 - >>> df.groupby(level=1).mean() - Max Speed - Type - Captive 210.0 - Wild 185.0 - """ - from pandas.core.groupby.groupby import get_groupby - - if level is None and by is None: - raise TypeError("You have to supply one of 'by' and 'level'") - axis = self._get_axis_number(axis) - - return get_groupby( - self, - by=by, - axis=axis, - level=level, - as_index=as_index, - sort=sort, - group_keys=group_keys, - squeeze=squeeze, - observed=observed, - ) + `_ for more. + """ def asfreq( - self, + self: FrameOrSeries, freq, method=None, how: Optional[str] = None, normalize: bool_t = False, fill_value=None, - ): + ) -> FrameOrSeries: """ Convert TimeSeries to specified frequency. @@ -7463,7 +7303,7 @@ def asfreq( Notes ----- To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- @@ -7527,7 +7367,9 @@ def asfreq( fill_value=fill_value, ) - def at_time(self, time, asof: bool_t = False, axis=None): + def at_time( + self: FrameOrSeries, time, asof: bool_t = False, axis=None + ) -> FrameOrSeries: """ Select values at particular time of day (e.g. 9:30AM). @@ -7584,13 +7426,13 @@ def at_time(self, time, asof: bool_t = False, axis=None): return self.take(indexer, axis=axis) def between_time( - self, + self: FrameOrSeries, start_time, end_time, include_start: bool_t = True, include_end: bool_t = True, axis=None, - ): + ) -> FrameOrSeries: """ Select values between particular times of the day (e.g., 9:00-9:30 AM). @@ -7677,7 +7519,7 @@ def resample( base: int = 0, on=None, level=None, - ): + ) -> "Resampler": """ Resample time-series data. @@ -7740,7 +7582,7 @@ def resample( for more. To learn more about the offset strings, please see `this link - `__. + `__. Examples -------- @@ -7942,10 +7784,10 @@ def resample( 2000-01-04 36 90 """ - from pandas.core.resample import resample + from pandas.core.resample import get_resampler axis = self._get_axis_number(axis) - return resample( + return get_resampler( self, freq=rule, label=label, @@ -7959,7 +7801,7 @@ def resample( level=level, ) - def first(self, offset): + def first(self: FrameOrSeries, offset) -> FrameOrSeries: """ Method to subset initial periods of time series data based on a date offset. @@ -8014,14 +7856,14 @@ def first(self, offset): end_date = end = self.index[0] + offset # Tick-like, e.g. 3 weeks - if not offset.isAnchored() and hasattr(offset, "_inc"): + if not offset.is_anchored() and hasattr(offset, "_inc"): if end_date in self.index: end = self.index.searchsorted(end_date, side="left") return self.iloc[:end] return self.loc[:end] - def last(self, offset): + def last(self: FrameOrSeries, offset) -> FrameOrSeries: """ Method to subset final periods of time series data based on a date offset. 
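For the ``first``/``last`` changes above (now going through ``DateOffset.is_anchored``), a small assumed time-series example of the subsetting they perform:

    import pandas as pd

    idx = pd.date_range("2020-01-01", periods=6, freq="D")
    ts = pd.DataFrame({"v": range(6)}, index=idx)

    ts.first("3D")   # rows for Jan 1-3
    ts.last("2D")    # rows for Jan 5-6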
@@ -8380,8 +8222,12 @@ def _align_frame( ) if method is not None: - left = left.fillna(axis=fill_axis, method=method, limit=limit) - right = right.fillna(axis=fill_axis, method=method, limit=limit) + left = self._ensure_type( + left.fillna(method=method, axis=fill_axis, limit=limit) + ) + right = self._ensure_type( + right.fillna(method=method, axis=fill_axis, limit=limit) + ) # if DatetimeIndex have different tz, convert to UTC if is_datetime64tz_dtype(left.index): @@ -8874,7 +8720,9 @@ def mask( """ @Appender(_shared_docs["shift"] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0, fill_value=None): + def shift( + self: FrameOrSeries, periods=1, freq=None, axis=0, fill_value=None + ) -> FrameOrSeries: if periods == 0: return self.copy() @@ -8925,7 +8773,9 @@ def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: return new_obj.__finalize__(self) - def tshift(self, periods: int = 1, freq=None, axis=0): + def tshift( + self: FrameOrSeries, periods: int = 1, freq=None, axis=0 + ) -> FrameOrSeries: """ Shift the time index, using the index's frequency if available. @@ -8974,11 +8824,10 @@ def tshift(self, periods: int = 1, freq=None, axis=0): new_data = self._data.copy() new_data.axes[block_axis] = index.shift(periods) elif orig_freq is not None: - msg = ( - f"Given freq {freq.rule_code} does not match" - f" PeriodIndex freq {orig_freq.rule_code}" + raise ValueError( + f"Given freq {freq.rule_code} does not match " + f"PeriodIndex freq {orig_freq.rule_code}" ) - raise ValueError(msg) else: new_data = self._data.copy() new_data.axes[block_axis] = index.shift(periods, freq) @@ -9368,7 +9217,7 @@ def _tz_localize(ax, tz, ambiguous, nonexistent): # ---------------------------------------------------------------------- # Numeric Methods - def abs(self): + def abs(self: FrameOrSeries) -> FrameOrSeries: """ Return a Series/DataFrame with absolute numeric value of each element. @@ -9437,7 +9286,9 @@ def abs(self): """ return np.abs(self) - def describe(self, percentiles=None, include=None, exclude=None): + def describe( + self: FrameOrSeries, percentiles=None, include=None, exclude=None + ) -> FrameOrSeries: """ Generate descriptive statistics. @@ -9773,7 +9624,7 @@ def describe_1d(data): ldesc = [describe_1d(s) for _, s in data.items()] # set a convenient order for rows - names = [] + names: List[Optional[Hashable]] = [] ldesc_indexes = sorted((x.index for x in ldesc), key=len) for idxnames in ldesc_indexes: for name in idxnames: @@ -9902,20 +9753,29 @@ def describe_1d(data): """ @Appender(_shared_docs["pct_change"] % _shared_doc_kwargs) - def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, **kwargs): + def pct_change( + self: FrameOrSeries, + periods=1, + fill_method="pad", + limit=None, + freq=None, + **kwargs, + ) -> FrameOrSeries: # TODO: Not sure if above is correct - need someone to confirm. 
axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name)) if fill_method is None: data = self else: - data = self.fillna(method=fill_method, limit=limit, axis=axis) + data = self._ensure_type( + self.fillna(method=fill_method, axis=axis, limit=limit) + ) rs = data.div(data.shift(periods=periods, freq=freq, axis=axis, **kwargs)) - 1 - rs = rs.loc[~rs.index.duplicated()] - rs = rs.reindex_like(data) - if freq is None: - mask = isna(com.values_from_object(data)) - np.putmask(rs.values, mask, np.nan) + if freq is not None: + # Shift method is implemented differently when freq is not None + # We want to restore the original index + rs = rs.loc[~rs.index.duplicated()] + rs = rs.reindex_like(data) return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): @@ -10166,40 +10026,6 @@ def mad(self, axis=None, skipna=None, level=None): _min_examples, ) - @classmethod - def _add_series_only_operations(cls): - """ - Add the series only operations to the cls; evaluate the doc - strings again. - """ - - axis_descr, name, name2 = _doc_parms(cls) - - def nanptp(values, axis=0, skipna=True): - nmax = nanops.nanmax(values, axis, skipna) - nmin = nanops.nanmin(values, axis, skipna) - warnings.warn( - "Method .ptp is deprecated and will be removed " - "in a future version. Use numpy.ptp instead.", - FutureWarning, - stacklevel=4, - ) - return nmax - nmin - - cls.ptp = _make_stat_function( - cls, - "ptp", - name, - name2, - axis_descr, - """Return the difference between the min and max value. - \n.. deprecated:: 0.24.0 Use numpy.ptp instead - \nReturn the difference between the maximum value and the - minimum value in the object. This is the equivalent of the - ``numpy.ndarray`` method ``ptp``.""", - nanptp, - ) - @classmethod def _add_series_or_dataframe_operations(cls): """ @@ -11130,19 +10956,64 @@ def cum_func(self, axis=None, skipna=True, *args, **kwargs): else: axis = self._get_axis_number(axis) - y = com.values_from_object(self).copy() - - if skipna and issubclass(y.dtype.type, (np.datetime64, np.timedelta64)): - result = accum_func(y, axis) - mask = isna(self) - np.putmask(result, mask, iNaT) - elif skipna and not issubclass(y.dtype.type, (np.integer, np.bool_)): - mask = isna(self) - np.putmask(y, mask, mask_a) - result = accum_func(y, axis) - np.putmask(result, mask, mask_b) - else: - result = accum_func(y, axis) + if axis == 1: + return cum_func(self.T, axis=0, skipna=skipna, *args, **kwargs).T + + def na_accum_func(blk_values): + # We will be applying this function to block values + if blk_values.dtype.kind in ["m", "M"]: + # GH#30460, GH#29058 + # numpy 1.18 started sorting NaTs at the end instead of beginning, + # so we need to work around to maintain backwards-consistency. + orig_dtype = blk_values.dtype + + # We need to define mask before masking NaTs + mask = isna(blk_values) + + if accum_func == np.minimum.accumulate: + # Note: the accum_func comparison fails as an "is" comparison + y = blk_values.view("i8") + y[mask] = np.iinfo(np.int64).max + changed = True + else: + y = blk_values + changed = False + + result = accum_func(y.view("i8"), axis) + if skipna: + np.putmask(result, mask, iNaT) + elif accum_func == np.minimum.accumulate: + # Restore NaTs that we masked previously + nz = (~np.asarray(mask)).nonzero()[0] + if len(nz): + # everything up to the first non-na entry stays NaT + result[: nz[0]] = iNaT + + if changed: + # restore NaT elements + y[mask] = iNaT # TODO: could try/finally for this? 
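Stepping out of the diff for a moment: the `pct_change` rework above only restores the original index when `freq` is passed; the default path is plain division by the shifted values. A tiny sketch with made-up numbers:

    import pandas as pd

    s = pd.Series([100.0, 110.0, 121.0])
    s.pct_change()            # NaN, 0.10, 0.10
    s.pct_change(periods=2)   # NaN, NaN, 0.21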
+ + if isinstance(blk_values, np.ndarray): + result = result.view(orig_dtype) + else: + # DatetimeArray + result = type(blk_values)._from_sequence(result, dtype=orig_dtype) + + elif skipna and not issubclass( + blk_values.dtype.type, (np.integer, np.bool_) + ): + vals = blk_values.copy().T + mask = isna(vals) + np.putmask(vals, mask, mask_a) + result = accum_func(vals, axis) + np.putmask(result, mask, mask_b) + else: + result = accum_func(blk_values.T, axis) + + # transpose back for ndarray, not for EA + return result.T if hasattr(result, "T") else result + + result = self._data.apply(na_accum_func) d = self._construct_axes_dict() d["copy"] = False @@ -11182,8 +11053,3 @@ def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs ) return set_function_name(logical_func, name, cls) - - -# install the indexes -for _name, _indexer in indexing.get_indexers_list(): - NDFrame._create_indexer(_name, _indexer) diff --git a/pandas/core/groupby/__init__.py b/pandas/core/groupby/__init__.py index 252f20ed40068..0c5d2658978b4 100644 --- a/pandas/core/groupby/__init__.py +++ b/pandas/core/groupby/__init__.py @@ -1,7 +1,11 @@ -from pandas.core.groupby.generic import ( # noqa: F401 - DataFrameGroupBy, - NamedAgg, - SeriesGroupBy, -) -from pandas.core.groupby.groupby import GroupBy # noqa: F401 -from pandas.core.groupby.grouper import Grouper # noqa: F401 +from pandas.core.groupby.generic import DataFrameGroupBy, NamedAgg, SeriesGroupBy +from pandas.core.groupby.groupby import GroupBy +from pandas.core.groupby.grouper import Grouper + +__all__ = [ + "DataFrameGroupBy", + "NamedAgg", + "SeriesGroupBy", + "GroupBy", + "Grouper", +] diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dc343f670b725..c49677fa27a31 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -5,7 +5,7 @@ These are user facing as the result of the ``df.groupby(...)`` operations, which here returns a DataFrameGroupBy object. """ -from collections import OrderedDict, abc, namedtuple +from collections import abc, defaultdict, namedtuple import copy from functools import partial from textwrap import dedent @@ -14,14 +14,18 @@ TYPE_CHECKING, Any, Callable, + Dict, FrozenSet, Iterable, + List, Mapping, Sequence, + Tuple, Type, Union, cast, ) +import warnings import numpy as np @@ -304,7 +308,7 @@ def _aggregate_multiple_funcs(self, arg): arg = zip(columns, arg) - results = OrderedDict() + results = {} for name, func in arg: obj = self @@ -323,7 +327,7 @@ def _aggregate_multiple_funcs(self, arg): return DataFrame(results, columns=columns) def _wrap_series_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, + self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. 
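The `na_accum_func` added above works around NumPy 1.18 sorting NaT at the end; the user-visible behaviour it preserves looks like this (sample series assumed, not part of the patch):

    import pandas as pd

    s = pd.Series(pd.to_datetime(["2020-01-02", None, "2020-01-01"]))
    s.cummin()               # 2020-01-02, NaT, 2020-01-01  (NaT is skipped by default)
    s.cummin(skipna=False)   # 2020-01-02, NaT, NaT         (NaT propagates)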
@@ -441,7 +445,7 @@ def _get_index() -> Index: return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): - result = OrderedDict() + result = {} for name, group in self: group.name = name @@ -807,6 +811,9 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None): periods=periods, fill_method=fill_method, limit=limit, freq=freq ) ) + if fill_method is None: # GH30463 + fill_method = "pad" + limit = 0 filled = getattr(self, fill_method)(limit=limit) fill_grp = filled.groupby(self.grouper.codes) shifted = fill_grp.shift(periods=periods, freq=freq) @@ -993,25 +1000,25 @@ def _iterate_slices(self) -> Iterable[Series]: def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ) -> DataFrame: - agg_items, agg_blocks = self._cython_agg_blocks( + agg_blocks, agg_items = self._cython_agg_blocks( how, alt=alt, numeric_only=numeric_only, min_count=min_count ) return self._wrap_agged_blocks(agg_blocks, items=agg_items) def _cython_agg_blocks( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 - ): + ) -> "Tuple[List[Block], Index]": # TODO: the actual managing of mgr_locs is a PITA # here, it should happen via BlockManager.combine - data = self._get_data_to_aggregate() + data: BlockManager = self._get_data_to_aggregate() if numeric_only: data = data.get_numeric_data(copy=False) - new_blocks = [] - new_items = [] - deleted_items = [] + agg_blocks: List[Block] = [] + new_items: List[np.ndarray] = [] + deleted_items: List[np.ndarray] = [] no_result = object() for block in data.blocks: # Avoid inheriting result from earlier in the loop @@ -1077,20 +1084,20 @@ def _cython_agg_blocks( # reshape to be valid for non-Extension Block result = result.reshape(1, -1) - newb = block.make_block(result) + agg_block: Block = block.make_block(result) new_items.append(locs) - new_blocks.append(newb) + agg_blocks.append(agg_block) - if len(new_blocks) == 0: + if not agg_blocks: raise DataError("No numeric types to aggregate") # reset the locs in the blocks to correspond to our # current ordering indexer = np.concatenate(new_items) - new_items = data.items.take(np.sort(indexer)) + agg_items = data.items.take(np.sort(indexer)) - if len(deleted_items): + if deleted_items: # we need to adjust the indexer to account for the # items we have removed @@ -1103,12 +1110,12 @@ def _cython_agg_blocks( indexer = (ai - mask.cumsum())[indexer] offset = 0 - for b in new_blocks: - loc = len(b.mgr_locs) - b.mgr_locs = indexer[offset : (offset + loc)] + for blk in agg_blocks: + loc = len(blk.mgr_locs) + blk.mgr_locs = indexer[offset : (offset + loc)] offset += loc - return new_items, new_blocks + return agg_blocks, agg_items def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: if self.grouper.nkeys != 1: @@ -1117,7 +1124,7 @@ def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: axis = self.axis obj = self._obj_with_exclusions - result: OrderedDict = OrderedDict() + result: Dict[Union[int, str], Union[NDFrame, np.ndarray]] = {} if axis != obj._info_axis_number: for name, data in self: fres = func(data, *args, **kwargs) @@ -1134,7 +1141,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: # only for axis==0 obj = self._obj_with_exclusions - result: OrderedDict = OrderedDict() + result: Dict[Union[int, str], NDFrame] = {} cannot_agg = [] for item in obj: data = obj[item] @@ -1572,6 +1579,19 @@ def filter(self, func, dropna=True, *args, **kwargs): return self._apply_filter(indices, dropna) + 
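GH30463 (patched above) makes `fill_method=None` usable on grouped `pct_change`. A rough sketch of the difference, with invented data:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "a"], "val": [1.0, np.nan, 2.0]})
    df.groupby("key")["val"].pct_change()                  # pads the NaN within the group first
    df.groupby("key")["val"].pct_change(fill_method=None)  # computes on the unfilled values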
def __getitem__(self, key): + # per GH 23566 + if isinstance(key, tuple) and len(key) > 1: + # if len == 1, then it becomes a SeriesGroupBy and this is actually + # valid syntax, so don't raise warning + warnings.warn( + "Indexing with multiple keys (implicitly converted to a tuple " + "of keys) will be deprecated, use a list instead.", + FutureWarning, + stacklevel=2, + ) + return super().__getitem__(key) + def _gotitem(self, key, ndim: int, subset=None): """ sub-classes to define @@ -1615,7 +1635,7 @@ def _wrap_frame_output(self, result, obj) -> DataFrame: else: return DataFrame(result, index=obj.index, columns=result_index) - def _get_data_to_aggregate(self): + def _get_data_to_aggregate(self) -> BlockManager: obj = self._obj_with_exclusions if self.axis == 1: return obj.T._data @@ -1872,7 +1892,7 @@ def _normalize_keyword_aggregation(kwargs): Normalize user-provided "named aggregation" kwargs. Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs - to the old OrderedDict[str, List[scalar]]]. + to the old Dict[str, List[scalar]]]. Parameters ---------- @@ -1890,24 +1910,19 @@ def _normalize_keyword_aggregation(kwargs): Examples -------- >>> _normalize_keyword_aggregation({'output': ('input', 'sum')}) - (OrderedDict([('input', ['sum'])]), ('output',), [('input', 'sum')]) + ({'input': ['sum']}, ('output',), [('input', 'sum')]) """ # Normalize the aggregation functions as Mapping[column, List[func]], # process normally, then fixup the names. - # TODO(Py35): When we drop python 3.5, change this to - # defaultdict(list) - # TODO: aggspec type: typing.OrderedDict[str, List[AggScalar]] + # TODO: aggspec type: typing.Dict[str, List[AggScalar]] # May be hitting https://github.com/python/mypy/issues/5958 # saying it doesn't have an attribute __name__ - aggspec = OrderedDict() + aggspec = defaultdict(list) order = [] columns, pairs = list(zip(*kwargs.items())) for name, (column, aggfunc) in zip(columns, pairs): - if column in aggspec: - aggspec[column].append(aggfunc) - else: - aggspec[column] = [aggfunc] + aggspec[column].append(aggfunc) order.append((column, com.get_callable_name(aggfunc) or aggfunc)) # uniquify aggfunc name if duplicated in order list diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b5325d8305249..a8c96840ff17b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -4,7 +4,7 @@ class providing the base-class of operations. The SeriesGroupBy and DataFrameGroupBy sub-class (defined in pandas.core.groupby.generic) -expose these user-facing objects to provide specific functionailty. +expose these user-facing objects to provide specific functionality. """ from contextlib import contextmanager @@ -236,7 +236,7 @@ class providing the base-class of operations. 
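The new `DataFrameGroupBy.__getitem__` above (GH 23566) starts warning on multi-key tuple selection; an illustrative before/after, with invented data:

    import pandas as pd

    df = pd.DataFrame({"A": [1, 1, 2], "B": [3, 4, 5], "C": [6, 7, 8]})
    df.groupby("A")["B", "C"].sum()    # FutureWarning: use a list of keys instead
    df.groupby("A")[["B", "C"]].sum()  # preferred spelling, no warning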
Notes ----- See more `here -`_ +`_ Examples -------- @@ -325,7 +325,7 @@ def f(self): f.__name__ = "plot" return self._groupby.apply(f) - def __getattr__(self, name): + def __getattr__(self, name: str): def attr(*args, **kwargs): def f(self): return getattr(self.plot, name)(*args, **kwargs) @@ -485,8 +485,8 @@ def get_converter(s): except KeyError: # turns out it wasn't a tuple msg = ( - "must supply a same-length tuple to get_group" - " with multiple grouping keys" + "must supply a same-length tuple to get_group " + "with multiple grouping keys" ) raise ValueError(msg) @@ -570,7 +570,7 @@ def _set_result_index_ordered(self, result): def _dir_additions(self): return self.obj._dir_additions() | self._apply_whitelist - def __getattr__(self, attr): + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) if attr in self.obj: @@ -969,22 +969,17 @@ def reset_identity(values): result = concat(values, axis=self.axis) ax = self._selected_obj._get_axis(self.axis) - if isinstance(result, Series): - result = result.reindex(ax) + # this is a very unfortunate situation + # we can't use reindex to restore the original order + # when the ax has duplicates + # so we resort to this + # GH 14776, 30667 + if ax.has_duplicates: + indexer, _ = result.index.get_indexer_non_unique(ax.values) + indexer = algorithms.unique1d(indexer) + result = result.take(indexer, axis=self.axis) else: - - # this is a very unfortunate situation - # we have a multi-index that is NOT lexsorted - # and we have a result which is duplicated - # we can't reindex, so we resort to this - # GH 14776 - if isinstance(ax, MultiIndex) and not ax.is_unique: - indexer = algorithms.unique1d( - result.index.get_indexer_for(ax.values) - ) - result = result.take(indexer, axis=self.axis) - else: - result = result.reindex(ax, axis=self.axis) + result = result.reindex(ax, axis=self.axis) elif self.group_keys: @@ -1937,21 +1932,22 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: # >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :] # but this hits https://github.com/pandas-dev/pandas/issues/10710 # which doesn't reorder the list-like `q` on the inner level. - order = np.roll(list(range(result.index.nlevels)), -1) - result = result.reorder_levels(order) - result = result.reindex(q, level=-1) + order = list(range(1, result.index.nlevels)) + [0] + + # temporarily saves the index names + index_names = np.array(result.index.names) - # fix order. 
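The reworded `get_group` error above fires when grouping by multiple keys but passing a wrong-shaped label; the expected call shape, with invented data:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "y", "x"], "val": [10, 20, 30]})
    g = df.groupby(["a", "b"])
    g.get_group((1, "x"))   # a same-length tuple selects one group
    # g.get_group((1,))     # ValueError: must supply a same-length tuple to get_group
    #                       # with multiple grouping keys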
- hi = len(q) * self.ngroups - arr = np.arange(0, hi, self.ngroups) - arrays = [] + # set index names to positions to avoid confusion + result.index.names = np.arange(len(index_names)) - for i in range(self.ngroups): - arr2 = arr + i - arrays.append(arr2) + # place quantiles on the inside + result = result.reorder_levels(order) + + # restore the index names in order + result.index.names = index_names[order] - indices = np.concatenate(arrays) - assert len(indices) == len(result) + # reorder rows to keep things sorted + indices = np.arange(len(result)).reshape([len(q), self.ngroups]).T.flatten() return result.take(indices) @Substitution(name="groupby") @@ -2098,17 +2094,17 @@ def rank( Parameters ---------- method : {'average', 'min', 'max', 'first', 'dense'}, default 'average' - * average: average rank of group - * min: lowest rank in group - * max: highest rank in group - * first: ranks assigned in order they appear in the array - * dense: like 'min', but rank always increases by 1 between groups + * average: average rank of group. + * min: lowest rank in group. + * max: highest rank in group. + * first: ranks assigned in order they appear in the array. + * dense: like 'min', but rank always increases by 1 between groups. ascending : bool, default True False for ranks by high (1) to low (N). na_option : {'keep', 'top', 'bottom'}, default 'keep' - * keep: leave NA values where they are - * top: smallest rank if ascending - * bottom: smallest rank if descending + * keep: leave NA values where they are. + * top: smallest rank if ascending. + * bottom: smallest rank if descending. pct : bool, default False Compute percentage rank of data within each group. axis : int, default 0 @@ -2361,6 +2357,9 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None, axis=0 axis=axis, ) ) + if fill_method is None: # GH30463 + fill_method = "pad" + limit = 0 filled = getattr(self, fill_method)(limit=limit) fill_grp = filled.groupby(self.grouper.codes) shifted = fill_grp.shift(periods=periods, freq=freq) @@ -2376,6 +2375,8 @@ def head(self, n=5): from the original DataFrame with original index and order preserved (``as_index`` flag is ignored). + Does not work for negative values of `n`. + Returns ------- Series or DataFrame @@ -2389,6 +2390,10 @@ def head(self, n=5): A B 0 1 2 2 5 6 + >>> df.groupby('A').head(-1) + Empty DataFrame + Columns: [A, B] + Index: [] """ self._reset_group_selection() mask = self._cumcount_array() < n @@ -2404,6 +2409,8 @@ def tail(self, n=5): from the original DataFrame with original index and order preserved (``as_index`` flag is ignored). + Does not work for negative values of `n`. + Returns ------- Series or DataFrame @@ -2417,6 +2424,10 @@ def tail(self, n=5): A B 1 a 2 3 b 2 + >>> df.groupby('A').tail(-1) + Empty DataFrame + Columns: [A, B] + Index: [] """ self._reset_group_selection() mask = self._cumcount_array(ascending=False) < n @@ -2527,9 +2538,9 @@ def get_groupby( squeeze: bool = False, observed: bool = False, mutated: bool = False, -): +) -> GroupBy: - klass: Union[Type["SeriesGroupBy"], Type["DataFrameGroupBy"]] + klass: Type[GroupBy] if isinstance(obj, Series): from pandas.core.groupby.generic import SeriesGroupBy diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 2c224a1bef338..0b89e702c9867 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -3,7 +3,7 @@ split-apply-combine paradigm. 
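The quantile hunk above replaces the hand-rolled index arithmetic with a reorder of levels plus a restore of the level names; what it keeps working, sketched on invented data, is the layout with the requested quantiles as the innermost index level:

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1.0, 2.0, 3.0, 4.0]})
    df.groupby("key")["val"].quantile([0.25, 0.75])
    # MultiIndex result: group keys on the outside, the quantiles innermost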
""" -from typing import Hashable, List, Optional, Tuple +from typing import Dict, Hashable, List, Optional, Tuple import numpy as np @@ -34,8 +34,7 @@ class Grouper: """ - A Grouper allows the user to specify a groupby instruction for a target - object. + A Grouper allows the user to specify a groupby instruction for an object. This specification will select a column via the key parameter, or if the level and/or axis parameters are given, a level of the index of the target @@ -47,17 +46,18 @@ class Grouper: Parameters ---------- key : str, defaults to None - groupby key, which selects the grouping column of the target + Groupby key, which selects the grouping column of the target. level : name/number, defaults to None - the level for the target index + The level for the target index. freq : str / frequency object, defaults to None This will groupby the specified frequency if the target selection (via key or level) is a datetime-like object. For full specification of available frequencies, please see `here - `_. - axis : number/name of the axis, defaults to 0 + `_. + axis : str, int, defaults to 0 + Number/name of the axis. sort : bool, default to False - whether to sort the resulting labels + Whether to sort the resulting labels. closed : {'left' or 'right'} Closed end of interval. Only when `freq` parameter is passed. label : {'left' or 'right'} @@ -194,7 +194,7 @@ def _set_grouper(self, obj: FrameOrSeries, sort: bool = False): # use stable sort to support first, last, nth indexer = self.indexer = ax.argsort(kind="mergesort") ax = ax.take(indexer) - obj = obj.take(indexer, axis=self.axis, is_copy=False) + obj = obj.take(indexer, axis=self.axis) self.obj = obj self.grouper = ax @@ -419,7 +419,7 @@ def _make_codes(self) -> None: self._group_index = uniques @cache_readonly - def groups(self) -> dict: + def groups(self) -> Dict[Hashable, np.ndarray]: return self.index.groupby(Categorical.from_codes(self.codes, self.group_index)) @@ -605,8 +605,8 @@ def is_in_obj(gpr) -> bool: if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: raise ValueError( - f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]})" - " must be same length" + f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]}) " + "must be same length" ) # create the Grouping diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a99ebe77e8254..37067a1897a52 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -563,7 +563,9 @@ def _cython_operation( return result, names - def aggregate(self, values, how: str, axis: int = 0, min_count: int = -1): + def aggregate( + self, values, how: str, axis: int = 0, min_count: int = -1 + ) -> Tuple[np.ndarray, Optional[List[str]]]: return self._cython_operation( "aggregate", values, how, axis, min_count=min_count ) diff --git a/pandas/core/index.py b/pandas/core/index.py index a9c8e6731a17e..8cff53d7a8b74 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -27,4 +27,5 @@ "pandas.core.index is deprecated and will be removed in a future version. 
" "The public classes are available in the top-level namespace.", FutureWarning, + stacklevel=2, ) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index f75087ca3b505..4d45769d2fea9 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -3,6 +3,8 @@ """ import numpy as np +from pandas._typing import AnyArrayLike + from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries @@ -240,3 +242,68 @@ def length_of_indexer(indexer, target=None) -> int: elif not is_list_like_indexer(indexer): return 1 raise AssertionError("cannot find the length of the indexer") + + +def check_bool_array_indexer(array: AnyArrayLike, mask: AnyArrayLike) -> np.ndarray: + """ + Check if `mask` is a valid boolean indexer for `array`. + + `array` and `mask` are checked to have the same length, and the + dtype is validated. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + array : array + The array that's being masked. + mask : array + The boolean array that's masking. + + Returns + ------- + numpy.ndarray + The validated boolean mask. + + Raises + ------ + IndexError + When the lengths don't match. + ValueError + When `mask` cannot be converted to a bool-dtype ndarray. + + See Also + -------- + api.types.is_bool_dtype : Check if `key` is of boolean dtype. + + Examples + -------- + A boolean ndarray is returned when the arguments are all valid. + + >>> mask = pd.array([True, False]) + >>> arr = pd.array([1, 2]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + array([ True, False]) + + An IndexError is raised when the lengths don't match. + + >>> mask = pd.array([True, False, True]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + Traceback (most recent call last): + ... + IndexError: Item wrong length 3 instead of 2. + + A ValueError is raised when the mask cannot be converted to + a bool-dtype ndarray. + + >>> mask = pd.array([True, pd.NA]) + >>> pd.api.extensions.check_bool_array_indexer(arr, mask) + Traceback (most recent call last): + ... 
+ ValueError: cannot convert to bool numpy array in presence of missing values + """ + result = np.asarray(mask, dtype=bool) + # GH26658 + if len(result) != len(array): + raise IndexError(f"Item wrong length {len(result)} instead of {len(array)}.") + return result diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 1904456848396..4072d06b9427c 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -198,6 +198,7 @@ def conv(i): result = indexes[0] if hasattr(result, "union_many"): + # DatetimeIndex return result.union_many(indexes[1:]) else: for other in indexes[1:]: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ce7a238daeca9..c158bdfbac441 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1,7 +1,7 @@ from datetime import datetime import operator from textwrap import dedent -from typing import FrozenSet, Union +from typing import Any, Dict, FrozenSet, Hashable, Optional, Union import warnings import numpy as np @@ -50,13 +50,13 @@ from pandas.core.dtypes.generic import ( ABCCategorical, ABCDataFrame, - ABCDatetimeArray, ABCDatetimeIndex, ABCIndexClass, ABCIntervalIndex, ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, + ABCRangeIndex, ABCSeries, ABCTimedeltaIndex, ) @@ -68,7 +68,6 @@ from pandas.core.arrays import ExtensionArray from pandas.core.base import IndexOpsMixin, PandasObject import pandas.core.common as com -from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.frozen import FrozenList import pandas.core.missing as missing @@ -96,6 +95,7 @@ duplicated="np.ndarray", ) _index_shared_docs = dict() +str_t = str def _make_comparison_op(op, cls): @@ -107,6 +107,11 @@ def cmp_method(self, other): if is_object_dtype(self) and isinstance(other, ABCCategorical): left = type(other)(self._values, dtype=other.dtype) return op(left, other) + elif is_object_dtype(self) and isinstance(other, ExtensionArray): + # e.g. PeriodArray + with np.errstate(all="ignore"): + result = op(self.values, other) + elif is_object_dtype(self) and not isinstance(self, ABCMultiIndex): # don't pass MultiIndex with np.errstate(all="ignore"): @@ -239,7 +244,11 @@ def _outer_indexer(self, left, right): _typ = "index" _data: Union[ExtensionArray, np.ndarray] _id = None - name = None + _name: Optional[Hashable] = None + # MultiIndex.levels previously allowed setting the index name. We + # don't allow this anymore, and raise if it happens rather than + # failing silently. + _no_setting_name: bool = False _comparables = ["name"] _attributes = ["name"] _is_numeric_dtype = False @@ -268,14 +277,13 @@ def __new__( cls, data=None, dtype=None, copy=False, name=None, tupleize_cols=True, **kwargs, ) -> "Index": - from .range import RangeIndex + from pandas.core.indexes.range import RangeIndex from pandas import PeriodIndex, DatetimeIndex, TimedeltaIndex - from .numeric import Float64Index, Int64Index, UInt64Index - from .interval import IntervalIndex - from .category import CategoricalIndex + from pandas.core.indexes.numeric import Float64Index, Int64Index, UInt64Index + from pandas.core.indexes.interval import IntervalIndex + from pandas.core.indexes.category import CategoricalIndex - if name is None and hasattr(data, "name"): - name = data.name + name = maybe_extract_name(name, data, cls) if isinstance(data, ABCPandasArray): # ensure users don't accidentally put a PandasArray in an index. 
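The new `check_bool_array_indexer` helper above is aimed at ExtensionArray authors; a rough, hypothetical call-site sketch (`take_with_mask` below is illustrative, not pandas code):

    import numpy as np
    import pandas as pd

    def take_with_mask(backing: np.ndarray, key):
        # validate length and coerce a boolean (possibly masked) array to ndarray[bool]
        if pd.api.types.is_bool_dtype(getattr(key, "dtype", None)):
            key = pd.api.extensions.check_bool_array_indexer(backing, key)
        return backing[key]

    take_with_mask(np.array([1, 2, 3]), pd.array([True, False, True]))   # array([1, 3])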
@@ -292,11 +300,15 @@ def __new__( return CategoricalIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) # interval - elif ( - is_interval_dtype(data) or is_interval_dtype(dtype) - ) and not is_object_dtype(dtype): - closed = kwargs.get("closed", None) - return IntervalIndex(data, dtype=dtype, name=name, copy=copy, closed=closed) + elif is_interval_dtype(data) or is_interval_dtype(dtype): + closed = kwargs.pop("closed", None) + if is_dtype_equal(_o_dtype, dtype): + return IntervalIndex( + data, name=name, copy=copy, closed=closed, **kwargs + ).astype(object) + return IntervalIndex( + data, dtype=dtype, name=name, copy=copy, closed=closed, **kwargs + ) elif ( is_datetime64_any_dtype(data) @@ -326,8 +338,10 @@ def __new__( else: return TimedeltaIndex(data, copy=copy, name=name, dtype=dtype, **kwargs) - elif is_period_dtype(data) and not is_object_dtype(dtype): - return PeriodIndex(data, copy=copy, name=name, **kwargs) + elif is_period_dtype(data) or is_period_dtype(dtype): + if is_dtype_equal(_o_dtype, dtype): + return PeriodIndex(data, copy=False, name=name, **kwargs).astype(object) + return PeriodIndex(data, dtype=dtype, copy=copy, name=name, **kwargs) # extension dtype elif is_extension_array_dtype(data) or is_extension_array_dtype(dtype): @@ -350,41 +364,8 @@ def __new__( # they are actually ints, e.g. '0' and 0.0 # should not be coerced # GH 11836 - if is_integer_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "integer": - data = maybe_cast_to_integer_array(data, dtype, copy=copy) - elif inferred in ["floating", "mixed-integer-float"]: - if isna(data).any(): - raise ValueError("cannot convert float NaN to integer") - - if inferred == "mixed-integer-float": - data = maybe_cast_to_integer_array(data, dtype) - - # If we are actually all equal to integers, - # then coerce to integer. - try: - return cls._try_convert_to_int_index( - data, copy, name, dtype - ) - except ValueError: - pass - - # Return an actual float index. - return Float64Index(data, copy=copy, name=name) - - elif inferred == "string": - pass - else: - data = data.astype(dtype) - elif is_float_dtype(dtype): - inferred = lib.infer_dtype(data, skipna=False) - if inferred == "string": - pass - else: - data = data.astype(dtype) - else: - data = np.array(data, dtype=dtype, copy=copy) + data = _maybe_cast_with_dtype(data, dtype, copy) + dtype = data.dtype # TODO: maybe not for object? 
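The constructor hunks above route interval- and period-dtype data through their dedicated index types unless `dtype=object` is explicitly requested; roughly (invented data):

    import pandas as pd

    pi = pd.period_range("2020-01", periods=3, freq="M")
    pd.Index(pi)                # PeriodIndex with period[M] dtype
    pd.Index(pi, dtype=object)  # plain object-dtype Index holding Period scalars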
# maybe coerce to a sub-class if is_signed_integer_dtype(data.dtype): @@ -404,45 +385,17 @@ def __new__( subarr = subarr.copy() if dtype is None: - inferred = lib.infer_dtype(subarr, skipna=False) - if inferred == "integer": - try: - return cls._try_convert_to_int_index(subarr, copy, name, dtype) - except ValueError: - pass - - return Index(subarr, copy=copy, dtype=object, name=name) - elif inferred in ["floating", "mixed-integer-float", "integer-na"]: - # TODO: Returns IntegerArray for integer-na case in the future - return Float64Index(subarr, copy=copy, name=name) - elif inferred == "interval": - try: - return IntervalIndex(subarr, name=name, copy=copy) - except ValueError: - # GH27172: mixed closed Intervals --> object dtype - pass - elif inferred == "boolean": - # don't support boolean explicitly ATM - pass - elif inferred != "string": - if inferred.startswith("datetime"): - try: - return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) - except (ValueError, OutOfBoundsDatetime): - # GH 27011 - # If we have mixed timezones, just send it - # down the base constructor - pass - - elif inferred.startswith("timedelta"): - return TimedeltaIndex(subarr, copy=copy, name=name, **kwargs) - elif inferred == "period": - try: - return PeriodIndex(subarr, name=name, **kwargs) - except IncompatibleFrequency: - pass + new_data, new_dtype = _maybe_cast_data_without_dtype(subarr) + if new_dtype is not None: + return cls( + new_data, dtype=new_dtype, copy=False, name=name, **kwargs + ) + if kwargs: raise TypeError(f"Unexpected keyword arguments {repr(set(kwargs))}") + if subarr.ndim > 1: + # GH#13601, GH#20285, GH#27125 + raise ValueError("Index data must be 1-dimensional") return cls._simple_new(subarr, name, **kwargs) elif hasattr(data, "__array__"): @@ -458,7 +411,7 @@ def __new__( if data and all(isinstance(e, tuple) for e in data): # we must be all tuples, otherwise don't construct # 10697 - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex return MultiIndex.from_tuples( data, names=name or kwargs.get("names") @@ -506,11 +459,7 @@ def _simple_new(cls, values, name=None, dtype=None): Must be careful not to recurse. """ - if isinstance(values, (ABCSeries, ABCIndexClass)): - # Index._data must always be an ndarray. - # This is no-copy for when _values is an ndarray, - # which should be always at this point. - values = np.asarray(values._values) + assert isinstance(values, np.ndarray), type(values) result = object.__new__(cls) result._data = values @@ -520,7 +469,7 @@ def _simple_new(cls, values, name=None, dtype=None): # data buffers and strides. We don't re-use `_ndarray_values`, since # we actually set this value too. 
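Among the pieces consolidated above is the explicit check that index data is one-dimensional (GH#13601, GH#20285, GH#27125); a minimal sketch:

    import numpy as np
    import pandas as pd

    pd.Index(np.arange(4))                  # fine: 1-D data
    # pd.Index(np.arange(4).reshape(2, 2))  # ValueError: Index data must be 1-dimensional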
result._index_data = values - result.name = name + result._name = name return result._reset_identity() @@ -556,6 +505,7 @@ def _get_attributes_dict(self): def _shallow_copy(self, values=None, **kwargs): if values is None: values = self.values + attributes = self._get_attributes_dict() attributes.update(kwargs) if not len(values) and "dtype" not in kwargs: @@ -563,10 +513,6 @@ def _shallow_copy(self, values=None, **kwargs): # _simple_new expects an the type of self._data values = getattr(values, "_values", values) - if isinstance(values, ABCDatetimeArray): - # `self.values` returns `self` for tz-aware, so we need to unwrap - # more specifically - values = values.asi8 return self._simple_new(values, **attributes) @@ -647,7 +593,7 @@ def __len__(self) -> int: """ return len(self._data) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ The array interface, return my values. """ @@ -658,7 +604,7 @@ def __array_wrap__(self, result, context=None): Gets called after a ufunc. """ result = lib.item_from_zerodim(result) - if is_bool_dtype(result) or lib.is_scalar(result): + if is_bool_dtype(result) or lib.is_scalar(result) or np.ndim(result) > 1: return result attrs = self._get_attributes_dict() @@ -729,7 +675,7 @@ def astype(self, dtype, copy=True): return self.copy() if copy else self elif is_categorical_dtype(dtype): - from .category import CategoricalIndex + from pandas.core.indexes.category import CategoricalIndex return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) @@ -737,11 +683,10 @@ def astype(self, dtype, copy=True): return Index(np.asarray(self), dtype=dtype, copy=copy) try: - return Index( - self.values.astype(dtype, copy=copy), name=self.name, dtype=dtype - ) + casted = self.values.astype(dtype, copy=copy) except (TypeError, ValueError): raise TypeError(f"Cannot cast {type(self).__name__} to dtype {dtype}") + return Index(casted, name=self.name, dtype=dtype) _index_shared_docs[ "take" @@ -958,7 +903,7 @@ def _format_data(self, name=None): # do we want to justify (only do so for non-objects) is_justify = not ( - self.inferred_type in ("string", "unicode") + self.inferred_type in ("string") or ( self.inferred_type == "categorical" and is_object_dtype(self.categories) ) @@ -1209,6 +1154,24 @@ def to_frame(self, index=True, name=None): # -------------------------------------------------------------------- # Name-Centric Methods + @property + def name(self): + """ + Return Index or MultiIndex name. + """ + return self._name + + @name.setter + def name(self, value): + if self._no_setting_name: + # Used in MultiIndex.levels to avoid silently ignoring name updates. + raise RuntimeError( + "Cannot set name on a level of a MultiIndex. Use " + "'MultiIndex.set_names' instead." 
+ ) + maybe_extract_name(value, None, type(self)) + self._name = value + def _validate_names(self, name=None, names=None, deep=False): """ Handles the quirks of having a singular 'name' parameter for general @@ -1258,7 +1221,7 @@ def _set_names(self, values, level=None): for name in values: if not is_hashable(name): raise TypeError(f"{type(self).__name__}.name must be a hashable type") - self.name = values[0] + self._name = values[0] names = property(fset=_set_names, fget=_get_names) @@ -1546,10 +1509,10 @@ def droplevel(self, level=0): if mask.any(): result = result.putmask(mask, np.nan) - result.name = new_names[0] + result._name = new_names[0] return result else: - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex return MultiIndex( levels=new_levels, @@ -1677,21 +1640,230 @@ def is_unique(self) -> bool: @property def has_duplicates(self) -> bool: + """ + Check if the Index has duplicate values. + + Returns + ------- + bool + Whether or not the Index has duplicate values. + + Examples + -------- + >>> idx = pd.Index([1, 5, 7, 7]) + >>> idx.has_duplicates + True + + >>> idx = pd.Index([1, 5, 7]) + >>> idx.has_duplicates + False + + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.has_duplicates + True + + >>> idx = pd.Index(["Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.has_duplicates + False + """ return not self.is_unique def is_boolean(self) -> bool: + """ + Check if the Index only consists of booleans. + + Returns + ------- + bool + Whether or not the Index only consists of booleans. + + See Also + -------- + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + is_mixed : Check if the Index holds data with mixed data types. + + Examples + -------- + >>> idx = pd.Index([True, False, True]) + >>> idx.is_boolean() + True + + >>> idx = pd.Index(["True", "False", "True"]) + >>> idx.is_boolean() + False + + >>> idx = pd.Index([True, False, "True"]) + >>> idx.is_boolean() + False + """ return self.inferred_type in ["boolean"] def is_integer(self) -> bool: + """ + Check if the Index only consists of integers. + + Returns + ------- + bool + Whether or not the Index only consists of integers. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + is_mixed : Check if the Index holds data with mixed data types. + + Examples + -------- + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx.is_integer() + True + + >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_integer() + False + + >>> idx = pd.Index(["Apple", "Mango", "Watermelon"]) + >>> idx.is_integer() + False + """ return self.inferred_type in ["integer"] def is_floating(self) -> bool: + """ + Check if the Index is a floating type. + + The Index may consist of only floats, NaNs, or a mix of floats, + integers, or NaNs. 
+ + Returns + ------- + bool + Whether or not the Index only consists of only consists of floats, NaNs, or + a mix of floats, integers, or NaNs. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + is_mixed : Check if the Index holds data with mixed data types. + + Examples + -------- + >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_floating() + True + + >>> idx = pd.Index([1.0, 2.0, np.nan, 4.0]) + >>> idx.is_floating() + True + + >>> idx = pd.Index([1, 2, 3, 4, np.nan]) + >>> idx.is_floating() + True + + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx.is_floating() + False + """ return self.inferred_type in ["floating", "mixed-integer-float", "integer-na"] def is_numeric(self) -> bool: + """ + Check if the Index only consists of numeric data. + + Returns + ------- + bool + Whether or not the Index only consists of numeric data. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + is_mixed : Check if the Index holds data with mixed data types. + + Examples + -------- + >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_numeric() + True + + >>> idx = pd.Index([1, 2, 3, 4.0]) + >>> idx.is_numeric() + True + + >>> idx = pd.Index([1, 2, 3, 4]) + >>> idx.is_numeric() + True + + >>> idx = pd.Index([1, 2, 3, 4.0, np.nan]) + >>> idx.is_numeric() + True + + >>> idx = pd.Index([1, 2, 3, 4.0, np.nan, "Apple"]) + >>> idx.is_numeric() + False + """ return self.inferred_type in ["integer", "floating"] def is_object(self) -> bool: + """ + Check if the Index is of the object dtype. + + Returns + ------- + bool + Whether or not the Index is of the object dtype. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + is_mixed : Check if the Index holds data with mixed data types. + + Examples + -------- + >>> idx = pd.Index(["Apple", "Mango", "Watermelon"]) + >>> idx.is_object() + True + + >>> idx = pd.Index(["Apple", "Mango", 2.0]) + >>> idx.is_object() + True + + >>> idx = pd.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.object() + False + + >>> idx = pd.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_object() + False + """ return is_object_dtype(self.dtype) def is_categorical(self) -> bool: @@ -1700,12 +1872,19 @@ def is_categorical(self) -> bool: Returns ------- - boolean + bool True if the Index is categorical. See Also -------- CategoricalIndex : Index for categorical data. + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. 
+ is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_interval : Check if the Index holds Interval objects. + is_mixed : Check if the Index holds data with mixed data types. Examples -------- @@ -1731,9 +1910,67 @@ def is_categorical(self) -> bool: return self.inferred_type in ["categorical"] def is_interval(self) -> bool: + """ + Check if the Index holds Interval objects. + + Returns + ------- + bool + Whether or not the Index holds Interval objects. + + See Also + -------- + IntervalIndex : Index for Interval objects. + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_mixed : Check if the Index holds data with mixed data types. + + Examples + -------- + >>> idx = pd.Index([pd.Interval(left=0, right=5), + ... pd.Interval(left=5, right=10)]) + >>> idx.is_interval() + True + + >>> idx = pd.Index([1, 3, 5, 7]) + >>> idx.is_interval() + False + """ return self.inferred_type in ["interval"] def is_mixed(self) -> bool: + """ + Check if the Index holds data with mixed data types. + + Returns + ------- + bool + Whether or not the Index holds data with mixed data types. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + + Examples + -------- + >>> idx = pd.Index(['a', np.nan, 'b']) + >>> idx.is_mixed() + True + + >>> idx = pd.Index([1.0, 2.0, 3.0, 5.0]) + >>> idx.is_mixed() + False + """ return self.inferred_type in ["mixed"] def holds_integer(self): @@ -1751,6 +1988,9 @@ def inferred_type(self): @cache_readonly def is_all_dates(self) -> bool: + """ + Whether or not the index values only consist of dates. + """ return is_datetime_array(ensure_object(self.values)) # -------------------------------------------------------------------- @@ -1761,35 +2001,6 @@ def __reduce__(self): d.update(self._get_attributes_dict()) return _new_Index, (type(self), d), None - def __setstate__(self, state): - """ - Necessary for making this object picklable. 
- """ - - if isinstance(state, dict): - self._data = state.pop("data") - for k, v in state.items(): - setattr(self, k, v) - - elif isinstance(state, tuple): - - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - self.name = own_state[0] - - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(data, state) - - self._data = data - self._reset_identity() - else: - raise Exception("invalid pickle state") - - _unpickle_compat = __setstate__ - # -------------------------------------------------------------------- # Null Handling Methods @@ -1982,7 +2193,7 @@ def dropna(self, how="any"): raise ValueError(f"invalid how option: {how}") if self.hasnans: - return self._shallow_copy(self.values[~self._isnan]) + return self._shallow_copy(self._values[~self._isnan]) return self._shallow_copy() # -------------------------------------------------------------------- @@ -2345,11 +2556,11 @@ def _union(self, other, sort): return other._get_reconciled_name_object(self) # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self) or is_datetime64tz_dtype(self): + if is_datetime64tz_dtype(self): lvals = self._ndarray_values else: lvals = self._values - if is_period_dtype(other) or is_datetime64tz_dtype(other): + if is_datetime64tz_dtype(other): rvals = other._ndarray_values else: rvals = other._values @@ -2448,14 +2659,8 @@ def intersection(self, other, sort=False): return this.intersection(other, sort=sort) # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self): - lvals = self._ndarray_values - else: - lvals = self._values - if is_period_dtype(other): - rvals = other._ndarray_values - else: - rvals = other._values + lvals = self._values + rvals = other._values if self.is_monotonic and other.is_monotonic: try: @@ -2474,18 +2679,13 @@ def intersection(self, other, sort=False): indexer = indexer[indexer != -1] taken = other.take(indexer) + res_name = get_op_result_name(self, other) if sort is None: taken = algos.safe_sort(taken.values) - if self.name != other.name: - name = None - else: - name = self.name - return self._shallow_copy(taken, name=name) - - if self.name != other.name: - taken.name = None + return self._shallow_copy(taken, name=res_name) + taken.name = res_name return taken def difference(self, other, sort=None): @@ -2614,11 +2814,11 @@ def symmetric_difference(self, other, result_name=None, sort=None): left_indexer = np.setdiff1d( np.arange(this.size), common_indexer, assume_unique=True ) - left_diff = this.values.take(left_indexer) + left_diff = this._values.take(left_indexer) # {other} minus {this} right_indexer = (indexer == -1).nonzero()[0] - right_diff = other.values.take(right_indexer) + right_diff = other._values.take(right_indexer) the_diff = concat_compat([left_diff, right_diff]) if sort is None: @@ -2759,7 +2959,9 @@ def get_loc(self, key, method=None, tolerance=None): """ @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) - def get_indexer(self, target, method=None, limit=None, tolerance=None): + def get_indexer( + self, target, method=None, limit=None, tolerance=None + ) -> np.ndarray: method = missing.clean_reindex_fill_method(method) target = ensure_index(target) if tolerance is not None: @@ -2816,14 +3018,16 @@ def _convert_tolerance(self, tolerance, target): raise ValueError("list-like tolerance size must match target index size") return tolerance - def _get_fill_indexer(self, target, method, limit=None, tolerance=None): + 
def _get_fill_indexer( + self, target: "Index", method: str_t, limit=None, tolerance=None + ) -> np.ndarray: if self.is_monotonic_increasing and target.is_monotonic_increasing: - method = ( + engine_method = ( self._engine.get_pad_indexer if method == "pad" else self._engine.get_backfill_indexer ) - indexer = method(target._ndarray_values, limit) + indexer = engine_method(target._ndarray_values, limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) if tolerance is not None: @@ -2832,7 +3036,9 @@ def _get_fill_indexer(self, target, method, limit=None, tolerance=None): ) return indexer - def _get_fill_indexer_searchsorted(self, target, method, limit=None): + def _get_fill_indexer_searchsorted( + self, target: "Index", method: str_t, limit=None + ) -> np.ndarray: """ Fallback pad/backfill get_indexer that works for monotonic decreasing indexes and non-monotonic targets. @@ -2863,7 +3069,7 @@ def _get_fill_indexer_searchsorted(self, target, method, limit=None): indexer[indexer == len(self)] = -1 return indexer - def _get_nearest_indexer(self, target, limit, tolerance): + def _get_nearest_indexer(self, target: "Index", limit, tolerance) -> np.ndarray: """ Get the indexer for the nearest index labels; requires an index with values that can be subtracted from each other (e.g., not strings or @@ -2886,7 +3092,9 @@ def _get_nearest_indexer(self, target, limit, tolerance): indexer = self._filter_indexer_tolerance(target, indexer, tolerance) return indexer - def _filter_indexer_tolerance(self, target, indexer, tolerance): + def _filter_indexer_tolerance( + self, target: "Index", indexer: np.ndarray, tolerance + ) -> np.ndarray: distance = abs(self.values[indexer] - target) indexer = np.where(distance <= tolerance, indexer, -1) return indexer @@ -2902,12 +3110,12 @@ def _filter_indexer_tolerance(self, target, indexer, tolerance): Parameters ---------- key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None + kind : {'loc', 'getitem', 'iloc'} or None """ @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] if kind == "iloc": return self._validate_indexer("positional", key, kind) @@ -2915,13 +3123,13 @@ def _convert_scalar_indexer(self, key, kind=None): if len(self) and not isinstance(self, ABCMultiIndex): # we can raise here if we are definitive that this - # is positional indexing (eg. .ix on with a float) + # is positional indexing (eg. 
.loc on with a float) # or label indexing if we are using a type able # to be represented in the index - if kind in ["getitem", "ix"] and is_float(key): + if kind in ["getitem"] and is_float(key): if not self.is_floating(): - return self._invalid_indexer("label", key) + self._invalid_indexer("label", key) elif kind in ["loc"] and is_float(key): @@ -2933,7 +3141,6 @@ def _convert_scalar_indexer(self, key, kind=None): "mixed-integer-float", "integer-na", "string", - "unicode", "mixed", ]: self._invalid_indexer("label", key) @@ -2955,12 +3162,12 @@ def _convert_scalar_indexer(self, key, kind=None): Parameters ---------- key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None + kind : {'loc', 'getitem', 'iloc'} or None """ @Appender(_index_shared_docs["_convert_slice_indexer"]) def _convert_slice_indexer(self, key: slice, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] # validate iloc if kind == "iloc": @@ -3099,7 +3306,7 @@ def _convert_index_indexer(self, keyarr): @Appender(_index_shared_docs["_convert_list_indexer"]) def _convert_list_indexer(self, keyarr, kind=None): if ( - kind in [None, "iloc", "ix"] + kind in [None, "iloc"] and is_integer_dtype(keyarr) and not self.is_floating() and not isinstance(keyarr, ABCPeriodIndex) @@ -3179,7 +3386,10 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): if not isinstance(target, Index) and len(target) == 0: attrs = self._get_attributes_dict() attrs.pop("freq", None) # don't preserve freq - values = self._data[:0] # appropriately-dtyped empty array + if isinstance(self, ABCRangeIndex): + values = range(0) + else: + values = self._data[:0] # appropriately-dtyped empty array target = self._simple_new(values, dtype=self.dtype, **attrs) else: target = ensure_index(target) @@ -3403,7 +3613,7 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) return join_index def _join_multi(self, other, how, return_indexers=True): - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex from pandas.core.reshape.merge import _restore_dropped_levels_multijoin # figure out join names @@ -3424,8 +3634,13 @@ def _join_multi(self, other, how, return_indexers=True): ldrop_names = list(self_names - overlap) rdrop_names = list(other_names - overlap) - self_jnlevels = self.droplevel(ldrop_names) - other_jnlevels = other.droplevel(rdrop_names) + # if only the order differs + if not len(ldrop_names + rdrop_names): + self_jnlevels = self + other_jnlevels = other.reorder_levels(self.names) + else: + self_jnlevels = self.droplevel(ldrop_names) + other_jnlevels = other.droplevel(rdrop_names) # Join left and right # Join on same leveled multi-index frames is supported @@ -3505,7 +3720,7 @@ def _join_level( MultiIndex will not be changed; otherwise, it will tie out with `other`. """ - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex def _get_leaf_sorter(labels): """ @@ -3710,15 +3925,16 @@ def values(self): @property def _values(self) -> Union[ExtensionArray, ABCIndexClass, np.ndarray]: - # TODO(EA): remove index types as they become extension arrays """ The best array representation. - This is an ndarray, ExtensionArray, or Index subclass. This differs - from ``_ndarray_values``, which always returns an ndarray. + This is an ndarray or ExtensionArray. This differs from + ``_ndarray_values``, which always returns an ndarray. 
Both ``_values`` and ``_ndarray_values`` are consistent between - ``Series`` and ``Index``. + ``Series`` and ``Index`` (except for datetime64[ns], which returns + a DatetimeArray for _values on the Index, but ndarray[M8ns] on the + Series). It may differ from the public '.values' method. @@ -3726,8 +3942,8 @@ def _values(self) -> Union[ExtensionArray, ABCIndexClass, np.ndarray]: ----------------- | --------------- | ------------- | --------------- | Index | ndarray | ndarray | ndarray | CategoricalIndex | Categorical | Categorical | ndarray[int] | - DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | - DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | + DatetimeIndex | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] | + DatetimeIndex[tz] | ndarray[M8ns] | DatetimeArray | ndarray[M8ns] | PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | @@ -3831,50 +4047,6 @@ def where(self, cond, other=None): return self._shallow_copy_with_infer(values, dtype=dtype) # construction helpers - @classmethod - def _try_convert_to_int_index(cls, data, copy, name, dtype): - """ - Attempt to convert an array of data into an integer index. - - Parameters - ---------- - data : The data to convert. - copy : Whether to copy the data or not. - name : The name of the index returned. - - Returns - ------- - int_index : data converted to either an Int64Index or a - UInt64Index - - Raises - ------ - ValueError if the conversion was not successful. - """ - - from .numeric import Int64Index, UInt64Index - - if not is_unsigned_integer_dtype(dtype): - # skip int64 conversion attempt if uint-like dtype is passed, as - # this could return Int64Index when UInt64Index is what's desired - try: - res = data.astype("i8", copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - # Conversion to int64 failed (possibly due to overflow) or was skipped, - # so let's try now with uint64. - try: - res = data.astype("u8", copy=False) - if (res == data).all(): - return UInt64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - raise ValueError - @classmethod def _scalar_data_error(cls, data): # We return the TypeError so that we can raise it from the constructor @@ -3972,7 +4144,7 @@ def is_type_compatible(self, kind) -> bool: """ @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key) -> bool: + def __contains__(self, key: Any) -> bool: hash(key) try: return key in self._engine @@ -4016,6 +4188,9 @@ def __getitem__(self, key): key = com.values_from_object(key) result = getitem(key) if not is_scalar(result): + if np.ndim(result) > 1: + deprecate_ndim_indexing(result) + return result return promote(result) else: return result @@ -4391,7 +4566,7 @@ def shift(self, periods=1, freq=None): """ raise NotImplementedError(f"Not supported for type {type(self).__name__}") - def argsort(self, *args, **kwargs): + def argsort(self, *args, **kwargs) -> np.ndarray: """ Return the integer indices that would sort the index. @@ -4446,57 +4621,45 @@ def argsort(self, *args, **kwargs): @Appender(_index_shared_docs["get_value"] % _index_doc_kwargs) def get_value(self, series, key): - # if we have something that is Index-like, then - # use this, e.g. DatetimeIndex - # Things like `Series._get_value` (via .at) pass the EA directly here. 
- s = extract_array(series, extract_numpy=True) - if isinstance(s, ExtensionArray): - if is_scalar(key): - # GH 20882, 21257 - # First try to convert the key to a location - # If that fails, raise a KeyError if an integer - # index, otherwise, see if key is an integer, and - # try that - try: - iloc = self.get_loc(key) - return s[iloc] - except KeyError: - if len(self) > 0 and (self.holds_integer() or self.is_boolean()): - raise - elif is_integer(key): - return s[key] - else: - # if key is not a scalar, directly raise an error (the code below - # would convert to numpy arrays and raise later any way) - GH29926 - raise InvalidIndexError(key) - - s = com.values_from_object(series) - k = com.values_from_object(key) + if not is_scalar(key): + # if key is not a scalar, directly raise an error (the code below + # would convert to numpy arrays and raise later any way) - GH29926 + raise InvalidIndexError(key) - k = self._convert_scalar_indexer(k, kind="getitem") try: - return self._engine.get_value(s, k, tz=getattr(series.dtype, "tz", None)) - except KeyError as e1: + # GH 20882, 21257 + # First try to convert the key to a location + # If that fails, raise a KeyError if an integer + # index, otherwise, see if key is an integer, and + # try that + loc = self._engine.get_loc(key) + except KeyError: if len(self) > 0 and (self.holds_integer() or self.is_boolean()): raise - - try: - return libindex.get_value_at(s, key) - except IndexError: + elif is_integer(key): + # If the Index cannot hold integer, then this is unambiguously + # a locational lookup. + loc = key + else: raise - except TypeError: - # generator/iterator-like - if is_iterator(key): - raise InvalidIndexError(key) - else: - raise e1 - except Exception: - raise e1 - except TypeError: - # e.g. "[False] is an invalid key" - if is_scalar(key): - raise IndexError(key) - raise InvalidIndexError(key) + + return self._get_values_for_loc(series, loc) + + def _get_values_for_loc(self, series, loc): + """ + Do a positional lookup on the given Series, returning either a scalar + or a Series. + + Assumes that `series.index is self` + """ + if is_integer(loc): + if isinstance(series._values, np.ndarray): + # Since we have an ndarray and not DatetimeArray, we dont + # have to worry about a tz. + return libindex.get_value_at(series._values, loc, tz=None) + return series._values[loc] + + return series.iloc[loc] def set_value(self, arr, key, value): """ @@ -4551,7 +4714,7 @@ def get_indexer_non_unique(self, target): if is_categorical(target): tgt_values = np.asarray(target) - elif self.is_all_dates: + elif self.is_all_dates and target.is_all_dates: # GH 30399 tgt_values = target.asi8 else: tgt_values = target._ndarray_values @@ -4586,7 +4749,7 @@ def _maybe_promote(self, other): return self.astype("object"), other.astype("object") return self, other - def groupby(self, values): + def groupby(self, values) -> Dict[Hashable, np.ndarray]: """ Group the index labels by a given array of values. @@ -4597,7 +4760,7 @@ def groupby(self, values): Returns ------- - groups : dict + dict {group name -> group labels} """ @@ -4633,7 +4796,7 @@ def map(self, mapper, na_action=None): a MultiIndex will be returned. 
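# Public-API sketch of what the rewritten ``get_value`` above now does:
# resolve the label to a position via ``get_loc``/the engine, then finish
# with a positional lookup (``_get_values_for_loc``).
import pandas as pd

ser = pd.Series([10, 20, 30], index=["a", "b", "c"])
loc = ser.index.get_loc("b")
assert ser.iloc[loc] == ser["b"] == 20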
""" - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex new_values = super()._map_values(mapper, na_action=na_action) @@ -4739,7 +4902,7 @@ def isin(self, values, level=None): self._validate_index_level(level) return algos.isin(self, values) - def _get_string_slice(self, key, use_lhs=True, use_rhs=True): + def _get_string_slice(self, key: str_t, use_lhs: bool = True, use_rhs: bool = True): # this is for partial string indexing, # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex raise NotImplementedError @@ -4813,7 +4976,7 @@ def _validate_indexer(self, form, key, kind): If we are positional indexer, validate that we have appropriate typed bounds must be an integer. """ - assert kind in ["ix", "loc", "getitem", "iloc"] + assert kind in ["loc", "getitem", "iloc"] if key is None: pass @@ -4834,7 +4997,7 @@ def _validate_indexer(self, form, key, kind): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} + kind : {'loc', 'getitem'} or None Returns ------- @@ -4847,15 +5010,14 @@ def _validate_indexer(self, form, key, kind): @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] # We are a plain index here (sub-class override this method if they # wish to have special treatment for floats/ints, e.g. Float64Index and # datetimelike Indexes # reject them if is_float(label): - if not (kind in ["ix"] and (self.holds_integer() or self.is_floating())): - self._invalid_indexer("slice", label) + self._invalid_indexer("slice", label) # we are trying to find integer bounds on a non-integer based index # this is rejected (generally .loc gets you here) @@ -4878,7 +5040,7 @@ def _searchsorted_monotonic(self, label, side="left"): raise ValueError("index must be monotonic increasing or decreasing") - def get_slice_bound(self, label, side, kind): + def get_slice_bound(self, label, side, kind) -> int: """ Calculate slice bound that corresponds to given label. @@ -4889,19 +5051,19 @@ def get_slice_bound(self, label, side, kind): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} + kind : {'loc', 'getitem'} or None Returns ------- int Index of label. """ - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] if side not in ("left", "right"): raise ValueError( - f"Invalid value for side kwarg, must be either" - f" 'left' or 'right': {side}" + "Invalid value for side kwarg, must be either " + f"'left' or 'right': {side}" ) original_label = label @@ -4956,7 +5118,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): If None, defaults to the end. step : int, defaults None If None, defaults to 1. - kind : {'ix', 'loc', 'getitem'} or None + kind : {'loc', 'getitem'} or None Returns ------- @@ -5043,7 +5205,7 @@ def delete(self, loc): """ return self._shallow_copy(np.delete(self._data, loc)) - def insert(self, loc, item): + def insert(self, loc: int, item): """ Make new Index inserting new item at location. 
@@ -5354,7 +5516,7 @@ def ensure_index_from_sequences(sequences, names=None): -------- ensure_index """ - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex if len(sequences) == 1: if names is not None: @@ -5415,7 +5577,7 @@ def ensure_index(index_like, copy=False): converted, all_arrays = lib.clean_index_list(index_like) if len(converted) > 0 and all_arrays: - from .multi import MultiIndex + from pandas.core.indexes.multi import MultiIndex return MultiIndex.from_arrays(converted) else: @@ -5462,3 +5624,202 @@ def default_index(n): from pandas.core.indexes.range import RangeIndex return RangeIndex(0, n, name=None) + + +def maybe_extract_name(name, obj, cls) -> Optional[Hashable]: + """ + If no name is passed, then extract it from data, validating hashability. + """ + if name is None and isinstance(obj, (Index, ABCSeries)): + # Note we don't just check for "name" attribute since that would + # pick up e.g. dtype.name + name = obj.name + + # GH#29069 + if not is_hashable(name): + raise TypeError(f"{cls.__name__}.name must be a hashable type") + + return name + + +def _maybe_cast_with_dtype(data: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: + """ + If a dtype is passed, cast to the closest matching dtype that is supported + by Index. + + Parameters + ---------- + data : np.ndarray + dtype : np.dtype + copy : bool + + Returns + ------- + np.ndarray + """ + # we need to avoid having numpy coerce + # things that look like ints/floats to ints unless + # they are actually ints, e.g. '0' and 0.0 + # should not be coerced + # GH 11836 + if is_integer_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "integer": + data = maybe_cast_to_integer_array(data, dtype, copy=copy) + elif inferred in ["floating", "mixed-integer-float"]: + if isna(data).any(): + raise ValueError("cannot convert float NaN to integer") + + if inferred == "mixed-integer-float": + data = maybe_cast_to_integer_array(data, dtype) + + # If we are actually all equal to integers, + # then coerce to integer. + try: + data = _try_convert_to_int_array(data, copy, dtype) + except ValueError: + data = np.array(data, dtype=np.float64, copy=copy) + + elif inferred == "string": + pass + else: + data = data.astype(dtype) + elif is_float_dtype(dtype): + inferred = lib.infer_dtype(data, skipna=False) + if inferred == "string": + pass + else: + data = data.astype(dtype) + else: + data = np.array(data, dtype=dtype, copy=copy) + + return data + + +def _maybe_cast_data_without_dtype(subarr): + """ + If we have an arraylike input but no passed dtype, try to infer + a supported dtype. 
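# Illustration of ``maybe_extract_name`` above (GH#29069): the name is pulled
# from the data when not given, and unhashable names are rejected.
import pandas as pd

src = pd.Series([1, 2, 3], name="src")
print(pd.Index(src).name)                 # "src", inherited from the Series
try:
    pd.Index([1, 2, 3], name=["bad"])     # a list is not hashable
except TypeError as err:
    print(err)                            # Index.name must be a hashable type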
+ + Parameters + ---------- + subarr : np.ndarray, Index, or Series + + Returns + ------- + converted : np.ndarray or ExtensionArray + dtype : np.dtype or ExtensionDtype + """ + # Runtime import needed bc IntervalArray imports Index + from pandas.core.arrays import ( + IntervalArray, + PeriodArray, + DatetimeArray, + TimedeltaArray, + ) + + inferred = lib.infer_dtype(subarr, skipna=False) + + if inferred == "integer": + try: + data = _try_convert_to_int_array(subarr, False, None) + return data, data.dtype + except ValueError: + pass + + return subarr, object + + elif inferred in ["floating", "mixed-integer-float", "integer-na"]: + # TODO: Returns IntegerArray for integer-na case in the future + return subarr, np.float64 + + elif inferred == "interval": + try: + data = IntervalArray._from_sequence(subarr, copy=False) + return data, data.dtype + except ValueError: + # GH27172: mixed closed Intervals --> object dtype + pass + elif inferred == "boolean": + # don't support boolean explicitly ATM + pass + elif inferred != "string": + if inferred.startswith("datetime"): + try: + data = DatetimeArray._from_sequence(subarr, copy=False) + return data, data.dtype + except (ValueError, OutOfBoundsDatetime): + # GH 27011 + # If we have mixed timezones, just send it + # down the base constructor + pass + + elif inferred.startswith("timedelta"): + data = TimedeltaArray._from_sequence(subarr, copy=False) + return data, data.dtype + elif inferred == "period": + try: + data = PeriodArray._from_sequence(subarr) + return data, data.dtype + except IncompatibleFrequency: + pass + + return subarr, subarr.dtype + + +def _try_convert_to_int_array( + data: np.ndarray, copy: bool, dtype: np.dtype +) -> np.ndarray: + """ + Attempt to convert an array of data into an integer array. + + Parameters + ---------- + data : The data to convert. + copy : bool + Whether to copy the data or not. + dtype : np.dtype + + Returns + ------- + int_array : data converted to either an ndarray[int64] or ndarray[uint64] + + Raises + ------ + ValueError if the conversion was not successful. + """ + + if not is_unsigned_integer_dtype(dtype): + # skip int64 conversion attempt if uint-like dtype is passed, as + # this could return Int64Index when UInt64Index is what's desired + try: + res = data.astype("i8", copy=False) + if (res == data).all(): + return res # TODO: might still need to copy + except (OverflowError, TypeError, ValueError): + pass + + # Conversion to int64 failed (possibly due to overflow) or was skipped, + # so let's try now with uint64. + try: + res = data.astype("u8", copy=False) + if (res == data).all(): + return res # TODO: might still need to copy + except (OverflowError, TypeError, ValueError): + pass + + raise ValueError + + +def deprecate_ndim_indexing(result): + if np.ndim(result) > 1: + # GH#27125 indexer like idx[:, None] expands dim, but we + # cannot do that and keep an index, so return ndarray + # Deprecation GH#30588 + warnings.warn( + "Support for multi-dimensional indexing (e.g. `index[:, None]`) " + "on an Index is deprecated and will be removed in a future " + "version. 
Convert to a numpy array before indexing instead.", + DeprecationWarning, + stacklevel=3, + ) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index dd917a524e491..268ab9ba4e4c4 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,5 +1,5 @@ -import operator -from typing import Any +from typing import Any, List +import warnings import numpy as np @@ -8,9 +8,7 @@ from pandas._libs import index as libindex from pandas._libs.hashtable import duplicated_int64 from pandas._typing import AnyArrayLike -import pandas.compat as compat -from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( ensure_platform_int, @@ -26,10 +24,10 @@ from pandas.core import accessor from pandas.core.algorithms import take_1d from pandas.core.arrays.categorical import Categorical, _recode_for_categories, contains -from pandas.core.base import _shared_docs import pandas.core.common as com import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name +from pandas.core.indexes.extension import ExtensionIndex import pandas.core.missing as missing from pandas.core.ops import get_op_result_name @@ -37,6 +35,12 @@ _index_doc_kwargs.update(dict(target_klass="CategoricalIndex")) +@accessor.delegate_names( + delegate=Categorical, + accessors=["codes", "categories", "ordered"], + typ="property", + overwrite=True, +) @accessor.delegate_names( delegate=Categorical, accessors=[ @@ -50,11 +54,17 @@ "as_unordered", "min", "max", + "is_dtype_equal", + "tolist", + "_internal_get_values", + "_reverse_indexer", + "searchsorted", + "argsort", ], typ="method", overwrite=True, ) -class CategoricalIndex(Index, accessor.PandasDelegate): +class CategoricalIndex(ExtensionIndex, accessor.PandasDelegate): """ Index based on an underlying :class:`Categorical`. @@ -121,7 +131,7 @@ class CategoricalIndex(Index, accessor.PandasDelegate): Notes ----- See the `user guide - `_ + `_ for more. 
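# Sketch of the ``deprecate_ndim_indexing`` change above (GH#27125/GH#30588):
# dimension-expanding indexing on an Index warns and hands back a plain
# ndarray; converting to numpy first is the supported spelling.
import warnings
import numpy as np
import pandas as pd

idx = pd.Index([1, 2, 3])
with warnings.catch_warnings():
    warnings.simplefilter("ignore", DeprecationWarning)
    expanded = idx[:, None]            # deprecated; returns an ndarray here
preferred = np.asarray(idx)[:, None]   # convert to a numpy array first
assert expanded.shape == preferred.shape == (3, 1)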
Examples @@ -147,6 +157,20 @@ class CategoricalIndex(Index, accessor.PandasDelegate): _typ = "categoricalindex" + _raw_inherit = { + "argsort", + "_internal_get_values", + "tolist", + "codes", + "categories", + "ordered", + "_reverse_indexer", + "searchsorted", + } + + codes: np.ndarray + categories: Index + @property def _engine_type(self): # self.codes can have dtype int8, int16, int32 or int64, so we need @@ -164,19 +188,12 @@ def _engine_type(self): # Constructors def __new__( - cls, - data=None, - categories=None, - ordered=None, - dtype=None, - copy=False, - name=None, + cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None ): dtype = CategoricalDtype._from_values_or_dtype(data, categories, ordered, dtype) - if name is None and hasattr(data, "name"): - name = data.name + name = maybe_extract_name(name, data, cls) if not is_categorical_dtype(data): # don't allow scalars @@ -244,16 +261,15 @@ def _create_categorical(cls, data, dtype=None): return data @classmethod - def _simple_new(cls, values, name=None, dtype=None, **kwargs): + def _simple_new(cls, values, name=None, dtype=None): result = object.__new__(cls) values = cls._create_categorical(values, dtype=dtype) result._data = values result.name = name - for k, v in kwargs.items(): - setattr(result, k, v) result._reset_identity() + result._no_setting_name = False return result # -------------------------------------------------------------------- @@ -362,47 +378,22 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data - @property - def itemsize(self): - # Size of the items in categories, not codes. - return self.values.itemsize - def _wrap_setop_result(self, other, result): name = get_op_result_name(self, other) + # We use _shallow_copy rather than the Index implementation + # (which uses _constructor) in order to preserve dtype. return self._shallow_copy(result, name=name) - def _internal_get_values(self): - # override base Index version to get the numpy array representation of - # the underlying Categorical - return self._data._internal_get_values() - - def tolist(self): - return self._data.tolist() - - @property - def codes(self): - return self._data.codes - - @property - def categories(self): - return self._data.categories - - @property - def ordered(self): - return self._data.ordered - - def _reverse_indexer(self): - return self._data._reverse_indexer() - @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key) -> bool: + def __contains__(self, key: Any) -> bool: # if key is a NaN, check if any NaN is in self. if is_scalar(key) and isna(key): return self.hasnans + hash(key) return contains(self, key, container=self._engine) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ the array interface, return my values """ return np.array(self._data, dtype=dtype) @@ -418,7 +409,7 @@ def astype(self, dtype, copy=True): if dtype == self.dtype: return self.copy() if copy else self - return super().astype(dtype=dtype, copy=copy) + return Index.astype(self, dtype=dtype, copy=copy) @cache_readonly def _isnan(self): @@ -430,9 +421,6 @@ def fillna(self, value, downcast=None): self._assert_can_do_op(value) return CategoricalIndex(self._data.fillna(value), name=self.name) - def argsort(self, *args, **kwargs): - return self.values.argsort(*args, **kwargs) - @cache_readonly def _engine(self): # we are going to look things up with the codes themselves. 
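# The delegation decorators above surface Categorical attributes directly on
# the index; ``codes`` are the int8 positions into ``categories``, which is
# what the ``_engine`` lookup uses.
import pandas as pd

ci = pd.CategoricalIndex(["a", "b", "a"], categories=["a", "b", "c"])
print(ci.codes)              # [0 1 0] (int8)
print(list(ci.categories))   # ['a', 'b', 'c']
print(ci.ordered)            # False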
@@ -441,19 +429,6 @@ def _engine(self): codes = self.codes return self._engine_type(lambda: codes, len(self)) - # introspection - @cache_readonly - def is_unique(self) -> bool: - return self._engine.is_unique - - @property - def is_monotonic_increasing(self): - return self._engine.is_monotonic_increasing - - @property - def is_monotonic_decreasing(self) -> bool: - return self._engine.is_monotonic_decreasing - @Appender(_index_shared_docs["index_unique"] % _index_doc_kwargs) def unique(self, level=None): if level is not None: @@ -529,8 +504,8 @@ def get_value(self, series: AnyArrayLike, key: Any): Any The element of the series at the position indicated by the key """ + k = key try: - k = com.values_from_object(key) k = self._convert_scalar_indexer(k, kind="getitem") indexer = self.get_loc(k) return series.take([indexer])[0] @@ -540,11 +515,6 @@ def get_value(self, series: AnyArrayLike, key: Any): # we might be a positional inexer return super().get_value(series, key) - @Substitution(klass="CategoricalIndex") - @Appender(_shared_docs["searchsorted"]) - def searchsorted(self, value, side="left", sorter=None): - return self._data.searchsorted(value, side=side, sorter=sorter) - @Appender(_index_shared_docs["where"]) def where(self, cond, other=None): # TODO: Investigate an alternative implementation with @@ -584,6 +554,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): target = ibase.ensure_index(target) + missing: List[int] if self.equals(target): indexer = None missing = [] @@ -733,23 +704,14 @@ def _convert_arr_indexer(self, keyarr): def _convert_index_indexer(self, keyarr): return self._shallow_copy(keyarr) - @Appender(_index_shared_docs["take"] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): - nv.validate_take(tuple(), kwargs) - indices = ensure_platform_int(indices) - taken = self._assert_take_fillable( - self.codes, - indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=-1, + def take_nd(self, *args, **kwargs): + """Alias for `take`""" + warnings.warn( + "CategoricalIndex.take_nd is deprecated, use CategoricalIndex.take instead", + FutureWarning, + stacklevel=2, ) - return self._create_from_codes(taken) - - def is_dtype_equal(self, other): - return self._data.is_dtype_equal(other) - - take_nd = take + return self.take(*args, **kwargs) @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): @@ -837,7 +799,7 @@ def delete(self, loc): """ return self._create_from_codes(np.delete(self.codes, loc)) - def insert(self, loc, item): + def insert(self, loc: int, item): """ Make new Index inserting new item at location. 
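# ``take_nd`` above is now only a deprecated alias; ``take`` is the supported
# spelling and preserves the categorical dtype.
import pandas as pd

ci = pd.CategoricalIndex(["a", "b", "c"])
print(ci.take([2, 0]))   # CategoricalIndex(['c', 'a'], ...)
# ci.take_nd([2, 0]) emits a FutureWarning and forwards to take()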
Follows Python list.append semantics for negative values @@ -882,34 +844,10 @@ def _concat_same_dtype(self, to_concat, name): result.name = name return result - def _codes_for_groupby(self, sort, observed): - """ Return a Categorical adjusted for groupby """ - return self.values._codes_for_groupby(sort, observed) - - @classmethod - def _add_comparison_methods(cls): - """ add in comparison methods """ - - def _make_compare(op): - opname = f"__{op.__name__}__" - - def _evaluate_compare(self, other): - with np.errstate(all="ignore"): - result = op(self.array, other) - if isinstance(result, ABCSeries): - # Dispatch to pd.Categorical returned NotImplemented - # and we got a Series back; down-cast to ndarray - result = result._values - return result - - return compat.set_function_name(_evaluate_compare, opname, cls) - - cls.__eq__ = _make_compare(operator.eq) - cls.__ne__ = _make_compare(operator.ne) - cls.__lt__ = _make_compare(operator.lt) - cls.__gt__ = _make_compare(operator.gt) - cls.__le__ = _make_compare(operator.le) - cls.__ge__ = _make_compare(operator.ge) + def _delegate_property_get(self, name, *args, **kwargs): + """ method delegation to the ._values """ + prop = getattr(self._values, name) + return prop # no wrapping for now def _delegate_method(self, name, *args, **kwargs): """ method delegation to the ._values """ @@ -917,7 +855,7 @@ def _delegate_method(self, name, *args, **kwargs): if "inplace" in kwargs: raise ValueError("cannot use inplace with CategoricalIndex") res = method(*args, **kwargs) - if is_scalar(res): + if is_scalar(res) or name in self._raw_inherit: return res return CategoricalIndex(res, name=self.name) @@ -925,4 +863,3 @@ def _delegate_method(self, name, *args, **kwargs): CategoricalIndex._add_numeric_methods_add_sub_disabled() CategoricalIndex._add_numeric_methods_disabled() CategoricalIndex._add_logical_methods_disabled() -CategoricalIndex._add_comparison_methods() diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 3bf6dce00a031..1bfec9fbad0ed 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -2,12 +2,13 @@ Base and utility classes for tseries type pandas objects. 
""" import operator -from typing import List, Set +from typing import Any, List, Optional, Set import numpy as np -from pandas._libs import NaT, iNaT, lib +from pandas._libs import NaT, iNaT, join as libjoin, lib from pandas._libs.algos import unique_deltas +from pandas._libs.tslibs import timezones from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, cache_readonly @@ -15,134 +16,93 @@ from pandas.core.dtypes.common import ( ensure_int64, is_bool_dtype, + is_categorical_dtype, is_dtype_equal, is_float, is_integer, is_list_like, is_period_dtype, is_scalar, + needs_i8_conversion, ) +from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna -from pandas.core import algorithms, ops +from pandas.core import algorithms from pandas.core.accessor import PandasDelegate -from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin -from pandas.core.arrays.datetimelike import ( - DatetimeLikeArrayMixin, - _ensure_datetimelike_to_i8, -) +from pandas.core.arrays import DatetimeArray, ExtensionArray, TimedeltaArray +from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +from pandas.core.base import _shared_docs import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.extension import ( + ExtensionIndex, + inherit_names, + make_wrapped_arith_op, +) +from pandas.core.indexes.numeric import Int64Index +from pandas.core.ops import get_op_result_name from pandas.core.tools.timedeltas import to_timedelta -from pandas.tseries.frequencies import to_offset +from pandas.tseries.frequencies import DateOffset, to_offset _index_doc_kwargs = dict(ibase._index_doc_kwargs) -def ea_passthrough(array_method): +def _join_i8_wrapper(joinf, with_indexers: bool = True): """ - Make an alias for a method of the underlying ExtensionArray. - - Parameters - ---------- - array_method : method on an Array class - - Returns - ------- - method + Create the join wrapper methods. 
""" - def method(self, *args, **kwargs): - return array_method(self._data, *args, **kwargs) + @staticmethod # type: ignore + def wrapper(left, right): + if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + left = left.view("i8") + if isinstance(right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): + right = right.view("i8") - method.__name__ = array_method.__name__ - method.__doc__ = array_method.__doc__ - return method + results = joinf(left, right) + if with_indexers: + # dtype should be timedelta64[ns] for TimedeltaIndex + # and datetime64[ns] for DatetimeIndex + dtype = left.dtype.base + join_index, left_indexer, right_indexer = results + join_index = join_index.view(dtype) + return join_index, left_indexer, right_indexer + return results -def _make_wrapped_arith_op(opname): - def method(self, other): - meth = getattr(self._data, opname) - result = meth(maybe_unwrap_index(other)) - return wrap_arithmetic_op(self, other, result) + return wrapper - method.__name__ = opname - return method - -class DatetimeIndexOpsMixin(ExtensionOpsMixin): +@inherit_names( + ["inferred_freq", "_isnan", "_resolution", "resolution"], + DatetimeLikeArrayMixin, + cache=True, +) +@inherit_names( + ["__iter__", "mean", "freq", "freqstr", "_ndarray_values", "asi8", "_box_values"], + DatetimeLikeArrayMixin, +) +class DatetimeIndexOpsMixin(ExtensionIndex): """ Common ops mixin to support a unified interface datetimelike Index. """ _data: ExtensionArray + freq: Optional[DateOffset] + freqstr: Optional[str] + _resolution: int + _bool_ops: List[str] = [] + _field_ops: List[str] = [] - # DatetimeLikeArrayMixin assumes subclasses are mutable, so these are - # properties there. They can be made into cache_readonly for Index - # subclasses bc they are immutable - inferred_freq = cache_readonly( - DatetimeLikeArrayMixin.inferred_freq.fget # type: ignore - ) - _isnan = cache_readonly(DatetimeLikeArrayMixin._isnan.fget) # type: ignore hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) # type: ignore _hasnans = hasnans # for index / array -agnostic code - _resolution = cache_readonly( - DatetimeLikeArrayMixin._resolution.fget # type: ignore - ) - resolution = cache_readonly(DatetimeLikeArrayMixin.resolution.fget) # type: ignore - - _maybe_mask_results = ea_passthrough(DatetimeLikeArrayMixin._maybe_mask_results) - __iter__ = ea_passthrough(DatetimeLikeArrayMixin.__iter__) - mean = ea_passthrough(DatetimeLikeArrayMixin.mean) @property - def freq(self): - """ - Return the frequency object if it is set, otherwise None. - """ - return self._data.freq - - @property - def freqstr(self): - """ - Return the frequency object as a string if it is set, otherwise None. - """ - return self._data.freqstr - - def unique(self, level=None): - if level is not None: - self._validate_index_level(level) - - result = self._data.unique() - - # Note: if `self` is already unique, then self.unique() should share - # a `freq` with self. If not already unique, then self.freq must be - # None, so again sharing freq is correct. - return self._shallow_copy(result._data) - - @classmethod - def _create_comparison_method(cls, op): - """ - Create a comparison method that dispatches to ``cls.values``. - """ - - def wrapper(self, other): - if isinstance(other, ABCSeries): - # the arrays defer to Series for comparison ops but the indexes - # don't, so we have to unwrap here. 
- other = other._values - - result = op(self._data, maybe_unwrap_index(other)) - return result - - wrapper.__doc__ = op.__doc__ - wrapper.__name__ = f"__{op.__name__}__" - return wrapper - - @property - def _ndarray_values(self) -> np.ndarray: - return self._data._ndarray_values + def is_all_dates(self) -> bool: + return True # ------------------------------------------------------------------------ # Abstract data attributes @@ -152,11 +112,6 @@ def values(self): # Note: PeriodArray overrides this to return an ndarray of objects. return self._data._data - @property # type: ignore # https://github.com/python/mypy/issues/1362 - @Appender(DatetimeLikeArrayMixin.asi8.__doc__) - def asi8(self): - return self._data.asi8 - def __array_wrap__(self, result, context=None): """ Gets called after a ufunc. @@ -173,7 +128,7 @@ def __array_wrap__(self, result, context=None): # ------------------------------------------------------------------------ - def equals(self, other): + def equals(self, other) -> bool: """ Determines if two Index objects contain the same elements. """ @@ -196,66 +151,18 @@ def equals(self, other): # have different timezone return False - elif is_period_dtype(self): - if not is_period_dtype(other): - return False - if self.freq != other.freq: - return False - return np.array_equal(self.asi8, other.asi8) - @staticmethod - def _join_i8_wrapper(joinf, dtype, with_indexers=True): - """ - Create the join wrapper methods. - """ - from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin - - @staticmethod - def wrapper(left, right): - if isinstance( - left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin) - ): - left = left.view("i8") - if isinstance( - right, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin) - ): - right = right.view("i8") - results = joinf(left, right) - if with_indexers: - join_index, left_indexer, right_indexer = results - join_index = join_index.view(dtype) - return join_index, left_indexer, right_indexer - return results - - return wrapper - - def _ensure_localized( - self, arg, ambiguous="raise", nonexistent="raise", from_utc=False - ): - # See DatetimeLikeArrayMixin._ensure_localized.__doc__ - if getattr(self, "tz", None): - # ensure_localized is only relevant for tz-aware DTI - result = self._data._ensure_localized( - arg, ambiguous=ambiguous, nonexistent=nonexistent, from_utc=from_utc - ) - return type(self)._simple_new(result, name=self.name) - return arg - - def _box_values(self, values): - return self._data._box_values(values) - @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key): + def __contains__(self, key: Any) -> bool: + hash(key) try: res = self.get_loc(key) - return ( - is_scalar(res) - or isinstance(res, slice) - or (is_list_like(res) and len(res)) - ) except (KeyError, TypeError, ValueError): return False + return bool( + is_scalar(res) or isinstance(res, slice) or (is_list_like(res) and len(res)) + ) # Try to run function on index first, and then on elements of index # Especially important for group-by functionality @@ -288,20 +195,21 @@ def sort_values(self, return_indexer=False, ascending=True): # because the treatment of NaT has been changed to put NaT last # instead of first. 
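# ``__contains__`` above hashes the key up front and maps any lookup failure
# (KeyError/TypeError/ValueError from ``get_loc``) to a plain ``False``.
import pandas as pd

dti = pd.date_range("2020-01-01", periods=3)
assert "2020-01-02" in dti
assert "not a date" not in dti   # unparseable key -> False, not an exception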
sorted_values = np.sort(self.asi8) - attribs = self._get_attributes_dict() - freq = attribs["freq"] + freq = self.freq if freq is not None and not is_period_dtype(self): if freq.n > 0 and not ascending: freq = freq * -1 elif freq.n < 0 and ascending: freq = freq * -1 - attribs["freq"] = freq if not ascending: sorted_values = sorted_values[::-1] - return self._simple_new(sorted_values, **attribs) + arr = type(self._data)._simple_new( + sorted_values, dtype=self.dtype, freq=freq + ) + return self._simple_new(arr, name=self.name) @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): @@ -312,17 +220,21 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): if isinstance(maybe_slice, slice): return self[maybe_slice] - taken = self._assert_take_fillable( - self.asi8, - indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=iNaT, + return ExtensionIndex.take( + self, indices, axis, allow_fill, fill_value, **kwargs ) - # keep freq in PeriodArray/Index, reset otherwise - freq = self.freq if is_period_dtype(self) else None - return self._shallow_copy(taken, freq=freq) + @Appender(_shared_docs["searchsorted"]) + def searchsorted(self, value, side="left", sorter=None): + if isinstance(value, str): + raise TypeError( + "searchsorted requires compatible dtype or scalar, " + f"not {type(value).__name__}" + ) + if isinstance(value, Index): + value = value._data + + return self._data.searchsorted(value, side=side, sorter=sorter) _can_hold_na = True @@ -489,10 +401,10 @@ def _convert_scalar_indexer(self, key, kind=None): Parameters ---------- key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None + kind : {'loc', 'getitem', 'iloc'} or None """ - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] # we don't allow integer/float indexing for loc # we don't allow float indexing for ix/getitem @@ -501,56 +413,27 @@ def _convert_scalar_indexer(self, key, kind=None): is_flt = is_float(key) if kind in ["loc"] and (is_int or is_flt): self._invalid_indexer("index", key) - elif kind in ["ix", "getitem"] and is_flt: + elif kind in ["getitem"] and is_flt: self._invalid_indexer("index", key) return super()._convert_scalar_indexer(key, kind=kind) - @classmethod - def _add_datetimelike_methods(cls): - """ - Add in the datetimelike methods (as we may have to override the - superclass). 
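# ``searchsorted`` above rejects raw strings in favour of real scalars in the
# pandas version this patch targets (later releases relaxed this again), and
# unwraps Index arguments to their backing array.
import pandas as pd

dti = pd.date_range("2020-01-01", periods=3)
print(dti.searchsorted(pd.Timestamp("2020-01-02")))   # 1
try:
    dti.searchsorted("2020-01-02")
except TypeError as err:
    print(err)   # searchsorted requires compatible dtype or scalar ...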
- """ - - def __add__(self, other): - # dispatch to ExtensionArray implementation - result = self._data.__add__(maybe_unwrap_index(other)) - return wrap_arithmetic_op(self, other, result) - - cls.__add__ = __add__ - - def __radd__(self, other): - # alias for __add__ - return self.__add__(other) - - cls.__radd__ = __radd__ - - def __sub__(self, other): - # dispatch to ExtensionArray implementation - result = self._data.__sub__(maybe_unwrap_index(other)) - return wrap_arithmetic_op(self, other, result) - - cls.__sub__ = __sub__ - - def __rsub__(self, other): - result = self._data.__rsub__(maybe_unwrap_index(other)) - return wrap_arithmetic_op(self, other, result) - - cls.__rsub__ = __rsub__ - - __pow__ = _make_wrapped_arith_op("__pow__") - __rpow__ = _make_wrapped_arith_op("__rpow__") - __mul__ = _make_wrapped_arith_op("__mul__") - __rmul__ = _make_wrapped_arith_op("__rmul__") - __floordiv__ = _make_wrapped_arith_op("__floordiv__") - __rfloordiv__ = _make_wrapped_arith_op("__rfloordiv__") - __mod__ = _make_wrapped_arith_op("__mod__") - __rmod__ = _make_wrapped_arith_op("__rmod__") - __divmod__ = _make_wrapped_arith_op("__divmod__") - __rdivmod__ = _make_wrapped_arith_op("__rdivmod__") - __truediv__ = _make_wrapped_arith_op("__truediv__") - __rtruediv__ = _make_wrapped_arith_op("__rtruediv__") + __add__ = make_wrapped_arith_op("__add__") + __radd__ = make_wrapped_arith_op("__radd__") + __sub__ = make_wrapped_arith_op("__sub__") + __rsub__ = make_wrapped_arith_op("__rsub__") + __pow__ = make_wrapped_arith_op("__pow__") + __rpow__ = make_wrapped_arith_op("__rpow__") + __mul__ = make_wrapped_arith_op("__mul__") + __rmul__ = make_wrapped_arith_op("__rmul__") + __floordiv__ = make_wrapped_arith_op("__floordiv__") + __rfloordiv__ = make_wrapped_arith_op("__rfloordiv__") + __mod__ = make_wrapped_arith_op("__mod__") + __rmod__ = make_wrapped_arith_op("__rmod__") + __divmod__ = make_wrapped_arith_op("__divmod__") + __rdivmod__ = make_wrapped_arith_op("__rdivmod__") + __truediv__ = make_wrapped_arith_op("__truediv__") + __rtruediv__ = make_wrapped_arith_op("__rtruediv__") def isin(self, values, level=None): """ @@ -576,82 +459,29 @@ def isin(self, values, level=None): return algorithms.isin(self.asi8, values.asi8) - def intersection(self, other, sort=False): - self._validate_sort_keyword(sort) - self._assert_can_do_setop(other) - - if self.equals(other): - return self._get_reconciled_name_object(other) - - if len(self) == 0: - return self.copy() - if len(other) == 0: - return other.copy() - - if not isinstance(other, type(self)): - result = Index.intersection(self, other, sort=sort) - if isinstance(result, type(self)): - if result.freq is None: - # TODO: find a less code-smelly way to set this - result._data._freq = to_offset(result.inferred_freq) - return result - - elif ( - other.freq is None - or self.freq is None - or other.freq != self.freq - or not other.freq.isAnchored() - or (not self.is_monotonic or not other.is_monotonic) - ): - result = Index.intersection(self, other, sort=sort) - - # Invalidate the freq of `result`, which may not be correct at - # this point, depending on the values. 
+ @Appender(_index_shared_docs["where"] % _index_doc_kwargs) + def where(self, cond, other=None): + values = self.view("i8") - # TODO: find a less code-smelly way to set this - result._data._freq = None - if hasattr(self, "tz"): - result = self._shallow_copy( - result._values, name=result.name, tz=result.tz, freq=None - ) - else: - result = self._shallow_copy(result._values, name=result.name, freq=None) - if result.freq is None: - # TODO: find a less code-smelly way to set this - result._data._freq = to_offset(result.inferred_freq) - return result + if is_scalar(other) and isna(other): + other = NaT.value - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other else: - left, right = other, self + # Do type inference if necessary up front + # e.g. we passed PeriodIndex.values and got an ndarray of Periods + other = Index(other) - # after sorting, the intersection always starts with the right index - # and ends with the index of which the last elements is smallest - end = min(left[-1], right[-1]) - start = right[0] + if is_categorical_dtype(other): + # e.g. we have a Categorical holding self.dtype + if needs_i8_conversion(other.categories): + other = other._internal_get_values() - if end < start: - return type(self)(data=[]) - else: - lslice = slice(*left.slice_locs(start, end)) - left_chunk = left.values[lslice] - return self._shallow_copy(left_chunk) + if not is_dtype_equal(self.dtype, other.dtype): + raise TypeError(f"Where requires matching dtype, not {other.dtype}") - @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) - def repeat(self, repeats, axis=None): - nv.validate_repeat(tuple(), dict(axis=axis)) - freq = self.freq if is_period_dtype(self) else None - return self._shallow_copy(self.asi8.repeat(repeats), freq=freq) + other = other.view("i8") - @Appender(_index_shared_docs["where"] % _index_doc_kwargs) - def where(self, cond, other=None): - other = _ensure_datetimelike_to_i8(other, to_utc=True) - values = _ensure_datetimelike_to_i8(self, to_utc=True) result = np.where(cond, values, other).astype("i8") - - result = self._ensure_localized(result, from_utc=True) return self._shallow_copy(result) def _summary(self, name=None): @@ -688,34 +518,21 @@ def _concat_same_dtype(self, to_concat, name): """ Concatenate to_concat which has the same class. 
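# ``where`` above now operates on the i8 view, fills with NaT's integer
# sentinel, and insists on a matching dtype for ``other``.
import pandas as pd

dti = pd.date_range("2020-01-01", periods=3)
print(dti.where(dti > dti[0]))   # first element becomes NaT
try:
    dti.where(dti > dti[0], other=pd.Index([1, 2, 3]))
except TypeError as err:
    print(err)                   # Where requires matching dtype, not int64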
""" - attribs = self._get_attributes_dict() - attribs["name"] = name + # do not pass tz to set because tzlocal cannot be hashed if len({str(x.dtype) for x in to_concat}) != 1: raise ValueError("to_concat must have the same tz") - new_data = type(self._values)._concat_same_type(to_concat).asi8 - - # GH 3232: If the concat result is evenly spaced, we can retain the - # original frequency - is_diff_evenly_spaced = len(unique_deltas(new_data)) == 1 - if not is_period_dtype(self) and not is_diff_evenly_spaced: - # reset freq - attribs["freq"] = None - - return self._simple_new(new_data, **attribs) - - @Appender(_index_shared_docs["astype"]) - def astype(self, dtype, copy=True): - if is_dtype_equal(self.dtype, dtype) and copy is False: - # Ensure that self.astype(self.dtype) is self - return self + new_data = type(self._data)._concat_same_type(to_concat) - new_values = self._data.astype(dtype, copy=copy) + if not is_period_dtype(self.dtype): + # GH 3232: If the concat result is evenly spaced, we can retain the + # original frequency + is_diff_evenly_spaced = len(unique_deltas(new_data.asi8)) == 1 + if is_diff_evenly_spaced: + new_data._freq = self.freq - # pass copy=False because any copying will be done in the - # _data.astype call above - return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) + return self._simple_new(new_data, name=name) def shift(self, periods=1, freq=None): """ @@ -750,45 +567,389 @@ def shift(self, periods=1, freq=None): result = self._data._time_shift(periods, freq=freq) return type(self)(result, name=self.name) + # -------------------------------------------------------------------- + # List-like Methods + + def delete(self, loc): + new_i8s = np.delete(self.asi8, loc) + + freq = None + if is_period_dtype(self): + freq = self.freq + elif is_integer(loc): + if loc in (0, -len(self), -1, len(self) - 1): + freq = self.freq + else: + if is_list_like(loc): + loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) + if isinstance(loc, slice) and loc.step in (1, None): + if loc.start in (0, None) or loc.stop in (len(self), None): + freq = self.freq + + return self._shallow_copy(new_i8s, freq=freq) + + +class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): + """ + Mixin class for methods shared by DatetimeIndex and TimedeltaIndex, + but not PeriodIndex + """ + + # Compat for frequency inference, see GH#23789 + _is_monotonic_increasing = Index.is_monotonic_increasing + _is_monotonic_decreasing = Index.is_monotonic_decreasing + _is_unique = Index.is_unique + + def _set_freq(self, freq): + """ + Set the _freq attribute on our underlying DatetimeArray. + + Parameters + ---------- + freq : DateOffset, None, or "infer" + """ + # GH#29843 + if freq is None: + # Always valid + pass + elif len(self) == 0 and isinstance(freq, DateOffset): + # Always valid. In the TimedeltaIndex case, we assume this + # is a Tick offset. 
+ pass + else: + # As an internal method, we can ensure this assertion always holds + assert freq == "infer" + freq = to_offset(self.inferred_freq) + + self._data._freq = freq + + def _shallow_copy(self, values=None, **kwargs): + if values is None: + values = self._data + if isinstance(values, type(self)): + values = values._data + + attributes = self._get_attributes_dict() + + if "freq" not in kwargs and self.freq is not None: + if isinstance(values, (DatetimeArray, TimedeltaArray)): + if values.freq is None: + del attributes["freq"] + + attributes.update(kwargs) + return self._simple_new(values, **attributes) + + # -------------------------------------------------------------------- + # Set Operation Methods + + @Appender(Index.difference.__doc__) + def difference(self, other, sort=None): + new_idx = super().difference(other, sort=sort) + new_idx._set_freq(None) + return new_idx + + def intersection(self, other, sort=False): + """ + Specialized intersection for DatetimeIndex/TimedeltaIndex. + + May be much faster than Index.intersection + + Parameters + ---------- + other : Same type as self or array-like + sort : False or None, default False + Sort the resulting index if possible. + + .. versionadded:: 0.24.0 + + .. versionchanged:: 0.24.1 + + Changed the default to ``False`` to match the behaviour + from before 0.24.0. + + .. versionchanged:: 0.25.0 + + The `sort` keyword is added + + Returns + ------- + y : Index or same type as self + """ + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + + if self.equals(other): + return self._get_reconciled_name_object(other) + + if len(self) == 0: + return self.copy() + if len(other) == 0: + return other.copy() + + if not isinstance(other, type(self)): + result = Index.intersection(self, other, sort=sort) + if isinstance(result, type(self)): + if result.freq is None: + result._set_freq("infer") + return result + + elif ( + other.freq is None + or self.freq is None + or other.freq != self.freq + or not other.freq.is_anchored() + or (not self.is_monotonic or not other.is_monotonic) + ): + result = Index.intersection(self, other, sort=sort) + + # Invalidate the freq of `result`, which may not be correct at + # this point, depending on the values. 
+ + result._set_freq(None) + result = self._shallow_copy( + result._data, name=result.name, dtype=result.dtype, freq=None + ) + if result.freq is None: + result._set_freq("infer") + return result -def wrap_arithmetic_op(self, other, result): - if result is NotImplemented: - return NotImplemented + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self - if isinstance(result, tuple): - # divmod, rdivmod - assert len(result) == 2 - return ( - wrap_arithmetic_op(self, other, result[0]), - wrap_arithmetic_op(self, other, result[1]), + # after sorting, the intersection always starts with the right index + # and ends with the index of which the last elements is smallest + end = min(left[-1], right[-1]) + start = right[0] + + if end < start: + return type(self)(data=[]) + else: + lslice = slice(*left.slice_locs(start, end)) + left_chunk = left.values[lslice] + return self._shallow_copy(left_chunk) + + def _can_fast_union(self, other) -> bool: + if not isinstance(other, type(self)): + return False + + freq = self.freq + + if freq is None or freq != other.freq: + return False + + if not self.is_monotonic or not other.is_monotonic: + return False + + if len(self) == 0 or len(other) == 0: + return True + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + right_start = right[0] + left_end = left[-1] + + # Only need to "adjoin", not overlap + try: + return (right_start == left_end + freq) or right_start in left + except ValueError: + # if we are comparing a freq that does not propagate timezones + # this will raise + return False + + def _fast_union(self, other, sort=None): + if len(other) == 0: + return self.view(type(self)) + + if len(self) == 0: + return other.view(type(self)) + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + elif sort is False: + # TDIs are not in the "correct" order and we don't want + # to sort but want to remove overlaps + left, right = self, other + left_start = left[0] + loc = right.searchsorted(left_start, side="left") + right_chunk = right.values[:loc] + dates = concat_compat((left.values, right_chunk)) + return self._shallow_copy(dates) + else: + left, right = other, self + + left_end = left[-1] + right_end = right[-1] + + # concatenate + if left_end < right_end: + loc = right.searchsorted(left_end, side="right") + right_chunk = right.values[loc:] + dates = concat_compat((left.values, right_chunk)) + return self._shallow_copy(dates) + else: + return left + + def _union(self, other, sort): + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) + + # We are called by `union`, which is responsible for this validation + assert isinstance(other, type(self)) + + this, other = self._maybe_utc_convert(other) + + if this._can_fast_union(other): + return this._fast_union(other, sort=sort) + else: + result = Index._union(this, other, sort=sort) + if isinstance(result, type(self)): + assert result._data.dtype == this.dtype + if result.freq is None: + result._set_freq("infer") + return result + + # -------------------------------------------------------------------- + # Join Methods + _join_precedence = 10 + + _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) + _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) + _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) + 
_left_indexer_unique = _join_i8_wrapper( + libjoin.left_join_indexer_unique, with_indexers=False + ) + + def join( + self, other, how: str = "left", level=None, return_indexers=False, sort=False + ): + """ + See Index.join + """ + if self._is_convertible_to_index_for_join(other): + try: + other = type(self)(other) + except (TypeError, ValueError): + pass + + this, other = self._maybe_utc_convert(other) + return Index.join( + this, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, ) - if not isinstance(result, Index): - # Index.__new__ will choose appropriate subclass for dtype - result = Index(result) + def _maybe_utc_convert(self, other): + this = self + if not hasattr(self, "tz"): + return this, other - res_name = ops.get_op_result_name(self, other) - result.name = res_name - return result + if isinstance(other, type(self)): + if self.tz is not None: + if other.tz is None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + elif other.tz is not None: + raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") + if not timezones.tz_compare(self.tz, other.tz): + this = self.tz_convert("UTC") + other = other.tz_convert("UTC") + return this, other -def maybe_unwrap_index(obj): - """ - If operating against another Index object, we need to unwrap the underlying - data before deferring to the DatetimeArray/TimedeltaArray/PeriodArray - implementation, otherwise we will incorrectly return NotImplemented. + @classmethod + def _is_convertible_to_index_for_join(cls, other: Index) -> bool: + """ + return a boolean whether I can attempt conversion to a + DatetimeIndex/TimedeltaIndex + """ + if isinstance(other, cls): + return False + elif len(other) > 0 and other.inferred_type not in ( + "floating", + "mixed-integer", + "integer", + "integer-na", + "mixed-integer-float", + "mixed", + ): + return True + return False + + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + if self._can_fast_union(other): + joined = self._shallow_copy(joined) + joined.name = name + return joined + else: + kwargs = {} + if hasattr(self, "tz"): + kwargs["tz"] = getattr(other, "tz", None) + return self._simple_new(joined, name, **kwargs) - Parameters - ---------- - obj : object + # -------------------------------------------------------------------- + # List-Like Methods - Returns - ------- - unwrapped object - """ - if isinstance(obj, ABCIndexClass): - return obj._data - return obj + def insert(self, loc, item): + """ + Make new Index inserting new item at location + Parameters + ---------- + loc : int + item : object + if not either a Python datetime or a numpy integer-like, returned + Index dtype will be object rather than datetime. 
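# ``_maybe_utc_convert`` above refuses to join tz-naive with tz-aware indexes
# and otherwise normalizes both sides to UTC before joining.
import pandas as pd

naive = pd.date_range("2020-01-01", periods=3)
aware = naive.tz_localize("US/Eastern")
try:
    naive.join(aware)
except TypeError as err:
    print(err)   # Cannot join tz-naive with tz-aware DatetimeIndex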
+ Returns + ------- + new_index : Index + """ + if isinstance(item, self._data._recognized_scalars): + item = self._data._scalar_type(item) + elif is_valid_nat_for_dtype(item, self.dtype): + # GH 18295 + item = self._na_value + elif is_scalar(item) and isna(item): + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) + + freq = None + if isinstance(item, self._data._scalar_type) or item is NaT: + self._data._check_compatible_with(item, setitem=True) + + # check freq can be preserved on edge cases + if self.size and self.freq is not None: + if item is NaT: + pass + elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + freq = self.freq + elif (loc == len(self)) and item - self.freq == self[-1]: + freq = self.freq + item = item.asm8 + + try: + new_i8s = np.concatenate( + (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) + ) + return self._shallow_copy(new_i8s, freq=freq) + except (AttributeError, TypeError): + + # fall back to object index + if isinstance(item, str): + return self.astype(object).insert(loc, item) + raise TypeError( + f"cannot insert {type(self).__name__} with incompatible label" + ) class DatetimelikeDelegateMixin(PandasDelegate): @@ -798,8 +959,6 @@ class DatetimelikeDelegateMixin(PandasDelegate): Functionality is delegated from the Index class to an Array class. A few things can be customized - * _delegate_class : type - The class being delegated to. * _delegated_methods, delegated_properties : List The list of property / method names being delagated. * raw_methods : Set @@ -814,13 +973,8 @@ class DatetimelikeDelegateMixin(PandasDelegate): _raw_methods: Set[str] = set() # raw_properties : dispatch properties that shouldn't be boxed in an Index _raw_properties: Set[str] = set() - name = None _data: ExtensionArray - @property - def _delegate_class(self): - raise AbstractMethodError - def _delegate_property_get(self, name, *args, **kwargs): result = getattr(self._data, name) if name not in self._raw_properties: diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 1fd962dd24656..ee9b948a76ac8 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1,47 +1,43 @@ -from datetime import datetime, time, timedelta +from datetime import datetime, time, timedelta, tzinfo import operator +from typing import Optional import warnings import numpy as np -from pandas._libs import NaT, Timestamp, index as libindex, lib, tslib as libts -import pandas._libs.join as libjoin -from pandas._libs.tslibs import ccalendar, fields, parsing, timezones -from pandas.util._decorators import Appender, Substitution, cache_readonly - -from pandas.core.dtypes.common import ( - _NS_DTYPE, - ensure_int64, - is_float, - is_integer, - is_list_like, - is_scalar, +from pandas._libs import ( + NaT, + Timedelta, + Timestamp, + index as libindex, + lib, + tslib as libts, ) -from pandas.core.dtypes.concat import concat_compat +from pandas._libs.tslibs import ccalendar, fields, parsing, timezones +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.common import _NS_DTYPE, is_float, is_integer, is_scalar from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas.core.accessor import delegate_names from pandas.core.arrays.datetimes import ( DatetimeArray, - _to_M8, tz_to_dtype, validate_tz_from_dtype, ) -from pandas.core.base import _shared_docs import pandas.core.common as com -from pandas.core.indexes.base import Index 
+from pandas.core.indexes.base import Index, maybe_extract_name from pandas.core.indexes.datetimelike import ( - DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, - ea_passthrough, + DatetimeTimedeltaMixin, ) -from pandas.core.indexes.numeric import Int64Index +from pandas.core.indexes.extension import inherit_names from pandas.core.ops import get_op_result_name import pandas.core.tools.datetimes as tools from pandas.tseries.frequencies import Resolution, to_offset -from pandas.tseries.offsets import Nano, prefix_mapping +from pandas.tseries.offsets import prefix_mapping def _new_DatetimeIndex(cls, d): @@ -69,8 +65,14 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): # We also have a few "extra" attrs, which may or may not be raw, # which we we dont' want to expose in the .dt accessor. _extra_methods = ["to_period", "to_perioddelta", "to_julian_date", "strftime"] - _extra_raw_methods = ["to_pydatetime", "_local_timestamps", "_has_same_tz"] - _extra_raw_properties = ["_box_func", "tz", "tzinfo"] + _extra_raw_methods = [ + "to_pydatetime", + "_local_timestamps", + "_has_same_tz", + "_format_native_types", + "__iter__", + ] + _extra_raw_properties = ["_box_func", "tz", "tzinfo", "dtype"] _delegated_properties = DatetimeArray._datetimelike_ops + _extra_raw_properties _delegated_methods = ( DatetimeArray._datetimelike_methods + _extra_methods + _extra_raw_methods @@ -81,9 +83,19 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): | set(_extra_raw_properties) ) _raw_methods = set(_extra_raw_methods) - _delegate_class = DatetimeArray +@inherit_names(["_timezone", "is_normalized", "_resolution"], DatetimeArray, cache=True) +@inherit_names( + [ + "_bool_ops", + "_object_ops", + "_field_ops", + "_datetimelike_ops", + "_datetimelike_methods", + ], + DatetimeArray, +) @delegate_names( DatetimeArray, DatetimeDelegateMixin._delegated_properties, typ="property" ) @@ -91,9 +103,9 @@ class DatetimeDelegateMixin(DatetimelikeDelegateMixin): DatetimeArray, DatetimeDelegateMixin._delegated_methods, typ="method", - overwrite=False, + overwrite=True, ) -class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin): +class DatetimeIndex(DatetimeTimedeltaMixin, DatetimeDelegateMixin): """ Immutable ndarray of datetime64 data, represented internally as int64, and which can be boxed to Timestamp objects that are subclasses of datetime and @@ -192,41 +204,21 @@ class DatetimeIndex(DatetimeIndexOpsMixin, Int64Index, DatetimeDelegateMixin): Notes ----- To learn more about the frequency strings, please see `this link - `__. + `__. """ _typ = "datetimeindex" - _join_precedence = 10 - - def _join_i8_wrapper(joinf, **kwargs): - return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="M8[ns]", **kwargs) - - _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) - _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) - _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) - _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique, with_indexers=False - ) _engine_type = libindex.DatetimeEngine _supports_partial_string_indexing = True - _tz = None - _freq = None _comparables = ["name", "freqstr", "tz"] _attributes = ["name", "tz", "freq"] _is_numeric_dtype = False _infer_as_myclass = True - # Use faster implementation given we know we have DatetimeArrays - __iter__ = DatetimeArray.__iter__ - # some things like freq inference make use of these attributes. 
- _bool_ops = DatetimeArray._bool_ops - _object_ops = DatetimeArray._object_ops - _field_ops = DatetimeArray._field_ops - _datetimelike_ops = DatetimeArray._datetimelike_ops - _datetimelike_methods = DatetimeArray._datetimelike_methods + tz: Optional[tzinfo] # -------------------------------------------------------------------- # Constructors @@ -254,8 +246,7 @@ def __new__( # - Cases checked above all return/raise before reaching here - # - if name is None and hasattr(data, "name"): - name = data.name + name = maybe_extract_name(name, data, cls) dtarr = DatetimeArray._from_sequence( data, @@ -268,7 +259,7 @@ def __new__( ambiguous=ambiguous, ) - subarr = cls._simple_new(dtarr, name=name, freq=dtarr.freq, tz=dtarr.tz) + subarr = cls._simple_new(dtarr, name=name) return subarr @classmethod @@ -289,10 +280,6 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): freq = values.freq values = values._data - # DatetimeArray._simple_new will accept either i8 or M8[ns] dtypes - if isinstance(values, DatetimeIndex): - values = values._data - dtype = tz_to_dtype(tz) dtarr = DatetimeArray._simple_new(values, freq=freq, dtype=dtype) assert isinstance(dtarr, DatetimeArray) @@ -300,6 +287,7 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): result = object.__new__(cls) result._data = dtarr result.name = name + result._no_setting_name = False # For groupby perf. See note in indexes/base about _index_data result._index_data = dtarr._data result._reset_identity() @@ -307,43 +295,9 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): # -------------------------------------------------------------------- - def __array__(self, dtype=None): - if ( - dtype is None - and isinstance(self._data, DatetimeArray) - and getattr(self.dtype, "tz", None) - ): - msg = ( - "Converting timezone-aware DatetimeArray to timezone-naive " - "ndarray with 'datetime64[ns]' dtype. In the future, this " - "will return an ndarray with 'object' dtype where each " - "element is a 'pandas.Timestamp' with the correct 'tz'.\n\t" - "To accept the future behavior, pass 'dtype=object'.\n\t" - "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'." - ) - warnings.warn(msg, FutureWarning, stacklevel=3) - dtype = "M8[ns]" + def __array__(self, dtype=None) -> np.ndarray: return np.asarray(self._data, dtype=dtype) - @property - def dtype(self): - return self._data.dtype - - @property - def tz(self): - # GH 18595 - return self._data.tz - - @tz.setter - def tz(self, value): - # GH 3746: Prevent localizing or converting the index by setting tz - raise AttributeError( - "Cannot directly set timezone. Use tz_localize() " - "or tz_convert() as appropriate" - ) - - tzinfo = tz - @cache_readonly def _is_dates_only(self) -> bool: """ @@ -366,55 +320,14 @@ def __reduce__(self): d.update(self._get_attributes_dict()) return _new_DatetimeIndex, (type(self), d), None - def __setstate__(self, state): - """ - Necessary for making this object picklable. 
- """ - if isinstance(state, dict): - super().__setstate__(state) - - elif isinstance(state, tuple): - - # < 0.15 compat - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - - freq = own_state[1] - tz = timezones.tz_standardize(own_state[2]) - dtype = tz_to_dtype(tz) - dtarr = DatetimeArray._simple_new(data, freq=freq, dtype=dtype) - - self.name = own_state[0] - - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(data, state) - dtarr = DatetimeArray(data) - - self._data = dtarr - self._reset_identity() - - else: - raise Exception("invalid pickle state") - - _unpickle_compat = __setstate__ - def _convert_for_op(self, value): """ Convert value to be insertable to ndarray. """ if self._has_same_tz(value): - return _to_M8(value) + return Timestamp(value).asm8 raise ValueError("Passed item and index have different timezone") - @Appender(Index.difference.__doc__) - def difference(self, other, sort=None): - new_idx = super().difference(other, sort=sort) - new_idx._data._freq = None - return new_idx - # -------------------------------------------------------------------- # Rendering Methods @@ -422,15 +335,6 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return libts.ints_to_pydatetime(self.asi8, self.tz) - def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): - from pandas.io.formats.format import _get_format_datetime64_from_values - - fmt = _get_format_datetime64_from_values(self, date_format) - - return libts.format_array_from_datetime( - self.asi8, tz=self.tz, format=fmt, na_rep=na_rep - ) - @property def _formatter_func(self): from pandas.io.formats.format import _get_format_datetime64 @@ -441,35 +345,6 @@ def _formatter_func(self): # -------------------------------------------------------------------- # Set Operation Methods - def _union(self, other, sort): - if not len(other) or self.equals(other) or not len(self): - return super()._union(other, sort=sort) - - if len(other) == 0 or self.equals(other) or len(self) == 0: - return super().union(other, sort=sort) - - if not isinstance(other, DatetimeIndex): - try: - other = DatetimeIndex(other) - except TypeError: - pass - - this, other = self._maybe_utc_convert(other) - - if this._can_fast_union(other): - return this._fast_union(other, sort=sort) - else: - result = Index._union(this, other, sort=sort) - if isinstance(result, DatetimeIndex): - # TODO: we shouldn't be setting attributes like this; - # in all the tests this equality already holds - result._data._dtype = this.dtype - if result.freq is None and ( - this.freq is not None or other.freq is not None - ): - result._data._freq = to_offset(result.inferred_freq) - return result - def union_many(self, others): """ A bit of a hack to accelerate unioning a collection of indexes. 
@@ -500,102 +375,6 @@ def union_many(self, others): this._data._dtype = dtype return this - def _can_fast_union(self, other) -> bool: - if not isinstance(other, DatetimeIndex): - return False - - freq = self.freq - - if freq is None or freq != other.freq: - return False - - if not self.is_monotonic or not other.is_monotonic: - return False - - if len(self) == 0 or len(other) == 0: - return True - - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other - else: - left, right = other, self - - right_start = right[0] - left_end = left[-1] - - # Only need to "adjoin", not overlap - try: - return (right_start == left_end + freq) or right_start in left - except (ValueError): - - # if we are comparing a freq that does not propagate timezones - # this will raise - return False - - def _fast_union(self, other, sort=None): - if len(other) == 0: - return self.view(type(self)) - - if len(self) == 0: - return other.view(type(self)) - - # Both DTIs are monotonic. Check if they are already - # in the "correct" order - if self[0] <= other[0]: - left, right = self, other - # DTIs are not in the "correct" order and we don't want - # to sort but want to remove overlaps - elif sort is False: - left, right = self, other - left_start = left[0] - loc = right.searchsorted(left_start, side="left") - right_chunk = right.values[:loc] - dates = concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) - # DTIs are not in the "correct" order and we want - # to sort - else: - left, right = other, self - - left_end = left[-1] - right_end = right[-1] - - # TODO: consider re-implementing freq._should_cache for fastpath - - # concatenate dates - if left_end < right_end: - loc = right.searchsorted(left_end, side="right") - right_chunk = right.values[loc:] - dates = concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) - else: - return left - - def intersection(self, other, sort=False): - """ - Specialized intersection for DatetimeIndex objects. - May be much faster than Index.intersection - - Parameters - ---------- - other : DatetimeIndex or array-like - sort : False or None, default False - Sort the resulting index if possible. - - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default to ``False`` to match the behaviour - from before 0.24.0. - - Returns - ------- - Index or DatetimeIndex or TimedeltaIndex - """ - return super().intersection(other, sort=sort) - def _wrap_setop_result(self, other, result): name = get_op_result_name(self, other) return self._shallow_copy(result, name=name, freq=None, tz=self.tz) @@ -608,7 +387,7 @@ def _get_time_micros(self): values = self._data._local_timestamps() return fields.get_time_micros(values) - def to_series(self, keep_tz=lib._no_default, index=None, name=None): + def to_series(self, keep_tz=lib.no_default, index=None, name=None): """ Create a Series with both index and values equal to the index keys useful with map for returning an indexer based on an index. 
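to_series switches its keep_tz default from the private lib._no_default sentinel to lib.no_default; the underlying idiom is a module-level sentinel that distinguishes "argument not passed" from every real value, so the deprecation warning fires only when the keyword is given explicitly. A generic sketch of that idiom (the sentinel and function names here are illustrative, not pandas internals):

import warnings

_no_default = object()  # unique sentinel; only identity checks are meaningful

def to_series_like(keep_tz=_no_default):
    if keep_tz is not _no_default:
        # keep_tz was passed explicitly, so warn about the deprecation
        warnings.warn("'keep_tz' is deprecated", FutureWarning, stacklevel=2)
    else:
        keep_tz = True  # new default behaviour when nothing was passed
    return keep_tz

to_series_like()              # silent, returns True
to_series_like(keep_tz=True)  # warns even though the value matches the default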
@@ -653,7 +432,7 @@ def to_series(self, keep_tz=lib._no_default, index=None, name=None): if name is None: name = self.name - if keep_tz is not lib._no_default: + if keep_tz is not lib.no_default: if keep_tz: warnings.warn( "The 'keep_tz' keyword in DatetimeIndex.to_series " @@ -698,7 +477,7 @@ def snap(self, freq="S"): for i, v in enumerate(self): s = v - if not freq.onOffset(s): + if not freq.is_on_offset(s): t0 = freq.rollback(s) t1 = freq.rollforward(s) if abs(s - t0) < abs(t1 - s): @@ -710,68 +489,6 @@ def snap(self, freq="S"): # we know it conforms; skip check return DatetimeIndex._simple_new(snapped, name=self.name, tz=self.tz, freq=freq) - def join( - self, other, how: str = "left", level=None, return_indexers=False, sort=False - ): - """ - See Index.join - """ - if ( - not isinstance(other, DatetimeIndex) - and len(other) > 0 - and other.inferred_type - not in ( - "floating", - "integer", - "integer-na", - "mixed-integer", - "mixed-integer-float", - "mixed", - ) - ): - try: - other = DatetimeIndex(other) - except (TypeError, ValueError): - pass - - this, other = self._maybe_utc_convert(other) - return Index.join( - this, - other, - how=how, - level=level, - return_indexers=return_indexers, - sort=sort, - ) - - def _maybe_utc_convert(self, other): - this = self - if isinstance(other, DatetimeIndex): - if self.tz is not None: - if other.tz is None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - elif other.tz is not None: - raise TypeError("Cannot join tz-naive with tz-aware DatetimeIndex") - - if not timezones.tz_compare(self.tz, other.tz): - this = self.tz_convert("UTC") - other = other.tz_convert("UTC") - return this, other - - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - if ( - isinstance(other, DatetimeIndex) - and self.freq == other.freq - and self._can_fast_union(other) - ): - joined = self._shallow_copy(joined) - joined.name = name - return joined - else: - tz = getattr(other, "tz", None) - return self._simple_new(joined, name, tz=tz) - def _parsed_string_to_bounds(self, reso, parsed): """ Calculate datetime bounds for parsed time string and its resolution. 
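Two hedged behavioural notes on this part of the refactor, shown as a sketch rather than as part of the patch: snap() now calls the renamed DateOffset.is_on_offset (previously the camelCase onOffset), and the parsed-string bounds in the following hunk are built from a single Timedelta with a negative nanosecond component instead of `timedelta(...) - Nano(1)`:

from pandas import Timedelta, Timestamp
from pandas.tseries.frequencies import to_offset

# Renamed predicate used by snap(): is_on_offset instead of onOffset.
bday = to_offset("B")
print(bday.is_on_offset(Timestamp("2020-01-04")))  # False: a Saturday
print(bday.rollforward(Timestamp("2020-01-04")))   # 2020-01-06, next business day

# End-of-day bound expressed as one Timedelta, equivalent to the old
# `start + timedelta(days=1) - Nano(1)` construction.
start = Timestamp(2020, 1, 1)
end = start + Timedelta(days=1, nanoseconds=-1)
assert end == Timestamp("2020-01-01 23:59:59.999999999")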
@@ -804,27 +521,27 @@ def _parsed_string_to_bounds(self, reso, parsed): raise KeyError if reso == "year": start = Timestamp(parsed.year, 1, 1) - end = Timestamp(parsed.year, 12, 31, 23, 59, 59, 999999) + end = Timestamp(parsed.year + 1, 1, 1) - Timedelta(nanoseconds=1) elif reso == "month": d = ccalendar.get_days_in_month(parsed.year, parsed.month) start = Timestamp(parsed.year, parsed.month, 1) - end = Timestamp(parsed.year, parsed.month, d, 23, 59, 59, 999999) + end = start + Timedelta(days=d, nanoseconds=-1) elif reso == "quarter": qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead d = ccalendar.get_days_in_month(parsed.year, qe) # at end of month start = Timestamp(parsed.year, parsed.month, 1) - end = Timestamp(parsed.year, qe, d, 23, 59, 59, 999999) + end = Timestamp(parsed.year, qe, 1) + Timedelta(days=d, nanoseconds=-1) elif reso == "day": start = Timestamp(parsed.year, parsed.month, parsed.day) - end = start + timedelta(days=1) - Nano(1) + end = start + Timedelta(days=1, nanoseconds=-1) elif reso == "hour": start = Timestamp(parsed.year, parsed.month, parsed.day, parsed.hour) - end = start + timedelta(hours=1) - Nano(1) + end = start + Timedelta(hours=1, nanoseconds=-1) elif reso == "minute": start = Timestamp( parsed.year, parsed.month, parsed.day, parsed.hour, parsed.minute ) - end = start + timedelta(minutes=1) - Nano(1) + end = start + Timedelta(minutes=1, nanoseconds=-1) elif reso == "second": start = Timestamp( parsed.year, @@ -834,7 +551,7 @@ def _parsed_string_to_bounds(self, reso, parsed): parsed.minute, parsed.second, ) - end = start + timedelta(seconds=1) - Nano(1) + end = start + Timedelta(seconds=1, nanoseconds=-1) elif reso == "microsecond": start = Timestamp( parsed.year, @@ -845,7 +562,7 @@ def _parsed_string_to_bounds(self, reso, parsed): parsed.second, parsed.microsecond, ) - end = start + timedelta(microseconds=1) - Nano(1) + end = start + Timedelta(microseconds=1, nanoseconds=-1) # GH 24076 # If an incoming date string contained a UTC offset, need to localize # the parsed date to this offset first before aligning with the index's @@ -925,47 +642,32 @@ def get_value(self, series, key): know what you're doing """ - if isinstance(key, datetime): - - # needed to localize naive datetimes - if self.tz is not None: - if key.tzinfo is not None: - key = Timestamp(key).tz_convert(self.tz) - else: - key = Timestamp(key).tz_localize(self.tz) - + if isinstance(key, (datetime, np.datetime64)): return self.get_value_maybe_box(series, key) if isinstance(key, time): locs = self.indexer_at_time(key) return series.take(locs) - try: - return com.maybe_box(self, Index.get_value(self, series, key), series, key) - except KeyError: + if isinstance(key, str): try: loc = self._get_string_slice(key) return series[loc] except (TypeError, ValueError, KeyError): pass - try: - return self.get_value_maybe_box(series, key) - except (TypeError, ValueError, KeyError): + stamp = self._maybe_cast_for_get_loc(key) + loc = self.get_loc(stamp) + return series[loc] + except (KeyError, ValueError): raise KeyError(key) + value = Index.get_value(self, series, key) + return com.maybe_box(self, value, series, key) + def get_value_maybe_box(self, series, key): - # needed to localize naive datetimes - if self.tz is not None: - key = Timestamp(key) - if key.tzinfo is not None: - key = key.tz_convert(self.tz) - else: - key = key.tz_localize(self.tz) - elif not isinstance(key, Timestamp): - key = Timestamp(key) - values = self._engine.get_value(com.values_from_object(series), key, tz=self.tz) - return 
com.maybe_box(self, values, series, key) + loc = self.get_loc(key) + return self._get_values_for_loc(series, loc) def get_loc(self, key, method=None, tolerance=None): """ @@ -975,20 +677,31 @@ def get_loc(self, key, method=None, tolerance=None): ------- loc : int """ + if is_scalar(key) and isna(key): + key = NaT # FIXME: do this systematically if tolerance is not None: # try converting tolerance now, so errors don't get swallowed by # the try/except clauses below tolerance = self._convert_tolerance(tolerance, np.asarray(key)) - if isinstance(key, datetime): + if isinstance(key, (datetime, np.datetime64)): # needed to localize naive datetimes - if key.tzinfo is None: - key = Timestamp(key, tz=self.tz) - else: - key = Timestamp(key).tz_convert(self.tz) + key = self._maybe_cast_for_get_loc(key) return Index.get_loc(self, key, method, tolerance) + elif isinstance(key, str): + try: + return self._get_string_slice(key) + except (TypeError, KeyError, ValueError, OverflowError): + pass + + try: + stamp = self._maybe_cast_for_get_loc(key) + return Index.get_loc(self, stamp, method, tolerance) + except (KeyError, ValueError): + raise KeyError(key) + elif isinstance(key, timedelta): # GH#20464 raise TypeError( @@ -1002,28 +715,16 @@ def get_loc(self, key, method=None, tolerance=None): ) return self.indexer_at_time(key) - try: - return Index.get_loc(self, key, method, tolerance) - except (KeyError, ValueError, TypeError): - try: - return self._get_string_slice(key) - except (TypeError, KeyError, ValueError, OverflowError): - pass + return Index.get_loc(self, key, method, tolerance) - try: - stamp = Timestamp(key) - if stamp.tzinfo is not None and self.tz is not None: - stamp = stamp.tz_convert(self.tz) - else: - stamp = stamp.tz_localize(self.tz) - return Index.get_loc(self, stamp, method, tolerance) - except KeyError: - raise KeyError(key) - except ValueError as e: - # list-like tolerance size must match target index size - if "list-like" in str(e): - raise e - raise KeyError(key) + def _maybe_cast_for_get_loc(self, key): + # needed to localize naive datetimes + key = Timestamp(key) + if key.tzinfo is None: + key = key.tz_localize(self.tz) + else: + key = key.tz_convert(self.tz) + return key def _maybe_cast_slice_bound(self, label, side, kind): """ @@ -1033,7 +734,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} + kind : {'loc', 'getitem'} or None Returns ------- @@ -1043,14 +744,14 @@ def _maybe_cast_slice_bound(self, label, side, kind): ----- Value of `side` parameter should be validated in caller. 
""" - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] if is_float(label) or isinstance(label, time) or is_integer(label): self._invalid_indexer("slice", label) if isinstance(label, str): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) - _, parsed, reso = parsing.parse_time_string(label, freq) + parsed, reso = parsing.parse_time_string(label, freq) lower, upper = self._parsed_string_to_bounds(reso, parsed) # lower, upper form the half-open interval: # [parsed, parsed + 1 freq) @@ -1066,7 +767,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) - _, parsed, reso = parsing.parse_time_string(key, freq) + parsed, reso = parsing.parse_time_string(key, freq) loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs) return loc @@ -1122,45 +823,6 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): raise # -------------------------------------------------------------------- - # Wrapping DatetimeArray - - # Compat for frequency inference, see GH#23789 - _is_monotonic_increasing = Index.is_monotonic_increasing - _is_monotonic_decreasing = Index.is_monotonic_decreasing - _is_unique = Index.is_unique - - _timezone = cache_readonly(DatetimeArray._timezone.fget) # type: ignore - is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) # type: ignore - _resolution = cache_readonly(DatetimeArray._resolution.fget) # type: ignore - - _has_same_tz = ea_passthrough(DatetimeArray._has_same_tz) - - def __getitem__(self, key): - result = self._data.__getitem__(key) - if is_scalar(result): - return result - elif result.ndim > 1: - # To support MPL which performs slicing with 2 dim - # even though it only has 1 dim by definition - assert isinstance(result, np.ndarray), result - return result - return type(self)(result, name=self.name) - - @property - def _box_func(self): - return lambda x: Timestamp(x, tz=self.tz) - - # -------------------------------------------------------------------- - - @Substitution(klass="DatetimeIndex") - @Appender(_shared_docs["searchsorted"]) - def searchsorted(self, value, side="left", sorter=None): - if isinstance(value, (np.ndarray, Index)): - value = np.array(value, dtype=_NS_DTYPE, copy=False) - else: - value = _to_M8(value, tz=self.tz) - - return self.values.searchsorted(value, side=side) def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "datetime" @@ -1171,85 +833,6 @@ def inferred_type(self) -> str: # sure we can't have ambiguous indexing return "datetime64" - @property - def is_all_dates(self) -> bool: - return True - - def insert(self, loc, item): - """ - Make new Index inserting new item at location - - Parameters - ---------- - loc : int - item : object - if not either a Python datetime or a numpy integer-like, returned - Index dtype will be object rather than datetime. 
- - Returns - ------- - new_index : Index - """ - if is_scalar(item) and isna(item): - # GH 18295 - item = self._na_value - - freq = None - - if isinstance(item, (datetime, np.datetime64)): - self._assert_can_do_op(item) - if not self._has_same_tz(item) and not isna(item): - raise ValueError("Passed item and index have different timezone") - # check freq can be preserved on edge cases - if self.size and self.freq is not None: - if item is NaT: - pass - elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: - freq = self.freq - elif (loc == len(self)) and item - self.freq == self[-1]: - freq = self.freq - item = _to_M8(item, tz=self.tz) - - try: - new_dates = np.concatenate( - (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) - ) - return self._shallow_copy(new_dates, freq=freq) - except (AttributeError, TypeError): - - # fall back to object index - if isinstance(item, str): - return self.astype(object).insert(loc, item) - raise TypeError("cannot insert DatetimeIndex with incompatible label") - - def delete(self, loc): - """ - Make a new DatetimeIndex with passed location(s) deleted. - - Parameters - ---------- - loc: int, slice or array of ints - Indicate which sub-arrays to remove. - - Returns - ------- - new_index : DatetimeIndex - """ - new_dates = np.delete(self.asi8, loc) - - freq = None - if is_integer(loc): - if loc in (0, -len(self), -1, len(self) - 1): - freq = self.freq - else: - if is_list_like(loc): - loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) - if isinstance(loc, slice) and loc.step in (1, None): - if loc.start in (0, None) or loc.stop in (len(self), None): - freq = self.freq - - return self._shallow_copy(new_dates, freq=freq) - def indexer_at_time(self, time, asof=False): """ Return index locations of index values at particular time of day @@ -1338,10 +921,8 @@ def indexer_between_time( return mask.nonzero()[0] -DatetimeIndex._add_comparison_ops() DatetimeIndex._add_numeric_methods_disabled() DatetimeIndex._add_logical_methods_disabled() -DatetimeIndex._add_datetimelike_methods() def date_range( @@ -1354,7 +935,7 @@ def date_range( name=None, closed=None, **kwargs, -): +) -> DatetimeIndex: """ Return a fixed frequency DatetimeIndex. @@ -1403,7 +984,7 @@ def date_range( ``start`` and ``end`` (closed on both sides). To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- @@ -1505,7 +1086,7 @@ def date_range( closed=closed, **kwargs, ) - return DatetimeIndex._simple_new(dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name) + return DatetimeIndex._simple_new(dtarr, name=name) def bdate_range( @@ -1520,7 +1101,7 @@ def bdate_range( holidays=None, closed=None, **kwargs, -): +) -> DatetimeIndex: """ Return a fixed frequency DatetimeIndex, with business day as the default frequency. @@ -1574,7 +1155,7 @@ def bdate_range( desired. To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py new file mode 100644 index 0000000000000..db35cdb72979f --- /dev/null +++ b/pandas/core/indexes/extension.py @@ -0,0 +1,245 @@ +""" +Shared methods for Index subclasses backed by ExtensionArray. 
+""" +from typing import List + +import numpy as np + +from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, cache_readonly + +from pandas.core.dtypes.common import ensure_platform_int, is_dtype_equal +from pandas.core.dtypes.generic import ABCSeries + +from pandas.core.arrays import ExtensionArray +from pandas.core.indexes.base import Index, deprecate_ndim_indexing +from pandas.core.ops import get_op_result_name + + +def inherit_from_data(name: str, delegate, cache: bool = False): + """ + Make an alias for a method of the underlying ExtensionArray. + + Parameters + ---------- + name : str + Name of an attribute the class should inherit from its EA parent. + delegate : class + cache : bool, default False + Whether to convert wrapped properties into cache_readonly + + Returns + ------- + attribute, method, property, or cache_readonly + """ + + attr = getattr(delegate, name) + + if isinstance(attr, property): + if cache: + method = cache_readonly(attr.fget) + + else: + + def fget(self): + return getattr(self._data, name) + + def fset(self, value): + setattr(self._data, name, value) + + fget.__name__ = name + fget.__doc__ = attr.__doc__ + + method = property(fget, fset) + + elif not callable(attr): + # just a normal attribute, no wrapping + method = attr + + else: + + def method(self, *args, **kwargs): + result = attr(self._data, *args, **kwargs) + return result + + method.__name__ = name + method.__doc__ = attr.__doc__ + return method + + +def inherit_names(names: List[str], delegate, cache: bool = False): + """ + Class decorator to pin attributes from an ExtensionArray to a Index subclass. + + Parameters + ---------- + names : List[str] + delegate : class + cache : bool, default False + """ + + def wrapper(cls): + for name in names: + meth = inherit_from_data(name, delegate, cache=cache) + setattr(cls, name, meth) + + return cls + + return wrapper + + +def _make_wrapped_comparison_op(opname): + """ + Create a comparison method that dispatches to ``._data``. + """ + + def wrapper(self, other): + if isinstance(other, ABCSeries): + # the arrays defer to Series for comparison ops but the indexes + # don't, so we have to unwrap here. + other = other._values + + other = _maybe_unwrap_index(other) + + op = getattr(self._data, opname) + return op(other) + + wrapper.__name__ = opname + return wrapper + + +def make_wrapped_arith_op(opname): + def method(self, other): + meth = getattr(self._data, opname) + result = meth(_maybe_unwrap_index(other)) + return _wrap_arithmetic_op(self, other, result) + + method.__name__ = opname + return method + + +def _wrap_arithmetic_op(self, other, result): + if result is NotImplemented: + return NotImplemented + + if isinstance(result, tuple): + # divmod, rdivmod + assert len(result) == 2 + return ( + _wrap_arithmetic_op(self, other, result[0]), + _wrap_arithmetic_op(self, other, result[1]), + ) + + if not isinstance(result, Index): + # Index.__new__ will choose appropriate subclass for dtype + result = Index(result) + + res_name = get_op_result_name(self, other) + result.name = res_name + return result + + +def _maybe_unwrap_index(obj): + """ + If operating against another Index object, we need to unwrap the underlying + data before deferring to the DatetimeArray/TimedeltaArray/PeriodArray + implementation, otherwise we will incorrectly return NotImplemented. 
+ + Parameters + ---------- + obj : object + + Returns + ------- + unwrapped object + """ + if isinstance(obj, Index): + return obj._data + return obj + + +class ExtensionIndex(Index): + """ + Index subclass for indexes backed by ExtensionArray. + """ + + _data: ExtensionArray + + __eq__ = _make_wrapped_comparison_op("__eq__") + __ne__ = _make_wrapped_comparison_op("__ne__") + __lt__ = _make_wrapped_comparison_op("__lt__") + __gt__ = _make_wrapped_comparison_op("__gt__") + __le__ = _make_wrapped_comparison_op("__le__") + __ge__ = _make_wrapped_comparison_op("__ge__") + + def __getitem__(self, key): + result = self._data[key] + if isinstance(result, type(self._data)): + return type(self)(result, name=self.name) + + # Includes cases where we get a 2D ndarray back for MPL compat + deprecate_ndim_indexing(result) + return result + + def __iter__(self): + return self._data.__iter__() + + @property + def _ndarray_values(self) -> np.ndarray: + return self._data._ndarray_values + + @Appender(Index.dropna.__doc__) + def dropna(self, how="any"): + if how not in ("any", "all"): + raise ValueError(f"invalid how option: {how}") + + if self.hasnans: + return self._shallow_copy(self._data[~self._isnan]) + return self._shallow_copy() + + def repeat(self, repeats, axis=None): + nv.validate_repeat(tuple(), dict(axis=axis)) + result = self._data.repeat(repeats, axis=axis) + return self._shallow_copy(result) + + @Appender(Index.take.__doc__) + def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + nv.validate_take(tuple(), kwargs) + indices = ensure_platform_int(indices) + + taken = self._assert_take_fillable( + self._data, + indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value, + ) + return type(self)(taken, name=self.name) + + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + + result = self._data.unique() + return self._shallow_copy(result) + + def _get_unique_index(self, dropna=False): + if self.is_unique and not dropna: + return self + + result = self._data.unique() + if dropna and self.hasnans: + result = result[~result.isna()] + return self._shallow_copy(result) + + @Appender(Index.astype.__doc__) + def astype(self, dtype, copy=True): + if is_dtype_equal(self.dtype, dtype) and copy is False: + # Ensure that self.astype(self.dtype) is self + return self + + new_values = self._data.astype(dtype, copy=copy) + + # pass copy=False because any copying will be done in the + # _data.astype call above + return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index b61e80b9e89a7..3108c1a1afd0c 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -37,6 +37,7 @@ from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna +from pandas.core import accessor from pandas.core.algorithms import take_1d from pandas.core.arrays.interval import IntervalArray, _interval_shared_docs import pandas.core.common as com @@ -47,8 +48,10 @@ _index_shared_docs, default_pprint, ensure_index, + maybe_extract_name, ) from pandas.core.indexes.datetimes import DatetimeIndex, date_range +from pandas.core.indexes.extension import ExtensionIndex, inherit_names from pandas.core.indexes.multi import MultiIndex from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range from pandas.core.ops import get_op_result_name @@ -102,19 +105,6 @@ def _get_prev_label(label): raise 
TypeError(f"cannot determine next label for type {repr(type(label))}") -def _get_interval_closed_bounds(interval): - """ - Given an Interval or IntervalIndex, return the corresponding interval with - closed bounds. - """ - left, right = interval.left, interval.right - if interval.open_left: - left = _get_next_label(left) - if interval.open_right: - right = _get_prev_label(right) - return left, right - - def _new_IntervalIndex(cls, d): """ This is called upon unpickling, rather than the default which doesn't have @@ -193,7 +183,31 @@ def func(intvidx_self, other, sort=False): ), ) ) -class IntervalIndex(IntervalMixin, Index): +@accessor.delegate_names( + delegate=IntervalArray, + accessors=["length", "size", "left", "right", "mid", "closed", "dtype"], + typ="property", + overwrite=True, +) +@accessor.delegate_names( + delegate=IntervalArray, + accessors=[ + "__array__", + "overlaps", + "contains", + "__len__", + "set_closed", + "to_tuples", + ], + typ="method", + overwrite=True, +) +@inherit_names( + ["is_non_overlapping_monotonic", "mid", "_ndarray_values"], + IntervalArray, + cache=True, +) +class IntervalIndex(IntervalMixin, ExtensionIndex, accessor.PandasDelegate): _typ = "intervalindex" _comparables = ["name"] _attributes = ["name", "closed"] @@ -204,6 +218,8 @@ class IntervalIndex(IntervalMixin, Index): # Immutable, so we are able to cache computations like isna in '_mask' _mask = None + _raw_inherit = {"__array__", "overlaps", "contains"} + # -------------------------------------------------------------------- # Constructors @@ -217,8 +233,7 @@ def __new__( verify_integrity: bool = True, ): - if name is None and hasattr(data, "name"): - name = data.name + name = maybe_extract_name(name, data, cls) with rewrite_exception("IntervalArray", cls.__name__): array = IntervalArray( @@ -244,9 +259,12 @@ def _simple_new(cls, array, name, closed=None): closed : Any Ignored. """ + assert isinstance(array, IntervalArray), type(array) + result = IntervalMixin.__new__(cls) result._data = array result.name = name + result._no_setting_name = False result._reset_identity() return result @@ -356,7 +374,7 @@ def _engine(self): right = self._maybe_convert_i8(self.right) return IntervalTree(left, right, closed=self.closed) - def __contains__(self, key) -> bool: + def __contains__(self, key: Any) -> bool: """ return a boolean if this key is IN the index We *only* accept an Interval @@ -369,6 +387,7 @@ def __contains__(self, key) -> bool: ------- bool """ + hash(key) if not isinstance(key, Interval): return False @@ -378,98 +397,10 @@ def __contains__(self, key) -> bool: except KeyError: return False - @Appender( - _interval_shared_docs["to_tuples"] - % dict( - return_type="Index", - examples=""" - Examples - -------- - >>> idx = pd.IntervalIndex.from_arrays([0, np.nan, 2], [1, np.nan, 3]) - >>> idx.to_tuples() - Index([(0.0, 1.0), (nan, nan), (2.0, 3.0)], dtype='object') - >>> idx.to_tuples(na_tuple=False) - Index([(0.0, 1.0), nan, (2.0, 3.0)], dtype='object') - """, - ) - ) - def to_tuples(self, na_tuple=True): - tuples = self._data.to_tuples(na_tuple=na_tuple) - return Index(tuples) - @cache_readonly def _multiindex(self): return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"]) - @property - def left(self): - """ - Return the left endpoints of each Interval in the IntervalIndex as - an Index. - """ - return self._data._left - - @property - def right(self): - """ - Return the right endpoints of each Interval in the IntervalIndex as - an Index. 
- """ - return self._data._right - - @property - def closed(self): - """ - Whether the intervals are closed on the left-side, right-side, both or - neither. - """ - return self._data._closed - - @Appender( - _interval_shared_docs["set_closed"] - % dict( - klass="IntervalIndex", - examples=textwrap.dedent( - """\ - Examples - -------- - >>> index = pd.interval_range(0, 3) - >>> index - IntervalIndex([(0, 1], (1, 2], (2, 3]], - closed='right', - dtype='interval[int64]') - >>> index.set_closed('both') - IntervalIndex([[0, 1], [1, 2], [2, 3]], - closed='both', - dtype='interval[int64]') - """ - ), - ) - ) - def set_closed(self, closed): - if closed not in _VALID_CLOSED: - raise ValueError(f"invalid option for 'closed': {closed}") - - # return self._shallow_copy(closed=closed) - array = self._data.set_closed(closed) - return self._simple_new(array, self.name) - - @property - def length(self): - """ - Return an Index with entries denoting the length of each Interval in - the IntervalIndex. - """ - return self._data.length - - @property - def size(self): - # Avoid materializing ndarray[Interval] - return self._data.size - - def __len__(self) -> int: - return len(self.left) - @cache_readonly def values(self): """ @@ -477,20 +408,6 @@ def values(self): """ return self._data - @cache_readonly - def _values(self): - return self._data - - @cache_readonly - def _ndarray_values(self) -> np.ndarray: - return np.array(self._data) - - def __array__(self, result=None): - """ - The array interface, return my values. - """ - return self._ndarray_values - def __array_wrap__(self, result, context=None): # we don't want the superclass implementation return result @@ -500,31 +417,13 @@ def __reduce__(self): d.update(self._get_attributes_dict()) return _new_IntervalIndex, (type(self), d), None - @Appender(_index_shared_docs["copy"]) - def copy(self, deep=False, name=None): - array = self._data - if deep: - array = array.copy() - attributes = self._get_attributes_dict() - if name is not None: - attributes.update(name=name) - - return self._simple_new(array, **attributes) - @Appender(_index_shared_docs["astype"]) def astype(self, dtype, copy=True): with rewrite_exception("IntervalArray", type(self).__name__): new_values = self.values.astype(dtype, copy=copy) if is_interval_dtype(new_values): return self._shallow_copy(new_values.left, new_values.right) - return super().astype(dtype, copy=copy) - - @cache_readonly - def dtype(self): - """ - Return the dtype object of the underlying data. - """ - return self._data.dtype + return Index.astype(self, dtype, copy=copy) @property def inferred_type(self) -> str: @@ -537,29 +436,8 @@ def memory_usage(self, deep: bool = False) -> int: # so return the bytes here return self.left.memory_usage(deep=deep) + self.right.memory_usage(deep=deep) - @cache_readonly - def mid(self): - """ - Return the midpoint of each Interval in the IntervalIndex as an Index. 
- """ - return self._data.mid - - @cache_readonly - def is_monotonic(self) -> bool: - """ - Return True if the IntervalIndex is monotonic increasing (only equal or - increasing values), else False - """ - return self.is_monotonic_increasing - - @cache_readonly - def is_monotonic_increasing(self) -> bool: - """ - Return True if the IntervalIndex is monotonic increasing (only equal or - increasing values), else False - """ - return self._engine.is_monotonic_increasing - + # IntervalTree doesn't have a is_monotonic_decreasing, so have to override + # the Index implemenation @cache_readonly def is_monotonic_decreasing(self) -> bool: """ @@ -592,13 +470,8 @@ def is_unique(self): return True - @cache_readonly - @Appender(_interval_shared_docs["is_non_overlapping_monotonic"] % _index_doc_kwargs) - def is_non_overlapping_monotonic(self): - return self._data.is_non_overlapping_monotonic - @property - def is_overlapping(self): + def is_overlapping(self) -> bool: """ Return True if the IntervalIndex has overlapping intervals, else False. @@ -675,26 +548,6 @@ def _convert_list_indexer(self, keyarr, kind=None): return locs - def _maybe_cast_indexed(self, key): - """ - we need to cast the key, which could be a scalar - or an array-like to the type of our subtype - """ - if isinstance(key, IntervalIndex): - return key - - subtype = self.dtype.subtype - if is_float_dtype(subtype): - if is_integer(key): - key = float(key) - elif isinstance(key, (np.ndarray, Index)): - key = key.astype("float64") - elif is_integer_dtype(subtype): - if is_integer(key): - key = int(key) - - return key - def _can_reindex(self, indexer: np.ndarray) -> None: """ Check if we are allowing reindexing with this particular indexer. @@ -712,7 +565,7 @@ def _can_reindex(self, indexer: np.ndarray) -> None: if self.is_overlapping and len(indexer): raise ValueError("cannot reindex from an overlapping axis") - def _needs_i8_conversion(self, key): + def _needs_i8_conversion(self, key) -> bool: """ Check if a given key needs i8 conversion. Conversion is necessary for Timestamp, Timedelta, DatetimeIndex, and TimedeltaIndex keys. 
An @@ -827,34 +680,6 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): return sub_idx._searchsorted_monotonic(label, side) - def _find_non_overlapping_monotonic_bounds(self, key): - if isinstance(key, IntervalMixin): - start = self._searchsorted_monotonic( - key.left, "left", exclude_label=key.open_left - ) - stop = self._searchsorted_monotonic( - key.right, "right", exclude_label=key.open_right - ) - elif isinstance(key, slice): - # slice - start, stop = key.start, key.stop - if (key.step or 1) != 1: - raise NotImplementedError("cannot slice with a slice step") - if start is None: - start = 0 - else: - start = self._searchsorted_monotonic(start, "left") - if stop is None: - stop = len(self) - else: - stop = self._searchsorted_monotonic(stop, "right") - else: - # scalar or index-like - - start = self._searchsorted_monotonic(key, "left") - stop = self._searchsorted_monotonic(key, "right") - return start, stop - def get_loc( self, key: Any, method: Optional[str] = None, tolerance=None ) -> Union[int, slice, np.ndarray]: @@ -978,7 +803,7 @@ def get_indexer( right_indexer = self.right.get_indexer(target_as_index.right) indexer = np.where(left_indexer == right_indexer, left_indexer, -1) elif is_categorical(target_as_index): - # get an indexer for unique categories then propogate to codes via take_1d + # get an indexer for unique categories then propagate to codes via take_1d categories_indexer = self.get_indexer(target_as_index.categories) indexer = take_1d(categories_indexer, target_as_index.codes, fill_value=-1) elif not is_object_dtype(target_as_index): @@ -1146,8 +971,7 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): result = self._data.take( indices, axis=axis, allow_fill=allow_fill, fill_value=fill_value, **kwargs ) - attributes = self._get_attributes_dict() - return self._simple_new(result, **attributes) + return self._shallow_copy(result) def __getitem__(self, value): result = self._data[value] @@ -1215,7 +1039,7 @@ def _format_space(self) -> str: # -------------------------------------------------------------------- - def argsort(self, *args, **kwargs): + def argsort(self, *args, **kwargs) -> np.ndarray: return np.lexsort((self.right, self.left)) def equals(self, other) -> bool: @@ -1230,7 +1054,7 @@ def equals(self, other) -> bool: if not isinstance(other, IntervalIndex): if not is_interval_dtype(other): return False - other = Index(getattr(other, ".values", other)) + other = Index(other) return ( self.left.equals(other.left) @@ -1238,44 +1062,6 @@ def equals(self, other) -> bool: and self.closed == other.closed ) - @Appender( - _interval_shared_docs["contains"] - % dict( - klass="IntervalIndex", - examples=textwrap.dedent( - """\ - >>> intervals = pd.IntervalIndex.from_tuples([(0, 1), (1, 3), (2, 4)]) - >>> intervals - IntervalIndex([(0, 1], (1, 3], (2, 4]], - closed='right', - dtype='interval[int64]') - >>> intervals.contains(0.5) - array([ True, False, False]) - """ - ), - ) - ) - def contains(self, other): - return self._data.contains(other) - - @Appender( - _interval_shared_docs["overlaps"] - % dict( - klass="IntervalIndex", - examples=textwrap.dedent( - """\ - >>> intervals = pd.IntervalIndex.from_tuples([(0, 1), (1, 3), (2, 4)]) - >>> intervals - IntervalIndex([(0, 1], (1, 3], (2, 4]], - closed='right', - dtype='interval[int64]') - """ - ), - ) - ) - def overlaps(self, other): - return self._data.overlaps(other) - @Appender(_index_shared_docs["intersection"]) @SetopCheck(op_name="intersection") def intersection( @@ -1375,6 
+1161,34 @@ def is_all_dates(self) -> bool: # TODO: arithmetic operations + def _delegate_property_get(self, name, *args, **kwargs): + """ method delegation to the ._values """ + prop = getattr(self._data, name) + return prop # no wrapping for now + + def _delegate_method(self, name, *args, **kwargs): + """ method delegation to the ._data """ + method = getattr(self._data, name) + res = method(*args, **kwargs) + if is_scalar(res) or name in self._raw_inherit: + return res + if isinstance(res, IntervalArray): + return type(self)._simple_new(res, name=self.name) + return Index(res) + + # GH#30817 until IntervalArray implements inequalities, get them from Index + def __lt__(self, other): + return Index.__lt__(self, other) + + def __le__(self, other): + return Index.__le__(self, other) + + def __gt__(self, other): + return Index.__gt__(self, other) + + def __ge__(self, other): + return Index.__ge__(self, other) + IntervalIndex._add_logical_methods_disabled() @@ -1447,7 +1261,7 @@ def interval_range( ``start`` and ``end``, inclusively. To learn more about datetime-like frequency strings, please see `this link - `__. + `__. Examples -------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 05a4da28eb0a1..8682af6ab6369 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,7 +1,6 @@ -from collections import OrderedDict import datetime from sys import getsizeof -from typing import List, Optional +from typing import Any, Hashable, List, Optional, Sequence, Union import warnings import numpy as np @@ -62,8 +61,6 @@ dict(klass="MultiIndex", target_klass="MultiIndex or list of tuples") ) -_no_default_names = object() - class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine): """ @@ -206,7 +203,7 @@ class MultiIndex(Index): Notes ----- See the `user guide - `_ + `_ for more. Examples @@ -374,7 +371,7 @@ def _verify_integrity( return new_codes @classmethod - def from_arrays(cls, arrays, sortorder=None, names=_no_default_names): + def from_arrays(cls, arrays, sortorder=None, names=lib.no_default): """ Convert arrays to MultiIndex. @@ -428,7 +425,7 @@ def from_arrays(cls, arrays, sortorder=None, names=_no_default_names): raise ValueError("all arrays must be same length") codes, levels = factorize_from_iterables(arrays) - if names is _no_default_names: + if names is lib.no_default: names = [getattr(arr, "name", None) for arr in arrays] return MultiIndex( @@ -498,7 +495,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) @classmethod - def from_product(cls, iterables, sortorder=None, names=_no_default_names): + def from_product(cls, iterables, sortorder=None, names=lib.no_default): """ Make a MultiIndex from the cartesian product of multiple iterables. @@ -549,7 +546,7 @@ def from_product(cls, iterables, sortorder=None, names=_no_default_names): iterables = list(iterables) codes, levels = factorize_from_iterables(iterables) - if names is _no_default_names: + if names is lib.no_default: names = [getattr(it, "name", None) for it in iterables] codes = cartesian_product(codes) @@ -628,6 +625,9 @@ def levels(self): result = [ x._shallow_copy(name=name) for x, name in zip(self._levels, self._names) ] + for level in result: + # disallow midx.levels[0].name = "foo" + level._no_setting_name = True return FrozenList(result) @property @@ -659,31 +659,6 @@ def array(self): "'MultiIndex.to_numpy()' to get a NumPy array of tuples." 
) - @property - def _is_homogeneous_type(self) -> bool: - """ - Whether the levels of a MultiIndex all have the same dtype. - - This looks at the dtypes of the levels. - - See Also - -------- - Index._is_homogeneous_type : Whether the object has a single - dtype. - DataFrame._is_homogeneous_type : Whether all the columns in a - DataFrame have the same dtype. - - Examples - -------- - >>> MultiIndex.from_tuples([ - ... ('a', 'b'), ('a', 'c')])._is_homogeneous_type - True - >>> MultiIndex.from_tuples([ - ... ('a', 1), ('a', 2)])._is_homogeneous_type - False - """ - return len({x.dtype for x in self.levels}) <= 1 - def _set_levels( self, levels, level=None, copy=False, validate=True, verify_integrity=False ): @@ -743,32 +718,47 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): Examples -------- >>> idx = pd.MultiIndex.from_tuples([(1, 'one'), (1, 'two'), - (2, 'one'), (2, 'two')], + (2, 'one'), (2, 'two'), + (3, 'one'), (3, 'two')], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b'], [1, 2]]) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2]]) MultiIndex([('a', 1), ('a', 2), ('b', 1), - ('b', 2)], + ('b', 2), + ('c', 1), + ('c', 2)], names=['foo', 'bar']) - >>> idx.set_levels(['a', 'b'], level=0) + >>> idx.set_levels(['a', 'b', 'c'], level=0) MultiIndex([('a', 'one'), ('a', 'two'), ('b', 'one'), - ('b', 'two')], + ('b', 'two'), + ('c', 'one'), + ('c', 'two')], names=['foo', 'bar']) >>> idx.set_levels(['a', 'b'], level='bar') MultiIndex([(1, 'a'), (1, 'b'), (2, 'a'), - (2, 'b')], + (2, 'b'), + (3, 'a'), + (3, 'b')], names=['foo', 'bar']) - >>> idx.set_levels([['a', 'b'], [1, 2]], level=[0, 1]) + + If any of the levels passed to ``set_levels()`` exceeds the + existing length, all of the values from that argument will + be stored in the MultiIndex levels, though the values will + be truncated in the MultiIndex output. 
+ + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]) MultiIndex([('a', 1), ('a', 2), ('b', 1), ('b', 2)], names=['foo', 'bar']) + >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels + FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) """ if is_list_like(levels) and not isinstance(levels, Index): levels = list(levels) @@ -961,7 +951,7 @@ def copy( _set_identity=_set_identity, ) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ the array interface, return my values """ return self.values @@ -983,7 +973,7 @@ def _shallow_copy_with_infer(self, values, **kwargs): return self._shallow_copy(values, **kwargs) @Appender(_index_shared_docs["contains"] % _index_doc_kwargs) - def __contains__(self, key) -> bool: + def __contains__(self, key: Any) -> bool: hash(key) try: self.get_loc(key) @@ -1298,8 +1288,8 @@ def _get_level_number(self, level) -> int: if level < 0: orig_level = level - self.nlevels raise IndexError( - f"Too many levels: Index has only {self.nlevels} levels," - f" {orig_level} is not a valid level number" + f"Too many levels: Index has only {self.nlevels} levels, " + f"{orig_level} is not a valid level number" ) # Note: levels are zero-based elif level >= self.nlevels: @@ -1639,17 +1629,12 @@ def to_frame(self, index=True, name=None): else: idx_names = self.names - # Guarantee resulting column order + # Guarantee resulting column order - PY36+ dict maintains insertion order result = DataFrame( - OrderedDict( - [ - ( - (level if lvlname is None else lvlname), - self._get_level_values(level), - ) - for lvlname, level in zip(idx_names, range(len(self.levels))) - ] - ), + { + (level if lvlname is None else lvlname): self._get_level_values(level) + for lvlname, level in zip(idx_names, range(len(self.levels))) + }, copy=False, ) @@ -2028,7 +2013,7 @@ def append(self, other): except (TypeError, IndexError): return Index(new_tuples) - def argsort(self, *args, **kwargs): + def argsort(self, *args, **kwargs) -> np.ndarray: return self.values.argsort(*args, **kwargs) @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs) @@ -2065,7 +2050,7 @@ def drop(self, codes, level=None, errors="raise"): dropped : MultiIndex """ if level is not None: - return self._drop_from_level(codes, level) + return self._drop_from_level(codes, level, errors) if not isinstance(codes, (np.ndarray, Index)): try: @@ -2086,9 +2071,8 @@ def drop(self, codes, level=None, errors="raise"): elif com.is_bool_indexer(loc): if self.lexsort_depth == 0: warnings.warn( - "dropping on a non-lexsorted multi-index" - " without a level parameter may impact " - "performance.", + "dropping on a non-lexsorted multi-index " + "without a level parameter may impact performance.", PerformanceWarning, stacklevel=3, ) @@ -2103,13 +2087,15 @@ def drop(self, codes, level=None, errors="raise"): return self.delete(inds) - def _drop_from_level(self, codes, level): + def _drop_from_level(self, codes, level, errors="raise"): codes = com.index_labels_to_array(codes) i = self._get_level_number(level) index = self.levels[i] values = index.get_indexer(codes) mask = ~algos.isin(self.codes[i], values) + if mask.all() and errors != "ignore": + raise KeyError(f"labels {codes} not found in level") return self[mask] @@ -2185,8 +2171,8 @@ def reorder_levels(self, order): order = [self._get_level_number(i) for i in order] if len(order) != self.nlevels: raise AssertionError( - f"Length of order must be same as number of levels ({self.nlevels})," - f" got {len(order)}" + f"Length of order must 
be same as number of levels ({self.nlevels}), " + f"got {len(order)}" ) new_levels = [self.levels[i] for i in order] new_codes = [self.codes[i] for i in order] @@ -2430,7 +2416,53 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): return target, indexer - def get_slice_bound(self, label, side, kind): + def get_slice_bound( + self, label: Union[Hashable, Sequence[Hashable]], side: str, kind: str + ) -> int: + """ + For an ordered MultiIndex, compute slice bound + that corresponds to given label. + + Returns leftmost (one-past-the-rightmost if `side=='right') position + of given label. + + Parameters + ---------- + label : object or tuple of objects + side : {'left', 'right'} + kind : {'loc', 'getitem'} + + Returns + ------- + int + Index of label. + + Notes + ----- + This method only works if level 0 index of the MultiIndex is lexsorted. + + Examples + -------- + >>> mi = pd.MultiIndex.from_arrays([list('abbc'), list('gefd')]) + + Get the locations from the leftmost 'b' in the first level + until the end of the multiindex: + + >>> mi.get_slice_bound('b', side="left", kind="loc") + 1 + + Like above, but if you get the locations from the rightmost + 'b' in the first level and 'f' in the second level: + + >>> mi.get_slice_bound(('b','f'), side="right", kind="loc") + 3 + + See Also + -------- + MultiIndex.get_loc : Get location for a label or a tuple of labels. + MultiIndex.get_locs : Get location for a label/slice/list/mask or a + sequence of such. + """ if not isinstance(label, tuple): label = (label,) @@ -2495,8 +2527,8 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): def _partial_tup_index(self, tup, side="left"): if len(tup) > self.lexsort_depth: raise UnsortedIndexError( - f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth" - f" ({self.lexsort_depth})" + f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth " + f"({self.lexsort_depth})" ) n = len(tup) @@ -2505,7 +2537,7 @@ def _partial_tup_index(self, tup, side="left"): for k, (lab, lev, labs) in enumerate(zipped): section = labs[start:end] - if lab not in lev: + if lab not in lev and not isna(lab): if not lev.is_type_compatible(lib.infer_dtype([lab], skipna=False)): raise TypeError(f"Level type mismatch: {lab}") @@ -2515,13 +2547,38 @@ def _partial_tup_index(self, tup, side="left"): loc -= 1 return start + section.searchsorted(loc, side=side) - idx = lev.get_loc(lab) + idx = self._get_loc_single_level_index(lev, lab) if k < n - 1: end = start + section.searchsorted(idx, side="right") start = start + section.searchsorted(idx, side="left") else: return start + section.searchsorted(idx, side=side) + def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int: + """ + If key is NA value, location of index unify as -1. + + Parameters + ---------- + level_index: Index + key : label + + Returns + ------- + loc : int + If key is NA value, loc is -1 + Else, location of key in index. + + See Also + -------- + Index.get_loc : The get_loc method for (single-level) index. 
+ """ + + if is_scalar(key) and isna(key): + return -1 + else: + return level_index.get_loc(key) + def get_loc(self, key, method=None): """ Get location for a label or a tuple of labels as an integer, slice or @@ -2620,7 +2677,9 @@ def _maybe_to_slice(loc): loc = np.arange(start, stop, dtype="int64") for i, k in enumerate(follow_key, len(lead_key)): - mask = self.codes[i][loc] == self.levels[i].get_loc(k) + mask = self.codes[i][loc] == self._get_loc_single_level_index( + self.levels[i], k + ) if not mask.all(): loc = loc[mask] if not len(loc): @@ -2638,7 +2697,7 @@ def get_loc_level(self, key, level=0, drop_level: bool = True): key : label or sequence of labels level : int/level name or list thereof, optional drop_level : bool, default True - if ``False``, the resulting index will not drop any level. + If ``False``, the resulting index will not drop any level. Returns ------- @@ -2848,7 +2907,7 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes): else: - code = level_index.get_loc(key) + code = self._get_loc_single_level_index(level_index, key) if level > 0 or self.lexsort_depth == 0: # Desired level is not sorted @@ -2899,7 +2958,7 @@ def get_locs(self, seq): >>> mi.get_locs([[True, False, True], slice('e', 'f')]) # doctest: +SKIP array([2], dtype=int64) """ - from .numeric import Int64Index + from pandas.core.indexes.numeric import Int64Index # must be lexsorted to at least as many levels true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] @@ -3076,7 +3135,7 @@ def equals(self, other) -> bool: return True - def equal_levels(self, other): + def equal_levels(self, other) -> bool: """ Return True if the levels of both MultiIndex objects are the same @@ -3276,7 +3335,7 @@ def _convert_can_do_setop(self, other): result_names = self.names if self.names == other.names else None return other, result_names - def insert(self, loc, item): + def insert(self, loc: int, item): """ Make new MultiIndex inserting new item at location @@ -3343,14 +3402,11 @@ def isin(self, values, level=None): return algos.isin(self.values, values) else: num = self._get_level_number(level) - levs = self.levels[num] - level_codes = self.codes[num] + levs = self.get_level_values(num) - sought_labels = levs.isin(values).nonzero()[0] if levs.size == 0: - return np.zeros(len(level_codes), dtype=np.bool_) - else: - return np.lib.arraysetops.in1d(level_codes, sought_labels) + return np.zeros(len(levs), dtype=np.bool_) + return levs.isin(values) MultiIndex._add_numeric_methods_disabled() diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 00d81f3ed95a9..465f21da1278a 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING, Any + import numpy as np from pandas._libs import index as libindex, lib @@ -30,9 +32,17 @@ from pandas.core import algorithms import pandas.core.common as com -from pandas.core.indexes.base import Index, InvalidIndexError, _index_shared_docs +from pandas.core.indexes.base import ( + Index, + InvalidIndexError, + _index_shared_docs, + maybe_extract_name, +) from pandas.core.ops import get_op_result_name +if TYPE_CHECKING: + from pandas import Series + _num_index_shared_docs = dict() @@ -47,6 +57,7 @@ class NumericIndex(Index): def __new__(cls, data=None, dtype=None, copy=False, name=None): cls._validate_dtype(dtype) + name = maybe_extract_name(name, data, cls) # Coerce to ndarray if not already ndarray or Index if not isinstance(data, (np.ndarray, Index)): @@ 
-68,8 +79,11 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None): else: subarr = data - if name is None and hasattr(data, "name"): - name = data.name + if subarr.ndim > 1: + # GH#13601, GH#20285, GH#27125 + raise ValueError("Index data must be 1-dimensional") + + subarr = np.asarray(subarr) return cls._simple_new(subarr, name=name) @classmethod @@ -91,7 +105,7 @@ def _validate_dtype(cls, dtype: Dtype) -> None: @Appender(_index_shared_docs["_maybe_cast_slice_bound"]) def _maybe_cast_slice_bound(self, label, side, kind): - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] # we will try to coerce to integers return self._maybe_cast_indexer(label) @@ -152,7 +166,7 @@ def is_all_dates(self) -> bool: return False @Appender(Index.insert.__doc__) - def insert(self, loc, item): + def insert(self, loc: int, item): # treat NA values as nans: if is_scalar(item) and isna(item): item = self._na_value @@ -217,6 +231,8 @@ class IntegerIndex(NumericIndex): This is an abstract class for Int64Index, UInt64Index. """ + _default_dtype: np.dtype + def __contains__(self, key) -> bool: """ Check if key is a float and has a decimal. If it has, return False. @@ -229,36 +245,36 @@ def __contains__(self, key) -> bool: except (OverflowError, TypeError, ValueError): return False - -class Int64Index(IntegerIndex): - __doc__ = _num_index_shared_docs["class_descr"] % _int64_descr_args - - _typ = "int64index" - _can_hold_na = False - _engine_type = libindex.Int64Engine - _default_dtype = np.int64 - @property def inferred_type(self) -> str: """ - Always 'integer' for ``Int64Index`` + Always 'integer' for ``Int64Index`` and ``UInt64Index`` """ return "integer" @property def asi8(self) -> np.ndarray: # do not cache or you'll create a memory leak - return self.values.view("i8") + return self.values.view(self._default_dtype) @Appender(_index_shared_docs["_convert_scalar_indexer"]) def _convert_scalar_indexer(self, key, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] # don't coerce ilocs to integers if kind != "iloc": key = self._maybe_cast_indexer(key) return super()._convert_scalar_indexer(key, kind=kind) + +class Int64Index(IntegerIndex): + __doc__ = _num_index_shared_docs["class_descr"] % _int64_descr_args + + _typ = "int64index" + _can_hold_na = False + _engine_type = libindex.Int64Engine + _default_dtype = np.dtype(np.int64) + def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) return Int64Index(joined, name=name) @@ -293,28 +309,7 @@ class UInt64Index(IntegerIndex): _typ = "uint64index" _can_hold_na = False _engine_type = libindex.UInt64Engine - _default_dtype = np.uint64 - - @property - def inferred_type(self) -> str: - """ - Always 'integer' for ``UInt64Index`` - """ - return "integer" - - @property - def asi8(self) -> np.ndarray: - # do not cache or you'll create a memory leak - return self.values.view("u8") - - @Appender(_index_shared_docs["_convert_scalar_indexer"]) - def _convert_scalar_indexer(self, key, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] - - # don't coerce ilocs to integers - if kind != "iloc": - key = self._maybe_cast_indexer(key) - return super()._convert_scalar_indexer(key, kind=kind) + _default_dtype = np.dtype(np.uint64) @Appender(_index_shared_docs["_convert_arr_indexer"]) def _convert_arr_indexer(self, keyarr): @@ -396,7 +391,7 @@ def astype(self, dtype, copy=True): @Appender(_index_shared_docs["_convert_scalar_indexer"]) 
def _convert_scalar_indexer(self, key, kind=None): - assert kind in ["ix", "loc", "getitem", "iloc", None] + assert kind in ["loc", "getitem", "iloc", None] if kind == "iloc": return self._validate_indexer("positional", key, kind) @@ -430,17 +425,18 @@ def _format_native_types( ) return formatter.get_result_as_array() - def get_value(self, series, key): + def get_value(self, series: "Series", key): """ We always want to get an index value, never a value. """ if not is_scalar(key): raise InvalidIndexError - k = com.values_from_object(key) - loc = self.get_loc(k) - new_values = com.values_from_object(series)[loc] + loc = self.get_loc(key) + if not is_scalar(loc): + return series.iloc[loc] + new_values = series._values[loc] return new_values def equals(self, other) -> bool: @@ -465,38 +461,27 @@ def equals(self, other) -> bool: except (TypeError, ValueError): return False - def __contains__(self, other) -> bool: + def __contains__(self, other: Any) -> bool: + hash(other) if super().__contains__(other): return True - try: - # if other is a sequence this throws a ValueError - return np.isnan(other) and self.hasnans - except ValueError: - try: - return len(other) <= 1 and other.item() in self - except AttributeError: - return len(other) <= 1 and other in self - except TypeError: - pass - except TypeError: - pass - - return False + return is_float(other) and np.isnan(other) and self.hasnans @Appender(_index_shared_docs["get_loc"]) def get_loc(self, key, method=None, tolerance=None): - try: - if np.all(np.isnan(key)) or is_bool(key): - nan_idxs = self._nan_idxs - try: - return nan_idxs.item() - except ValueError: - if not len(nan_idxs): - raise KeyError(key) - return nan_idxs - except (TypeError, NotImplementedError): - pass + if is_bool(key): + # Catch this to avoid accidentally casting to 1.0 + raise KeyError(key) + + if is_float(key) and np.isnan(key): + nan_idxs = self._nan_idxs + if not len(nan_idxs): + raise KeyError(key) + elif len(nan_idxs) == 1: + return nan_idxs[0] + return nan_idxs + return super().get_loc(key, method=method, tolerance=tolerance) @cache_readonly diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 9485116a8084a..9d501b2601c09 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -1,39 +1,50 @@ from datetime import datetime, timedelta +from typing import Any import weakref import numpy as np from pandas._libs import index as libindex -from pandas._libs.tslibs import NaT, frequencies as libfrequencies, iNaT, resolution -from pandas._libs.tslibs.period import DIFFERENT_FREQ, IncompatibleFrequency, Period -from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas._libs.tslibs import NaT, frequencies as libfrequencies, resolution +from pandas._libs.tslibs.parsing import parse_time_string +from pandas._libs.tslibs.period import Period +from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( ensure_platform_int, is_bool_dtype, is_datetime64_any_dtype, + is_dtype_equal, is_float, - is_float_dtype, is_integer, is_integer_dtype, + is_list_like, + is_object_dtype, pandas_dtype, ) from pandas.core.accessor import delegate_names -from pandas.core.algorithms import unique1d -from pandas.core.arrays.period import PeriodArray, period_array, validate_dtype_freq -from pandas.core.base import _shared_docs +from pandas.core.arrays.period import ( + PeriodArray, + period_array, + raise_on_incompatible, + validate_dtype_freq, +) import pandas.core.common as com 
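The rewritten `Float64Index.__contains__` and `get_loc` above replace the nested try/except blocks with explicit checks: keys must be hashable, boolean keys raise `KeyError` instead of being coerced to 1.0, and NaN lookups are answered from the cached `_nan_idxs`. A short sketch of the resulting behavior, assuming a build with these changes:

    import numpy as np
    import pandas as pd

    idx = pd.Float64Index([1.5, np.nan, 3.0])
    print(np.nan in idx)        # True: the hasnans fast path answers NaN membership
    print(idx.get_loc(np.nan))  # 1: a single NaN position comes back as a scalar
    try:
        idx.get_loc(True)       # bools are rejected rather than treated as 1.0
    except KeyError:
        print("bool keys raise KeyError")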
import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import _index_shared_docs, ensure_index +from pandas.core.indexes.base import ( + _index_shared_docs, + ensure_index, + maybe_extract_name, +) from pandas.core.indexes.datetimelike import ( DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, ) -from pandas.core.indexes.datetimes import DatetimeIndex, Index, Int64Index -from pandas.core.missing import isna +from pandas.core.indexes.datetimes import DatetimeIndex, Index +from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name -from pandas.core.tools.datetimes import DateParseError, parse_time_string +from pandas.core.tools.datetimes import DateParseError from pandas.tseries import frequencies from pandas.tseries.offsets import DateOffset, Tick @@ -61,13 +72,11 @@ class PeriodDelegateMixin(DatetimelikeDelegateMixin): Delegate from PeriodIndex to PeriodArray. """ - _delegate_class = PeriodArray - _delegated_properties = PeriodArray._datetimelike_ops - _delegated_methods = set(PeriodArray._datetimelike_methods) | { - "_addsub_int_array", - "strftime", - } - _raw_properties = {"is_leap_year"} + _raw_methods = {"_format_native_types"} + _raw_properties = {"is_leap_year", "freq"} + + _delegated_properties = PeriodArray._datetimelike_ops + list(_raw_properties) + _delegated_methods = set(PeriodArray._datetimelike_methods) | _raw_methods @delegate_names(PeriodArray, PeriodDelegateMixin._delegated_properties, typ="property") @@ -76,8 +85,7 @@ class PeriodDelegateMixin(DatetimelikeDelegateMixin): ) class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): """ - Immutable ndarray holding ordinal values indicating regular periods in - time such as particular years, quarters, months, etc. + Immutable ndarray holding ordinal values indicating regular periods in time. Index keys are boxed to Period objects which carries the metadata (eg, frequency information). @@ -85,9 +93,9 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): Parameters ---------- data : array-like (1d int np.ndarray or PeriodArray), optional - Optional period-like data to construct index with + Optional period-like data to construct index with. copy : bool - Make a copy of input ndarray + Make a copy of input ndarray. freq : str or period object, optional One of pandas period strings or corresponding objects year : int, array, or Series, default None @@ -98,7 +106,7 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index, PeriodDelegateMixin): minute : int, array, or Series, default None second : int, array, or Series, default None tz : object, default None - Timezone for converting datetime64 data to Periods + Timezone for converting datetime64 data to Periods. dtype : str or PeriodDtype, default None Attributes @@ -184,8 +192,7 @@ def __new__( argument = list(set(fields) - valid_field_set)[0] raise TypeError(f"__new__() got an unexpected keyword argument {argument}") - if name is None and hasattr(data, "name"): - name = data.name + name = maybe_extract_name(name, data, cls) if data is None and ordinal is None: # range-based. @@ -226,24 +233,13 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): Parameters ---------- - values : PeriodArray, PeriodIndex, Index[int64], ndarray[int64] + values : PeriodArray Values that can be converted to a PeriodArray without inference or coercion. - """ - # TODO: raising on floats is tested, but maybe not useful. - # Should the callers know not to pass floats? 
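Several constructors in this patch (`NumericIndex.__new__` and `PeriodIndex.__new__` above, `RangeIndex.__new__` further down) now share the `maybe_extract_name` helper, and `NumericIndex` additionally rejects 2-D input. Roughly, assuming a build with these changes:

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 2, 3], name="x")
    print(pd.Int64Index(s).name)            # 'x': the name is taken from the data
    print(pd.Int64Index(s, name="y").name)  # 'y': an explicit name still wins
    try:
        pd.Int64Index(np.arange(6).reshape(2, 3))
    except ValueError as err:
        print(err)                          # Index data must be 1-dimensional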
- # At the very least, I think we can ensure that lists aren't passed. - if isinstance(values, list): - values = np.asarray(values) - if is_float_dtype(values): - raise TypeError("PeriodIndex._simple_new does not accept floats.") - if freq: - freq = Period._maybe_convert_freq(freq) - values = PeriodArray(values, freq=freq) + assert isinstance(values, PeriodArray), type(values) + assert freq is None or freq == values.freq, (freq, values.freq) - if not isinstance(values, PeriodArray): - raise TypeError("PeriodIndex._simple_new only accepts PeriodArray") result = object.__new__(cls) result._data = values # For groupby perf. See note in indexes/base about _index_data @@ -259,29 +255,20 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): def values(self): return np.asarray(self) - @property - def freq(self) -> DateOffset: - return self._data.freq - def _shallow_copy(self, values=None, **kwargs): # TODO: simplify, figure out type of values if values is None: values = self._data if isinstance(values, type(self)): - values = values._values + values = values._data if not isinstance(values, PeriodArray): - if isinstance(values, np.ndarray) and is_integer_dtype(values.dtype): + if isinstance(values, np.ndarray) and values.dtype == "i8": values = PeriodArray(values, freq=self.freq) else: - # in particular, I would like to avoid period_array here. - # Some people seem to be calling use with unexpected types - # Index.difference -> ndarray[Period] - # DatetimelikeIndexOpsMixin.repeat -> ndarray[ordinal] - # I think that once all of Datetime* are EAs, we can simplify - # this quite a bit. - values = period_array(values, freq=self.freq) + # GH#30713 this should never be reached + raise TypeError(type(values), getattr(values, "dtype", None)) # We don't allow changing `freq` in _shallow_copy. 
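`PeriodIndex._simple_new` above no longer infers anything: it asserts it was handed a `PeriodArray` with a compatible freq, leaving parsing to the public constructor. Both paths, sketched under the assumption of this branch of the code (`_simple_new` and `_data` are internal and are shown only because the hunk changes them):

    import pandas as pd

    # Public constructor: still parses and infers.
    pi = pd.PeriodIndex(["2019Q1", "2019Q2", "2019Q3"], freq="Q")

    # Internal fastpath: now requires an already-built PeriodArray.
    arr = pi._data                                   # a PeriodArray
    fast = pd.PeriodIndex._simple_new(arr, name="quarters")
    print(fast.freq, fast.name)                      # same freq, new name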
validate_dtype_freq(self.dtype, kwargs.get("freq")) @@ -341,10 +328,7 @@ def _maybe_convert_timedelta(self, other): if base == self.freq.rule_code: return other.n - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr - ) - raise IncompatibleFrequency(msg) + raise raise_on_incompatible(self, other) elif is_integer(other): # integer is passed to .shift via # _add_datetimelike_methods basically @@ -352,18 +336,11 @@ def _maybe_convert_timedelta(self, other): return other # raise when input doesn't have freq - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, own_freq=self.freqstr, other_freq=None - ) - raise IncompatibleFrequency(msg) + raise raise_on_incompatible(self, None) # ------------------------------------------------------------------------ # Rendering Methods - def _format_native_types(self, na_rep="NaT", quoting=None, **kwargs): - # just dispatch, return ndarray - return self._data._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) - def _mpl_repr(self): # how to represent ourselves to matplotlib return self.astype(object).values @@ -382,18 +359,18 @@ def _engine(self): return self._engine_type(period, len(self)) @Appender(_index_shared_docs["contains"]) - def __contains__(self, key) -> bool: + def __contains__(self, key: Any) -> bool: if isinstance(key, Period): if key.freq != self.freq: return False else: return key.ordinal in self._engine else: + hash(key) try: self.get_loc(key) return True - except (TypeError, KeyError): - # TypeError can be reached if we pass a tuple that is not hashable + except KeyError: return False @cache_readonly @@ -403,17 +380,7 @@ def _int64index(self): # ------------------------------------------------------------------------ # Index Methods - def _coerce_scalar_to_index(self, item): - """ - we need to coerce a scalar to a compat for our index type - - Parameters - ---------- - item : scalar item to coerce - """ - return PeriodIndex([item], **self._get_attributes_dict()) - - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: if is_integer_dtype(dtype): return self.asi8 else: @@ -488,30 +455,6 @@ def astype(self, dtype, copy=True, how="start"): # TODO: should probably raise on `how` here, so we don't ignore it. return super().astype(dtype, copy=copy) - @Substitution(klass="PeriodIndex") - @Appender(_shared_docs["searchsorted"]) - def searchsorted(self, value, side="left", sorter=None): - if isinstance(value, Period): - if value.freq != self.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=value.freqstr, - ) - raise IncompatibleFrequency(msg) - value = value.ordinal - elif isinstance(value, str): - try: - value = Period(value, freq=self.freq).ordinal - except DateParseError: - raise KeyError(f"Cannot interpret '{value}' as period") - - return self._ndarray_values.searchsorted(value, side=side, sorter=sorter) - - @property - def is_all_dates(self) -> bool: - return True - @property def is_full(self) -> bool: """ @@ -536,54 +479,54 @@ def get_value(self, series, key): Fast lookup of value from 1-dimensional ndarray. 
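`__contains__` above now hashes the key up front and only swallows `KeyError`, so unhashable keys raise instead of silently reporting False, while a Period with a different freq is still treated as absent. For instance, assuming a build with this change:

    import pandas as pd

    pi = pd.period_range("2020-01", periods=3, freq="M")
    print(pd.Period("2020-02", freq="M") in pi)  # True
    print(pd.Period("2020-02", freq="D") in pi)  # False: frequency mismatch
    print("2020-02" in pi)                       # True: strings go through get_loc
    # [] in pi now raises TypeError from hash() instead of returning False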
Only use this if you know what you're doing """ - s = com.values_from_object(series) - try: - return com.maybe_box(self, super().get_value(s, key), series, key) - except (KeyError, IndexError): - if isinstance(key, str): - asdt, parsed, reso = parse_time_string(key, self.freq) - grp = resolution.Resolution.get_freq_group(reso) - freqn = resolution.get_freq_group(self.freq) - - vals = self._ndarray_values - - # if our data is higher resolution than requested key, slice - if grp < freqn: - iv = Period(asdt, freq=(grp, 1)) - ord1 = iv.asfreq(self.freq, how="S").ordinal - ord2 = iv.asfreq(self.freq, how="E").ordinal - - if ord2 < vals[0] or ord1 > vals[-1]: - raise KeyError(key) - - pos = np.searchsorted(self._ndarray_values, [ord1, ord2]) - key = slice(pos[0], pos[1] + 1) - return series[key] - elif grp == freqn: - key = Period(asdt, freq=self.freq).ordinal - return com.maybe_box( - self, self._int64index.get_value(s, key), series, key - ) - else: + if is_integer(key): + return series.iat[key] + + if isinstance(key, str): + asdt, reso = parse_time_string(key, self.freq) + grp = resolution.Resolution.get_freq_group(reso) + freqn = resolution.get_freq_group(self.freq) + + vals = self._ndarray_values + + # if our data is higher resolution than requested key, slice + if grp < freqn: + iv = Period(asdt, freq=(grp, 1)) + ord1 = iv.asfreq(self.freq, how="S").ordinal + ord2 = iv.asfreq(self.freq, how="E").ordinal + + if ord2 < vals[0] or ord1 > vals[-1]: raise KeyError(key) - period = Period(key, self.freq) - key = period.value if isna(period) else period.ordinal - return com.maybe_box(self, self._int64index.get_value(s, key), series, key) + pos = np.searchsorted(self._ndarray_values, [ord1, ord2]) + key = slice(pos[0], pos[1] + 1) + return series[key] + elif grp == freqn: + key = Period(asdt, freq=self.freq) + loc = self.get_loc(key) + return series.iloc[loc] + else: + raise KeyError(key) + + elif isinstance(key, Period) or key is NaT: + ordinal = key.ordinal if key is not NaT else NaT.value + loc = self._engine.get_loc(ordinal) + return series[loc] + + # slice, PeriodIndex, np.ndarray, List[Period] + value = Index.get_value(self, series, key) + return com.maybe_box(self, value, series, key) @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ensure_index(target) - if hasattr(target, "freq") and target.freq != self.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=target.freqstr, - ) - raise IncompatibleFrequency(msg) - if isinstance(target, PeriodIndex): + if target.freq != self.freq: + # No matches + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches + target = target.asi8 self_index = self._int64index else: @@ -598,71 +541,63 @@ def get_indexer_non_unique(self, target): target = ensure_index(target) if isinstance(target, PeriodIndex): + if target.freq != self.freq: + no_matches = -1 * np.ones(self.shape, dtype=np.intp) + return no_matches, no_matches + target = target.asi8 - if hasattr(target, "freq") and target.freq != self.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, - own_freq=self.freqstr, - other_freq=target.freqstr, - ) - raise IncompatibleFrequency(msg) indexer, missing = self._int64index.get_indexer_non_unique(target) return ensure_platform_int(indexer), missing - def _get_unique_index(self, dropna=False): - """ - wrap Index._get_unique_index to handle NaT - """ - res = 
super()._get_unique_index(dropna=dropna) - if dropna: - res = res.dropna() - return res - - @Appender(Index.unique.__doc__) - def unique(self, level=None): - # override the Index.unique method for performance GH#23083 - if level is not None: - # this should never occur, but is retained to make the signature - # match Index.unique - self._validate_index_level(level) - - values = self._ndarray_values - result = unique1d(values) - return self._shallow_copy(result) - def get_loc(self, key, method=None, tolerance=None): """ - Get integer location for requested label + Get integer location for requested label. + + Parameters + ---------- + key : Period, NaT, str, or datetime + String or datetime key must be parseable as Period. Returns ------- - loc : int + loc : int or ndarray[int64] + + Raises + ------ + KeyError + Key is not present in the index. + TypeError + If key is listlike or otherwise not hashable. """ - try: - return self._engine.get_loc(key) - except KeyError: - if is_integer(key): - raise + if isinstance(key, str): try: - asdt, parsed, reso = parse_time_string(key, self.freq) + asdt, reso = parse_time_string(key, self.freq) key = asdt - except TypeError: - pass except DateParseError: # A string with invalid format raise KeyError(f"Cannot interpret '{key}' as period") - try: - key = Period(key, freq=self.freq) - except ValueError: - # we cannot construct the Period - # as we have an invalid type - raise KeyError(key) + elif is_integer(key): + # Period constructor will cast to string, which we dont want + raise KeyError(key) + + try: + key = Period(key, freq=self.freq) + except ValueError: + # we cannot construct the Period + # as we have an invalid type + if is_list_like(key): + raise TypeError(f"'{key}' is an invalid key") + raise KeyError(key) + + ordinal = key.ordinal if key is not NaT else key.value + try: + return self._engine.get_loc(ordinal) + except KeyError: try: - ordinal = iNaT if key is NaT else key.ordinal if tolerance is not None: tolerance = self._convert_tolerance(tolerance, np.asarray(key)) return self._int64index.get_loc(ordinal, method, tolerance) @@ -679,7 +614,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} + kind : {'loc', 'getitem'} Returns ------- @@ -690,13 +625,13 @@ def _maybe_cast_slice_bound(self, label, side, kind): Value of `side` parameter should be validated in caller. 
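The rewritten `get_loc` above normalizes the key first (parse strings, reject integers and list-likes, build a `Period`) and only then consults the ordinal engine, instead of parsing inside an exception handler. Roughly, assuming a build with this change:

    import pandas as pd

    pi = pd.period_range("2020-01", periods=3, freq="M")
    print(pi.get_loc("2020-02"))                  # 1: parsed as a monthly Period
    print(pi.get_loc(pd.Period("2020-03", "M")))  # 2
    try:
        pi.get_loc(0)                             # integers are no longer stringified
    except KeyError:
        print("integer keys raise KeyError")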
""" - assert kind in ["ix", "loc", "getitem"] + assert kind in ["loc", "getitem"] if isinstance(label, datetime): return Period(label, freq=self.freq) elif isinstance(label, str): try: - _, parsed, reso = parse_time_string(label, self.freq) + parsed, reso = parse_time_string(label, self.freq) bounds = self._parsed_string_to_bounds(reso, parsed) return bounds[0 if side == "left" else 1] except ValueError: @@ -749,11 +684,12 @@ def _parsed_string_to_bounds(self, reso, parsed): raise KeyError(reso) return (t1.asfreq(self.freq, how="start"), t1.asfreq(self.freq, how="end")) - def _get_string_slice(self, key): + def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): + # TODO: Check for non-True use_lhs/use_rhs if not self.is_monotonic: raise ValueError("Partial indexing only valid for ordered time series") - key, parsed, reso = parse_time_string(key, self.freq) + parsed, reso = parse_time_string(key, self.freq) grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) if reso in ["day", "hour", "minute", "second"] and not grp < freqn: @@ -761,8 +697,7 @@ def _get_string_slice(self, key): t1, t2 = self._parsed_string_to_bounds(reso, parsed) return slice( - self.searchsorted(t1.ordinal, side="left"), - self.searchsorted(t2.ordinal, side="right"), + self.searchsorted(t1, side="left"), self.searchsorted(t2, side="right") ) def _convert_tolerance(self, tolerance, target): @@ -805,9 +740,8 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) return self._apply_meta(result), lidx, ridx return self._apply_meta(result) - @Appender(Index.intersection.__doc__) - def intersection(self, other, sort=False): - return Index.intersection(self, other, sort=sort) + # ------------------------------------------------------------------------ + # Set Operation Methods def _assert_can_do_setop(self, other): super()._assert_can_do_setop(other) @@ -815,51 +749,81 @@ def _assert_can_do_setop(self, other): # *Can't* use PeriodIndexes of different freqs # *Can* use PeriodIndex/DatetimeIndex if isinstance(other, PeriodIndex) and self.freq != other.freq: - msg = DIFFERENT_FREQ.format( - cls=type(self).__name__, own_freq=self.freqstr, other_freq=other.freqstr - ) - raise IncompatibleFrequency(msg) + raise raise_on_incompatible(self, other) - def _wrap_setop_result(self, other, result): - name = get_op_result_name(self, other) - result = self._apply_meta(result) - result.name = name + def intersection(self, other, sort=False): + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + res_name = get_op_result_name(self, other) + other = ensure_index(other) + + if self.equals(other): + return self._get_reconciled_name_object(other) + + if not is_dtype_equal(self.dtype, other.dtype): + # TODO: fastpath for if we have a different PeriodDtype + this = self.astype("O") + other = other.astype("O") + return this.intersection(other, sort=sort) + + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) + i8result = i8self.intersection(i8other, sort=sort) + + result = self._shallow_copy(np.asarray(i8result, dtype=np.int64), name=res_name) return result - def _apply_meta(self, rawarr): - if not isinstance(rawarr, PeriodIndex): - rawarr = PeriodIndex._simple_new(rawarr, freq=self.freq, name=self.name) - return rawarr + def difference(self, other, sort=None): + self._validate_sort_keyword(sort) + self._assert_can_do_setop(other) + res_name = get_op_result_name(self, other) + other = 
ensure_index(other) - def __setstate__(self, state): - """Necessary for making this object picklable""" + if self.equals(other): + # pass an empty PeriodArray with the appropriate dtype + return self._shallow_copy(self._data[:0]) - if isinstance(state, dict): - super().__setstate__(state) + if is_object_dtype(other): + return self.astype(object).difference(other).astype(self.dtype) - elif isinstance(state, tuple): + elif not is_dtype_equal(self.dtype, other.dtype): + return self - # < 0.15 compat - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) + i8result = i8self.difference(i8other, sort=sort) - # backcompat - freq = Period._maybe_convert_freq(own_state[1]) + result = self._shallow_copy(np.asarray(i8result, dtype=np.int64), name=res_name) + return result - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(self, state) - freq = None # ? + def _union(self, other, sort): + if not len(other) or self.equals(other) or not len(self): + return super()._union(other, sort=sort) - data = PeriodArray(data, freq=freq) - self._data = data + # We are called by `union`, which is responsible for this validation + assert isinstance(other, type(self)) - else: - raise Exception("invalid pickle state") + if not is_dtype_equal(self.dtype, other.dtype): + this = self.astype("O") + other = other.astype("O") + return this._union(other, sort=sort) + + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) + i8result = i8self._union(i8other, sort=sort) + + res_name = get_op_result_name(self, other) + result = self._shallow_copy(np.asarray(i8result, dtype=np.int64), name=res_name) + return result - _unpickle_compat = __setstate__ + # ------------------------------------------------------------------------ + + def _apply_meta(self, rawarr): + if not isinstance(rawarr, PeriodIndex): + if not isinstance(rawarr, PeriodArray): + rawarr = PeriodArray(rawarr, freq=self.freq) + rawarr = PeriodIndex._simple_new(rawarr, name=self.name) + return rawarr def memory_usage(self, deep=False): result = super().memory_usage(deep=deep) @@ -868,10 +832,8 @@ def memory_usage(self, deep=False): return result -PeriodIndex._add_comparison_ops() PeriodIndex._add_numeric_methods_disabled() PeriodIndex._add_logical_methods_disabled() -PeriodIndex._add_datetimelike_methods() def period_range( @@ -907,7 +869,7 @@ def period_range( must be specified. To learn more about the frequency strings, please see `this link - `__. + `__. 
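The `intersection`, `difference` and `_union` implementations above keep matching-dtype operands in period space by running the set operation on `Int64Index` views of the ordinals (`asi8`) and re-wrapping the result, falling back to object dtype when the dtypes differ. For example, assuming a build with these changes:

    import pandas as pd

    a = pd.period_range("2020-01", periods=4, freq="M")
    b = pd.period_range("2020-03", periods=4, freq="M")
    print(a.intersection(b))  # 2020-03 and 2020-04, still dtype period[M]
    print(a.difference(b))    # 2020-01 and 2020-02
    print(a.union(b))         # six monthly periods, dtype preserved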
Examples -------- diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 6ad70841a48b0..22940f851ddb0 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Optional, Union +from typing import Any, Optional import warnings import numpy as np @@ -14,6 +14,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, ensure_python_int, + is_float, is_integer, is_integer_dtype, is_list_like, @@ -26,12 +27,14 @@ import pandas.core.common as com from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.base import _index_shared_docs, maybe_extract_name from pandas.core.indexes.numeric import Int64Index from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.io.formats.printing import pprint_thing +_empty_range = range(0) + class RangeIndex(Int64Index): """ @@ -85,10 +88,10 @@ def __new__( ): cls._validate_dtype(dtype) + name = maybe_extract_name(name, start, cls) # RangeIndex if isinstance(start, RangeIndex): - name = start.name if name is None else name start = start._range return cls._simple_new(start, dtype=dtype, name=name) @@ -111,7 +114,7 @@ def __new__( return cls._simple_new(rng, dtype=dtype, name=name) @classmethod - def from_range(cls, data, name=None, dtype=None): + def from_range(cls, data: range, name=None, dtype=None) -> "RangeIndex": """ Create RangeIndex from a range object. @@ -129,15 +132,10 @@ def from_range(cls, data, name=None, dtype=None): return cls._simple_new(data, dtype=dtype, name=name) @classmethod - def _simple_new(cls, values, name=None, dtype=None): + def _simple_new(cls, values: range, name=None, dtype=None) -> "RangeIndex": result = object.__new__(cls) - # handle passed None, non-integers - if values is None: - # empty - values = range(0, 0, 1) - elif not isinstance(values, range): - return Index(values, dtype=dtype, name=name) + assert isinstance(values, range) result._range = values result.name = name @@ -225,7 +223,7 @@ def _start(self): """ warnings.warn( self._deprecation_message.format("_start", "start"), - DeprecationWarning, + FutureWarning, stacklevel=2, ) return self.start @@ -248,7 +246,7 @@ def _stop(self): # GH 25710 warnings.warn( self._deprecation_message.format("_stop", "stop"), - DeprecationWarning, + FutureWarning, stacklevel=2, ) return self.stop @@ -272,7 +270,7 @@ def _step(self): # GH 25710 warnings.warn( self._deprecation_message.format("_step", "step"), - DeprecationWarning, + FutureWarning, stacklevel=2, ) return self.step @@ -334,7 +332,7 @@ def is_monotonic_decreasing(self) -> bool: def has_duplicates(self) -> bool: return False - def __contains__(self, key: Union[int, np.integer]) -> bool: + def __contains__(self, key: Any) -> bool: hash(key) try: key = ensure_python_int(key) @@ -344,12 +342,14 @@ def __contains__(self, key: Union[int, np.integer]) -> bool: @Appender(_index_shared_docs["get_loc"]) def get_loc(self, key, method=None, tolerance=None): - if is_integer(key) and method is None and tolerance is None: - new_key = int(key) - try: - return self._range.index(new_key) - except ValueError: - raise KeyError(key) + if method is None and tolerance is None: + if is_integer(key) or (is_float(key) and key.is_integer()): + new_key = int(key) + try: + return self._range.index(new_key) + except ValueError: + raise KeyError(key) + raise 
KeyError(key) return super().get_loc(key, method=method, tolerance=tolerance) @Appender(_index_shared_docs["get_indexer"]) @@ -421,7 +421,7 @@ def max(self, axis=None, skipna=True, *args, **kwargs): nv.validate_max(args, kwargs) return self._minmax("max") - def argsort(self, *args, **kwargs): + def argsort(self, *args, **kwargs) -> np.ndarray: """ Returns the indices that would sort the index and its underlying data. @@ -441,7 +441,7 @@ def argsort(self, *args, **kwargs): else: return np.arange(len(self) - 1, -1, -1) - def equals(self, other): + def equals(self, other) -> bool: """ Determines if two Index objects contain the same elements. """ @@ -479,7 +479,7 @@ def intersection(self, other, sort=False): return super().intersection(other, sort=sort) if not len(self) or not len(other): - return self._simple_new(None) + return self._simple_new(_empty_range) first = self._range[::-1] if self.step < 0 else self._range second = other._range[::-1] if other.step < 0 else other._range @@ -489,7 +489,7 @@ def intersection(self, other, sort=False): int_low = max(first.start, second.start) int_high = min(first.stop, second.stop) if int_high <= int_low: - return self._simple_new(None) + return self._simple_new(_empty_range) # Method hint: linear Diophantine equation # solve intersection problem @@ -499,7 +499,7 @@ def intersection(self, other, sort=False): # check whether element sets intersect if (first.start - second.start) % gcd: - return self._simple_new(None) + return self._simple_new(_empty_range) # calculate parameters for the RangeIndex describing the # intersection disregarding the lower bounds diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 889075ebe4e31..1dd5c065ec216 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -3,12 +3,11 @@ import numpy as np -from pandas._libs import NaT, Timedelta, index as libindex, join as libjoin, lib -from pandas.util._decorators import Appender, Substitution +from pandas._libs import NaT, Timedelta, index as libindex +from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( _TD_DTYPE, - ensure_int64, is_float, is_integer, is_list_like, @@ -17,22 +16,19 @@ is_timedelta64_ns_dtype, pandas_dtype, ) -from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.missing import isna from pandas.core.accessor import delegate_names from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays.timedeltas import TimedeltaArray, _is_convertible_to_td -from pandas.core.base import _shared_docs import pandas.core.common as com -from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name from pandas.core.indexes.datetimelike import ( DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, - ea_passthrough, + DatetimeTimedeltaMixin, ) -from pandas.core.indexes.numeric import Int64Index -from pandas.core.ops import get_op_result_name +from pandas.core.indexes.extension import inherit_names from pandas.tseries.frequencies import to_offset @@ -42,18 +38,28 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): # Some are "raw" methods, the result is not re-boxed in an Index # We also have a few "extra" attrs, which may or may not be raw, # which we don't want to expose in the .dt accessor. 
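Two `RangeIndex` behaviors are touched above: `get_loc` now also accepts floats that are whole numbers (anything else raises `KeyError` without falling through to the base class), and empty `intersection` results are built from the module-level `_empty_range` rather than from `None`. Illustratively, assuming a build with these changes:

    import pandas as pd

    idx = pd.RangeIndex(0, 10, 2)
    print(idx.get_loc(4))    # 2
    print(idx.get_loc(4.0))  # 2: integral floats are accepted
    try:
        idx.get_loc(4.5)
    except KeyError:
        print("non-integral floats raise KeyError")

    print(idx.intersection(pd.RangeIndex(1, 10, 2)))  # empty RangeIndex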
- _delegate_class = TimedeltaArray - _delegated_properties = TimedeltaArray._datetimelike_ops + ["components"] - _delegated_methods = TimedeltaArray._datetimelike_methods + [ - "_box_values", - "__neg__", - "__pos__", - "__abs__", - ] - _raw_properties = {"components"} - _raw_methods = {"to_pytimedelta"} + _raw_properties = {"components", "_box_func"} + _raw_methods = {"to_pytimedelta", "sum", "std", "median", "_format_native_types"} + + _delegated_properties = TimedeltaArray._datetimelike_ops + list(_raw_properties) + _delegated_methods = ( + TimedeltaArray._datetimelike_methods + + list(_raw_methods) + + ["_box_values", "__neg__", "__pos__", "__abs__"] + ) +@inherit_names( + [ + "_bool_ops", + "_object_ops", + "_field_ops", + "_datetimelike_ops", + "_datetimelike_methods", + "_other_ops", + ], + TimedeltaArray, +) @delegate_names( TimedeltaArray, TimedeltaDelegateMixin._delegated_properties, typ="property" ) @@ -64,7 +70,7 @@ class TimedeltaDelegateMixin(DatetimelikeDelegateMixin): overwrite=True, ) class TimedeltaIndex( - DatetimeIndexOpsMixin, dtl.TimelikeOps, Int64Index, TimedeltaDelegateMixin + DatetimeTimedeltaMixin, dtl.TimelikeOps, TimedeltaDelegateMixin, ): """ Immutable ndarray of timedelta64 data, represented internally as int64, and @@ -115,21 +121,10 @@ class TimedeltaIndex( Notes ----- To learn more about the frequency strings, please see `this link - `__. + `__. """ _typ = "timedeltaindex" - _join_precedence = 10 - - def _join_i8_wrapper(joinf, **kwargs): - return DatetimeIndexOpsMixin._join_i8_wrapper(joinf, dtype="m8[ns]", **kwargs) - - _inner_indexer = _join_i8_wrapper(libjoin.inner_join_indexer) - _outer_indexer = _join_i8_wrapper(libjoin.outer_join_indexer) - _left_indexer = _join_i8_wrapper(libjoin.left_join_indexer) - _left_indexer_unique = _join_i8_wrapper( - libjoin.left_join_indexer_unique, with_indexers=False - ) _engine_type = libindex.TimedeltaEngine @@ -138,18 +133,6 @@ def _join_i8_wrapper(joinf, **kwargs): _is_numeric_dtype = True _infer_as_myclass = True - _freq = None - - _bool_ops = TimedeltaArray._bool_ops - _object_ops = TimedeltaArray._object_ops - _field_ops = TimedeltaArray._field_ops - _datetimelike_ops = TimedeltaArray._datetimelike_ops - _datetimelike_methods = TimedeltaArray._datetimelike_methods - _other_ops = TimedeltaArray._other_ops - sum = ea_passthrough(TimedeltaArray.sum) - std = ea_passthrough(TimedeltaArray.std) - median = ea_passthrough(TimedeltaArray.median) - # ------------------------------------------------------------------- # Constructors @@ -163,6 +146,7 @@ def __new__( copy=False, name=None, ): + name = maybe_extract_name(name, data, cls) if is_scalar(data): raise TypeError( @@ -192,12 +176,13 @@ def __new__( tdarr = TimedeltaArray._from_sequence( data, freq=freq, unit=unit, dtype=dtype, copy=copy ) - return cls._simple_new(tdarr._data, freq=tdarr.freq, name=name) + return cls._simple_new(tdarr, name=name) @classmethod def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): # `dtype` is passed by _shallow_copy in corner cases, should always # be timedelta64[ns] if present + if not isinstance(values, TimedeltaArray): values = TimedeltaArray._simple_new(values, dtype=dtype, freq=freq) else: @@ -210,24 +195,13 @@ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): tdarr = TimedeltaArray._simple_new(values._data, freq=freq) result = object.__new__(cls) result._data = tdarr - result.name = name + result._name = name # For groupby perf. 
See note in indexes/base about _index_data result._index_data = tdarr._data result._reset_identity() return result - # ------------------------------------------------------------------- - - def __setstate__(self, state): - """Necessary for making this object picklable""" - if isinstance(state, dict): - super().__setstate__(state) - else: - raise Exception("invalid pickle state") - - _unpickle_compat = __setstate__ - # ------------------------------------------------------------------- # Rendering Methods @@ -237,33 +211,6 @@ def _formatter_func(self): return _get_format_timedelta64(self, box=True) - def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): - from pandas.io.formats.format import Timedelta64Formatter - - return np.asarray( - Timedelta64Formatter( - values=self, nat_rep=na_rep, justify="all" - ).get_result() - ) - - # ------------------------------------------------------------------- - # Wrapping TimedeltaArray - - # Compat for frequency inference, see GH#23789 - _is_monotonic_increasing = Index.is_monotonic_increasing - _is_monotonic_decreasing = Index.is_monotonic_decreasing - _is_unique = Index.is_unique - - @property - def _box_func(self): - return lambda x: Timedelta(x, unit="ns") - - def __getitem__(self, key): - result = self._data.__getitem__(key) - if is_scalar(result): - return result - return type(self)(result, name=self.name) - # ------------------------------------------------------------------- @Appender(_index_shared_docs["astype"]) @@ -279,145 +226,6 @@ def astype(self, dtype, copy=True): return Index(result.astype("i8"), name=self.name) return DatetimeIndexOpsMixin.astype(self, dtype, copy=copy) - def _union(self, other, sort): - if len(other) == 0 or self.equals(other) or len(self) == 0: - return super()._union(other, sort=sort) - - if not isinstance(other, TimedeltaIndex): - try: - other = TimedeltaIndex(other) - except (TypeError, ValueError): - pass - this, other = self, other - - if this._can_fast_union(other): - return this._fast_union(other) - else: - result = Index._union(this, other, sort=sort) - if isinstance(result, TimedeltaIndex): - if result.freq is None: - # TODO: find a less code-smelly way to set this - result._data._freq = to_offset(result.inferred_freq) - return result - - def join(self, other, how="left", level=None, return_indexers=False, sort=False): - """ - See Index.join - """ - if _is_convertible_to_index(other): - try: - other = TimedeltaIndex(other) - except (TypeError, ValueError): - pass - - return Index.join( - self, - other, - how=how, - level=level, - return_indexers=return_indexers, - sort=sort, - ) - - def intersection(self, other, sort=False): - """ - Specialized intersection for TimedeltaIndex objects. - May be much faster than Index.intersection - - Parameters - ---------- - other : TimedeltaIndex or array-like - sort : False or None, default False - Sort the resulting index if possible. - - .. versionadded:: 0.24.0 - - .. versionchanged:: 0.24.1 - - Changed the default to ``False`` to match the behaviour - from before 0.24.0. - - .. 
versionchanged:: 0.25.0 - - The `sort` keyword is added - - Returns - ------- - y : Index or TimedeltaIndex - """ - return super().intersection(other, sort=sort) - - @Appender(Index.difference.__doc__) - def difference(self, other, sort=None): - new_idx = super().difference(other, sort=sort) - # TODO: find a less code-smelly way to set this - new_idx._data._freq = None - return new_idx - - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - if ( - isinstance(other, TimedeltaIndex) - and self.freq == other.freq - and self._can_fast_union(other) - ): - joined = self._shallow_copy(joined, name=name) - return joined - else: - return self._simple_new(joined, name) - - def _can_fast_union(self, other): - if not isinstance(other, TimedeltaIndex): - return False - - freq = self.freq - - if freq is None or freq != other.freq: - return False - - if not self.is_monotonic or not other.is_monotonic: - return False - - if len(self) == 0 or len(other) == 0: - return True - - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other - else: - left, right = other, self - - right_start = right[0] - left_end = left[-1] - - # Only need to "adjoin", not overlap - return (right_start == left_end + freq) or right_start in left - - def _fast_union(self, other): - if len(other) == 0: - return self.view(type(self)) - - if len(self) == 0: - return other.view(type(self)) - - # to make our life easier, "sort" the two ranges - if self[0] <= other[0]: - left, right = self, other - else: - left, right = other, self - - left_end = left[-1] - right_end = right[-1] - - # concatenate - if left_end < right_end: - loc = right.searchsorted(left_end, side="right") - right_chunk = right.values[loc:] - dates = concat_compat((left.values, right_chunk)) - return self._shallow_copy(dates) - else: - return left - def _maybe_promote(self, other): if other.inferred_type == "timedelta": other = TimedeltaIndex(other) @@ -429,29 +237,22 @@ def get_value(self, series, key): know what you're doing """ - if _is_convertible_to_td(key): - key = Timedelta(key) - return self.get_value_maybe_box(series, key) - - try: - return com.maybe_box(self, Index.get_value(self, series, key), series, key) - except KeyError: + if isinstance(key, str): try: - loc = self._get_string_slice(key) - return series[loc] - except (TypeError, ValueError, KeyError): - pass - - try: - return self.get_value_maybe_box(series, key) - except (TypeError, ValueError, KeyError): + key = Timedelta(key) + except ValueError: raise KeyError(key) - def get_value_maybe_box(self, series, key): - if not isinstance(key, Timedelta): + if isinstance(key, self._data._recognized_scalars) or key is NaT: key = Timedelta(key) - values = self._engine.get_value(com.values_from_object(series), key) - return com.maybe_box(self, values, series, key) + return self.get_value_maybe_box(series, key) + + value = Index.get_value(self, series, key) + return com.maybe_box(self, value, series, key) + + def get_value_maybe_box(self, series, key: Timedelta): + loc = self.get_loc(key) + return self._get_values_for_loc(series, loc) def get_loc(self, key, method=None, tolerance=None): """ @@ -480,19 +281,7 @@ def get_loc(self, key, method=None, tolerance=None): key = Timedelta(key) return Index.get_loc(self, key, method, tolerance) - try: - return Index.get_loc(self, key, method, tolerance) - except (KeyError, ValueError, TypeError): - try: - return self._get_string_slice(key) - except (TypeError, KeyError, ValueError): - pass - - 
try: - stamp = Timedelta(key) - return Index.get_loc(self, stamp, method, tolerance) - except (KeyError, ValueError): - raise KeyError(key) + return Index.get_loc(self, key, method, tolerance) def _maybe_cast_slice_bound(self, label, side, kind): """ @@ -502,13 +291,13 @@ def _maybe_cast_slice_bound(self, label, side, kind): ---------- label : object side : {'left', 'right'} - kind : {'ix', 'loc', 'getitem'} + kind : {'loc', 'getitem'} or None Returns ------- label : object """ - assert kind in ["ix", "loc", "getitem", None] + assert kind in ["loc", "getitem", None] if isinstance(label, str): parsed = Timedelta(label) @@ -522,30 +311,12 @@ def _maybe_cast_slice_bound(self, label, side, kind): return label - def _get_string_slice(self, key): - if is_integer(key) or is_float(key) or key is NaT: - self._invalid_indexer("slice", key) - loc = self._partial_td_slice(key) - return loc - - def _partial_td_slice(self, key): - + def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): + # TODO: Check for non-True use_lhs/use_rhs + assert isinstance(key, str), type(key) # given a key, try to figure out a location for a partial slice - if not isinstance(key, str): - return key - raise NotImplementedError - @Substitution(klass="TimedeltaIndex") - @Appender(_shared_docs["searchsorted"]) - def searchsorted(self, value, side="left", sorter=None): - if isinstance(value, (np.ndarray, Index)): - value = np.array(value, dtype=_TD_DTYPE, copy=False) - else: - value = Timedelta(value).asm8.view(_TD_DTYPE) - - return self.values.searchsorted(value, side=side, sorter=sorter) - def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "timedelta" @@ -553,110 +324,8 @@ def is_type_compatible(self, typ) -> bool: def inferred_type(self) -> str: return "timedelta64" - @property - def is_all_dates(self) -> bool: - return True - - def insert(self, loc, item): - """ - Make new Index inserting new item at location - - Parameters - ---------- - loc : int - item : object - If not either a Python datetime or a numpy integer-like, returned - Index dtype will be object rather than datetime. - - Returns - ------- - new_index : Index - """ - # try to convert if possible - if _is_convertible_to_td(item): - try: - item = Timedelta(item) - except ValueError: - # e.g. str that can't be parsed to timedelta - pass - elif is_scalar(item) and isna(item): - # GH 18295 - item = self._na_value - - freq = None - if isinstance(item, Timedelta) or (is_scalar(item) and isna(item)): - - # check freq can be preserved on edge cases - if self.freq is not None: - if (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: - freq = self.freq - elif (loc == len(self)) and item - self.freq == self[-1]: - freq = self.freq - item = Timedelta(item).asm8.view(_TD_DTYPE) - - try: - new_tds = np.concatenate( - (self[:loc].asi8, [item.view(np.int64)], self[loc:].asi8) - ) - return self._shallow_copy(new_tds, freq=freq) - - except (AttributeError, TypeError): - # fall back to object index - if isinstance(item, str): - return self.astype(object).insert(loc, item) - raise TypeError("cannot insert TimedeltaIndex with incompatible label") - - def delete(self, loc): - """ - Make a new TimedeltaIndex with passed location(s) deleted. - - Parameters - ---------- - loc: int, slice or array of ints - Indicate which sub-arrays to remove. 
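After the simplification above, `TimedeltaIndex.get_loc` and `get_value` convert timedelta-like keys (including strings) to `Timedelta` once and then delegate to the base `Index` machinery, rather than retrying through several nested exception handlers. For example, assuming a build with these changes:

    import pandas as pd

    tdi = pd.timedelta_range("1 day", periods=4, freq="D")
    print(tdi.get_loc("2 days"))                # 1: the string is parsed to a Timedelta
    print(tdi.get_loc(pd.Timedelta(hours=72)))  # 2: 72 hours is 3 days

    ser = pd.Series(range(4), index=tdi)
    print(ser["2 days"])                        # 1: get_value takes the same path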
- - Returns - ------- - new_index : TimedeltaIndex - """ - new_tds = np.delete(self.asi8, loc) - - freq = "infer" - if is_integer(loc): - if loc in (0, -len(self), -1, len(self) - 1): - freq = self.freq - else: - if is_list_like(loc): - loc = lib.maybe_indices_to_slice(ensure_int64(np.array(loc)), len(self)) - if isinstance(loc, slice) and loc.step in (1, None): - if loc.start in (0, None) or loc.stop in (len(self), None): - freq = self.freq - - return TimedeltaIndex(new_tds, name=self.name, freq=freq) - - -TimedeltaIndex._add_comparison_ops() TimedeltaIndex._add_logical_methods_disabled() -TimedeltaIndex._add_datetimelike_methods() - - -def _is_convertible_to_index(other) -> bool: - """ - return a boolean whether I can attempt conversion to a TimedeltaIndex - """ - if isinstance(other, TimedeltaIndex): - return True - elif len(other) > 0 and other.inferred_type not in ( - "floating", - "mixed-integer", - "integer", - "integer-na", - "mixed-integer-float", - "mixed", - ): - return True - return False def timedelta_range( @@ -694,7 +363,7 @@ def timedelta_range( ``start`` and ``end`` (closed on both sides). To learn more about the frequency strings, please see `this link - `__. + `__. Examples -------- @@ -732,4 +401,4 @@ def timedelta_range( freq, freq_infer = dtl.maybe_infer_freq(freq) tdarr = TimedeltaArray._generate_range(start, end, periods, freq, closed=closed) - return TimedeltaIndex._simple_new(tdarr._data, freq=tdarr.freq, name=name) + return TimedeltaIndex._simple_new(tdarr, name=name) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index b86293e78a80d..63a86792082da 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,4 +1,4 @@ -from typing import Tuple +from typing import Hashable, List, Tuple, Union import numpy as np @@ -22,20 +22,12 @@ from pandas.core.dtypes.missing import _infer_fill_value, isna import pandas.core.common as com -from pandas.core.indexers import is_list_like_indexer, length_of_indexer -from pandas.core.indexes.api import Index, InvalidIndexError - - -# the supported indexers -def get_indexers_list(): - - return [ - ("iloc", _iLocIndexer), - ("loc", _LocIndexer), - ("at", _AtIndexer), - ("iat", _iAtIndexer), - ] - +from pandas.core.indexers import ( + check_bool_array_indexer, + is_list_like_indexer, + length_of_indexer, +) +from pandas.core.indexes.api import Index # "null slice" _NS = slice(None, None) @@ -94,6 +86,486 @@ class IndexingError(Exception): pass +class IndexingMixin: + """Mixin for adding .loc/.iloc/.at/.iat to Datafames and Series. + """ + + @property + def iloc(self) -> "_iLocIndexer": + """ + Purely integer-location based indexing for selection by position. + + ``.iloc[]`` is primarily integer position based (from ``0`` to + ``length-1`` of the axis), but may also be used with a boolean + array. + + Allowed inputs are: + + - An integer, e.g. ``5``. + - A list or array of integers, e.g. ``[4, 3, 0]``. + - A slice object with ints, e.g. ``1:7``. + - A boolean array. + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above). + This is useful in method chains, when you don't have a reference to the + calling object, but would like to base your selection on some value. + + ``.iloc`` will raise ``IndexError`` if a requested indexer is + out-of-bounds, except *slice* indexers which allow out-of-bounds + indexing (this conforms with python/numpy *slice* semantics). + + See more at :ref:`Selection by Position `. 
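With the last hunk of `timedeltas.py` above, `timedelta_range` hands the generated `TimedeltaArray` to `_simple_new` whole instead of unpacking its `_data` and `freq`; the public signature and output are unchanged. A quick usage reminder:

    import pandas as pd

    tdi = pd.timedelta_range(start="1 day", periods=4, freq="12H")
    print(tdi)  # four values spaced 12 hours apart, freq='12H'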
+ + See Also + -------- + DataFrame.iat : Fast integer location scalar accessor. + DataFrame.loc : Purely label-location based indexer for selection by label. + Series.iloc : Purely integer-location based indexing for + selection by position. + + Examples + -------- + + >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, + ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, + ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }] + >>> df = pd.DataFrame(mydict) + >>> df + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + 2 1000 2000 3000 4000 + + **Indexing just the rows** + + With a scalar integer. + + >>> type(df.iloc[0]) + + >>> df.iloc[0] + a 1 + b 2 + c 3 + d 4 + Name: 0, dtype: int64 + + With a list of integers. + + >>> df.iloc[[0]] + a b c d + 0 1 2 3 4 + >>> type(df.iloc[[0]]) + + + >>> df.iloc[[0, 1]] + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + + With a `slice` object. + + >>> df.iloc[:3] + a b c d + 0 1 2 3 4 + 1 100 200 300 400 + 2 1000 2000 3000 4000 + + With a boolean mask the same length as the index. + + >>> df.iloc[[True, False, True]] + a b c d + 0 1 2 3 4 + 2 1000 2000 3000 4000 + + With a callable, useful in method chains. The `x` passed + to the ``lambda`` is the DataFrame being sliced. This selects + the rows whose index label even. + + >>> df.iloc[lambda x: x.index % 2 == 0] + a b c d + 0 1 2 3 4 + 2 1000 2000 3000 4000 + + **Indexing both axes** + + You can mix the indexer types for the index and columns. Use ``:`` to + select the entire axis. + + With scalar integers. + + >>> df.iloc[0, 1] + 2 + + With lists of integers. + + >>> df.iloc[[0, 2], [1, 3]] + b d + 0 2 4 + 2 2000 4000 + + With `slice` objects. + + >>> df.iloc[1:3, 0:3] + a b c + 1 100 200 300 + 2 1000 2000 3000 + + With a boolean array whose length matches the columns. + + >>> df.iloc[:, [True, False, True, False]] + a c + 0 1 3 + 1 100 300 + 2 1000 3000 + + With a callable function that expects the Series or DataFrame. + + >>> df.iloc[:, lambda df: [0, 2]] + a c + 0 1 3 + 1 100 300 + 2 1000 3000 + """ + return _iLocIndexer("iloc", self) + + @property + def loc(self) -> "_LocIndexer": + """ + Access a group of rows and columns by label(s) or a boolean array. + + ``.loc[]`` is primarily label based, but may also be used with a + boolean array. + + Allowed inputs are: + + - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is + interpreted as a *label* of the index, and **never** as an + integer position along the index). + - A list or array of labels, e.g. ``['a', 'b', 'c']``. + - A slice object with labels, e.g. ``'a':'f'``. + + .. warning:: Note that contrary to usual python slices, **both** the + start and the stop are included + + - A boolean array of the same length as the axis being sliced, + e.g. ``[True, False, True]``. + - A ``callable`` function with one argument (the calling Series or + DataFrame) and that returns valid output for indexing (one of the above) + + See more at :ref:`Selection by Label ` + + Raises + ------ + KeyError + If any items are not found. + + See Also + -------- + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.iloc : Access group of rows and columns by integer position(s). + DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the + Series/DataFrame. + Series.loc : Access group of values using labels. + + Examples + -------- + **Getting values** + + >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], + ... index=['cobra', 'viper', 'sidewinder'], + ... 
columns=['max_speed', 'shield']) + >>> df + max_speed shield + cobra 1 2 + viper 4 5 + sidewinder 7 8 + + Single label. Note this returns the row as a Series. + + >>> df.loc['viper'] + max_speed 4 + shield 5 + Name: viper, dtype: int64 + + List of labels. Note using ``[[]]`` returns a DataFrame. + + >>> df.loc[['viper', 'sidewinder']] + max_speed shield + viper 4 5 + sidewinder 7 8 + + Single label for row and column + + >>> df.loc['cobra', 'shield'] + 2 + + Slice with labels for row and single label for column. As mentioned + above, note that both the start and stop of the slice are included. + + >>> df.loc['cobra':'viper', 'max_speed'] + cobra 1 + viper 4 + Name: max_speed, dtype: int64 + + Boolean list with the same length as the row axis + + >>> df.loc[[False, False, True]] + max_speed shield + sidewinder 7 8 + + Conditional that returns a boolean Series + + >>> df.loc[df['shield'] > 6] + max_speed shield + sidewinder 7 8 + + Conditional that returns a boolean Series with column labels specified + + >>> df.loc[df['shield'] > 6, ['max_speed']] + max_speed + sidewinder 7 + + Callable that returns a boolean Series + + >>> df.loc[lambda df: df['shield'] == 8] + max_speed shield + sidewinder 7 8 + + **Setting values** + + Set value for all items matching the list of labels + + >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50 + >>> df + max_speed shield + cobra 1 2 + viper 4 50 + sidewinder 7 50 + + Set value for an entire row + + >>> df.loc['cobra'] = 10 + >>> df + max_speed shield + cobra 10 10 + viper 4 50 + sidewinder 7 50 + + Set value for an entire column + + >>> df.loc[:, 'max_speed'] = 30 + >>> df + max_speed shield + cobra 30 10 + viper 30 50 + sidewinder 30 50 + + Set value for rows matching callable condition + + >>> df.loc[df['shield'] > 35] = 0 + >>> df + max_speed shield + cobra 30 10 + viper 0 0 + sidewinder 0 0 + + **Getting values on a DataFrame with an index that has integer labels** + + Another example using integers for the index + + >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], + ... index=[7, 8, 9], columns=['max_speed', 'shield']) + >>> df + max_speed shield + 7 1 2 + 8 4 5 + 9 7 8 + + Slice with integer labels for rows. As mentioned above, note that both + the start and stop of the slice are included. + + >>> df.loc[7:9] + max_speed shield + 7 1 2 + 8 4 5 + 9 7 8 + + **Getting values with a MultiIndex** + + A number of examples using a DataFrame with a MultiIndex + + >>> tuples = [ + ... ('cobra', 'mark i'), ('cobra', 'mark ii'), + ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), + ... ('viper', 'mark ii'), ('viper', 'mark iii') + ... ] + >>> index = pd.MultiIndex.from_tuples(tuples) + >>> values = [[12, 2], [0, 4], [10, 20], + ... [1, 4], [7, 1], [16, 36]] + >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) + >>> df + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + mark iii 16 36 + + Single label. Note this returns a DataFrame with a single index. + + >>> df.loc['cobra'] + max_speed shield + mark i 12 2 + mark ii 0 4 + + Single index tuple. Note this returns a Series. + + >>> df.loc[('cobra', 'mark ii')] + max_speed 0 + shield 4 + Name: (cobra, mark ii), dtype: int64 + + Single label for row and column. Similar to passing in a tuple, this + returns a Series. + + >>> df.loc['cobra', 'mark i'] + max_speed 12 + shield 2 + Name: (cobra, mark i), dtype: int64 + + Single tuple. Note using ``[[]]`` returns a DataFrame. 
+ + >>> df.loc[[('cobra', 'mark ii')]] + max_speed shield + cobra mark ii 0 4 + + Single tuple for the index with a single label for the column + + >>> df.loc[('cobra', 'mark i'), 'shield'] + 2 + + Slice from index tuple to single label + + >>> df.loc[('cobra', 'mark i'):'viper'] + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + mark iii 16 36 + + Slice from index tuple to index tuple + + >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')] + max_speed shield + cobra mark i 12 2 + mark ii 0 4 + sidewinder mark i 10 20 + mark ii 1 4 + viper mark ii 7 1 + """ + return _LocIndexer("loc", self) + + @property + def at(self) -> "_AtIndexer": + """ + Access a single value for a row/column label pair. + + Similar to ``loc``, in that both provide label-based lookups. Use + ``at`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + KeyError + If 'label' does not exist in DataFrame. + + See Also + -------- + DataFrame.iat : Access a single value for a row/column pair by integer + position. + DataFrame.loc : Access a group of rows and columns by label(s). + Series.at : Access a single value using a label. + + Examples + -------- + >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... index=[4, 5, 6], columns=['A', 'B', 'C']) + >>> df + A B C + 4 0 2 3 + 5 0 4 1 + 6 10 20 30 + + Get value at specified row/column pair + + >>> df.at[4, 'B'] + 2 + + Set value at specified row/column pair + + >>> df.at[4, 'B'] = 10 + >>> df.at[4, 'B'] + 10 + + Get value within a Series + + >>> df.loc[5].at['B'] + 4 + """ + return _AtIndexer("at", self) + + @property + def iat(self) -> "_iAtIndexer": + """ + Access a single value for a row/column pair by integer position. + + Similar to ``iloc``, in that both provide integer-based lookups. Use + ``iat`` if you only need to get or set a single value in a DataFrame + or Series. + + Raises + ------ + IndexError + When integer position is out of bounds. + + See Also + -------- + DataFrame.at : Access a single value for a row/column label pair. + DataFrame.loc : Access a group of rows and columns by label(s). + DataFrame.iloc : Access a group of rows and columns by integer position(s). + + Examples + -------- + >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], + ... columns=['A', 'B', 'C']) + >>> df + A B C + 0 0 2 3 + 1 0 4 1 + 2 10 20 30 + + Get value at specified row/column pair + + >>> df.iat[1, 2] + 1 + + Set value at specified row/column pair + + >>> df.iat[1, 2] = 10 + >>> df.iat[1, 2] + 10 + + Get value within a series + + >>> df.loc[0].iat[1] + 2 + """ + return _iAtIndexer("iat", self) + + class _NDFrameIndexer(_NDFrameIndexerBase): _valid_types: str axis = None @@ -107,39 +579,6 @@ def __call__(self, axis=None): new_self.axis = axis return new_self - # TODO: remove once geopandas no longer needs this - def __getitem__(self, key): - # Used in ix and downstream in geopandas _CoordinateIndexer - if type(key) is tuple: - # Note: we check the type exactly instead of with isinstance - # because NamedTuple is checked separately. - key = tuple(com.apply_if_callable(x, self.obj) for x in key) - try: - values = self.obj._get_value(*key) - except (KeyError, TypeError, InvalidIndexError, AttributeError): - # TypeError occurs here if the key has non-hashable entries, - # generally slice or list. - # TODO(ix): most/all of the TypeError cases here are for ix, - # so this check can be removed once ix is removed. 
- # The InvalidIndexError is only catched for compatibility - # with geopandas, see - # https://github.com/pandas-dev/pandas/issues/27258 - # TODO: The AttributeError is for IntervalIndex which - # incorrectly implements get_value, see - # https://github.com/pandas-dev/pandas/issues/27865 - pass - else: - if is_scalar(values): - return values - - return self._getitem_tuple(key) - else: - # we by definition only have the 0th axis - axis = self.axis or 0 - - key = com.apply_if_callable(key, self.obj) - return self._getitem_axis(key, axis=axis) - def _get_label(self, label, axis: int): if self.ndim == 1: # for perf reasons we want to try _xs first @@ -180,9 +619,8 @@ def _get_setitem_indexer(self, key): if isinstance(key, range): return list(key) - axis = self.axis or 0 try: - return self._convert_to_indexer(key, axis=axis) + return self._convert_to_indexer(key, axis=0) except TypeError as e: # invalid indexer type vs 'other' indexing errors @@ -868,9 +1306,6 @@ def _multi_take(self, tup: Tuple): } return o._reindex_with_indexers(d, copy=True, allow_dups=True) - def _convert_for_reindex(self, key, axis: int): - return key - def _handle_lowerdim_multi_index_axis0(self, tup: Tuple): # we have an axis0 multi-index, handle or raise axis = self.axis or 0 @@ -991,42 +1426,6 @@ def _getitem_nested_tuple(self, tup: Tuple): return obj - # TODO: remove once geopandas no longer needs __getitem__ - def _getitem_axis(self, key, axis: int): - if is_iterator(key): - key = list(key) - self._validate_key(key, axis) - - labels = self.obj._get_axis(axis) - if isinstance(key, slice): - return self._get_slice_axis(key, axis=axis) - elif is_list_like_indexer(key) and not ( - isinstance(key, tuple) and isinstance(labels, ABCMultiIndex) - ): - - if hasattr(key, "ndim") and key.ndim > 1: - raise ValueError("Cannot index with multidimensional key") - - return self._getitem_iterable(key, axis=axis) - else: - - # maybe coerce a float scalar to integer - key = labels._maybe_cast_indexer(key) - - if is_integer(key): - if axis == 0 and isinstance(labels, ABCMultiIndex): - try: - return self._get_label(key, axis=axis) - except (KeyError, TypeError): - if self.obj.index.levels[0].is_integer(): - raise - - # this is the fallback! (for a non-float, non-integer index) - if not labels.is_floating() and not labels.is_integer(): - return self._get_loc(key, axis=axis) - - return self._get_label(key, axis=axis) - def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): """ Transform a list-like of keys into a new index and an indexer. 
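A minimal, self-contained sketch (not part of the patch) exercising the four accessors whose docstrings now live on ``IndexingMixin`` above; it uses only public pandas API, so the behaviour shown is expected to be unchanged by this refactor.

import pandas as pd

df = pd.DataFrame(
    [[1, 2], [4, 5], [7, 8]],
    index=["cobra", "viper", "sidewinder"],
    columns=["max_speed", "shield"],
)

# label-based and position-based selection return the same scalar
assert df.loc["viper", "shield"] == 5
assert df.iloc[1, 1] == 5

# the fast scalar accessors agree with them
assert df.at["viper", "shield"] == 5
assert df.iat[1, 1] == 5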
@@ -1067,18 +1466,12 @@ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): return ax[indexer], indexer if ax.is_unique and not getattr(ax, "is_overlapping", False): - # If we are trying to get actual keys from empty Series, we - # patiently wait for a KeyError later on - otherwise, convert - if len(ax) or not len(key): - key = self._convert_for_reindex(key, axis) indexer = ax.get_indexer_for(key) keyarr = ax.reindex(keyarr)[0] else: keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr) - self._validate_read_indexer( - keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing - ) + self._validate_read_indexer(keyarr, indexer, axis, raise_missing=raise_missing) return keyarr, indexer def _getitem_iterable(self, key, axis: int): @@ -1278,13 +1671,16 @@ def _get_slice_axis(self, slice_obj: slice, axis: int): class _LocationIndexer(_NDFrameIndexer): + _takeable: bool = False + def __getitem__(self, key): if type(key) is tuple: key = tuple(com.apply_if_callable(x, self.obj) for x in key) if self._is_scalar_access(key): try: - return self._getitem_scalar(key) + return self.obj._get_value(*key, takeable=self._takeable) except (KeyError, IndexError, AttributeError): + # AttributeError for IntervalTree get_value pass return self._getitem_tuple(key) else: @@ -1297,9 +1693,6 @@ def __getitem__(self, key): def _is_scalar_access(self, key: Tuple): raise NotImplementedError() - def _getitem_scalar(self, key): - raise NotImplementedError() - def _getitem_axis(self, key, axis: int): raise NotImplementedError() @@ -1332,244 +1725,8 @@ def _get_slice_axis(self, slice_obj: slice, axis: int): return self.obj.take(indexer, axis=axis) +@Appender(IndexingMixin.loc.__doc__) class _LocIndexer(_LocationIndexer): - """ - Access a group of rows and columns by label(s) or a boolean array. - - ``.loc[]`` is primarily label based, but may also be used with a - boolean array. - - Allowed inputs are: - - - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is - interpreted as a *label* of the index, and **never** as an - integer position along the index). - - A list or array of labels, e.g. ``['a', 'b', 'c']``. - - A slice object with labels, e.g. ``'a':'f'``. - - .. warning:: Note that contrary to usual python slices, **both** the - start and the stop are included - - - A boolean array of the same length as the axis being sliced, - e.g. ``[True, False, True]``. - - A ``callable`` function with one argument (the calling Series or - DataFrame) and that returns valid output for indexing (one of the above) - - See more at :ref:`Selection by Label ` - - Raises - ------ - KeyError - If any items are not found. - - See Also - -------- - DataFrame.at : Access a single value for a row/column label pair. - DataFrame.iloc : Access group of rows and columns by integer position(s). - DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the - Series/DataFrame. - Series.loc : Access group of values using labels. - - Examples - -------- - **Getting values** - - >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=['cobra', 'viper', 'sidewinder'], - ... columns=['max_speed', 'shield']) - >>> df - max_speed shield - cobra 1 2 - viper 4 5 - sidewinder 7 8 - - Single label. Note this returns the row as a Series. - - >>> df.loc['viper'] - max_speed 4 - shield 5 - Name: viper, dtype: int64 - - List of labels. Note using ``[[]]`` returns a DataFrame. 
- - >>> df.loc[['viper', 'sidewinder']] - max_speed shield - viper 4 5 - sidewinder 7 8 - - Single label for row and column - - >>> df.loc['cobra', 'shield'] - 2 - - Slice with labels for row and single label for column. As mentioned - above, note that both the start and stop of the slice are included. - - >>> df.loc['cobra':'viper', 'max_speed'] - cobra 1 - viper 4 - Name: max_speed, dtype: int64 - - Boolean list with the same length as the row axis - - >>> df.loc[[False, False, True]] - max_speed shield - sidewinder 7 8 - - Conditional that returns a boolean Series - - >>> df.loc[df['shield'] > 6] - max_speed shield - sidewinder 7 8 - - Conditional that returns a boolean Series with column labels specified - - >>> df.loc[df['shield'] > 6, ['max_speed']] - max_speed - sidewinder 7 - - Callable that returns a boolean Series - - >>> df.loc[lambda df: df['shield'] == 8] - max_speed shield - sidewinder 7 8 - - **Setting values** - - Set value for all items matching the list of labels - - >>> df.loc[['viper', 'sidewinder'], ['shield']] = 50 - >>> df - max_speed shield - cobra 1 2 - viper 4 50 - sidewinder 7 50 - - Set value for an entire row - - >>> df.loc['cobra'] = 10 - >>> df - max_speed shield - cobra 10 10 - viper 4 50 - sidewinder 7 50 - - Set value for an entire column - - >>> df.loc[:, 'max_speed'] = 30 - >>> df - max_speed shield - cobra 30 10 - viper 30 50 - sidewinder 30 50 - - Set value for rows matching callable condition - - >>> df.loc[df['shield'] > 35] = 0 - >>> df - max_speed shield - cobra 30 10 - viper 0 0 - sidewinder 0 0 - - **Getting values on a DataFrame with an index that has integer labels** - - Another example using integers for the index - - >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], - ... index=[7, 8, 9], columns=['max_speed', 'shield']) - >>> df - max_speed shield - 7 1 2 - 8 4 5 - 9 7 8 - - Slice with integer labels for rows. As mentioned above, note that both - the start and stop of the slice are included. - - >>> df.loc[7:9] - max_speed shield - 7 1 2 - 8 4 5 - 9 7 8 - - **Getting values with a MultiIndex** - - A number of examples using a DataFrame with a MultiIndex - - >>> tuples = [ - ... ('cobra', 'mark i'), ('cobra', 'mark ii'), - ... ('sidewinder', 'mark i'), ('sidewinder', 'mark ii'), - ... ('viper', 'mark ii'), ('viper', 'mark iii') - ... ] - >>> index = pd.MultiIndex.from_tuples(tuples) - >>> values = [[12, 2], [0, 4], [10, 20], - ... [1, 4], [7, 1], [16, 36]] - >>> df = pd.DataFrame(values, columns=['max_speed', 'shield'], index=index) - >>> df - max_speed shield - cobra mark i 12 2 - mark ii 0 4 - sidewinder mark i 10 20 - mark ii 1 4 - viper mark ii 7 1 - mark iii 16 36 - - Single label. Note this returns a DataFrame with a single index. - - >>> df.loc['cobra'] - max_speed shield - mark i 12 2 - mark ii 0 4 - - Single index tuple. Note this returns a Series. - - >>> df.loc[('cobra', 'mark ii')] - max_speed 0 - shield 4 - Name: (cobra, mark ii), dtype: int64 - - Single label for row and column. Similar to passing in a tuple, this - returns a Series. - - >>> df.loc['cobra', 'mark i'] - max_speed 12 - shield 2 - Name: (cobra, mark i), dtype: int64 - - Single tuple. Note using ``[[]]`` returns a DataFrame. 
- - >>> df.loc[[('cobra', 'mark ii')]] - max_speed shield - cobra mark ii 0 4 - - Single tuple for the index with a single label for the column - - >>> df.loc[('cobra', 'mark i'), 'shield'] - 2 - - Slice from index tuple to single label - - >>> df.loc[('cobra', 'mark i'):'viper'] - max_speed shield - cobra mark i 12 2 - mark ii 0 4 - sidewinder mark i 10 20 - mark ii 1 4 - viper mark ii 7 1 - mark iii 16 36 - - Slice from index tuple to index tuple - - >>> df.loc[('cobra', 'mark i'):('viper', 'mark ii')] - max_speed shield - cobra mark i 12 2 - mark ii 0 4 - sidewinder mark i 10 20 - mark ii 1 4 - viper mark ii 7 1 - """ - _valid_types = ( "labels (MUST BE IN THE INDEX), slices of labels (BOTH " "endpoints included! Can be slices of integers if the " @@ -1624,12 +1781,6 @@ def _is_scalar_access(self, key: Tuple) -> bool: return True - def _getitem_scalar(self, key): - # a fast-path to scalar access - # if not, raise - values = self.obj._get_value(*key) - return values - def _get_partial_string_timestamp_match_key(self, key, labels): """ Translate any partial string timestamp matches in key, returning the @@ -1728,147 +1879,14 @@ def _getitem_axis(self, key, axis: int): return self._get_label(key, axis=axis) +@Appender(IndexingMixin.iloc.__doc__) class _iLocIndexer(_LocationIndexer): - """ - Purely integer-location based indexing for selection by position. - - ``.iloc[]`` is primarily integer position based (from ``0`` to - ``length-1`` of the axis), but may also be used with a boolean - array. - - Allowed inputs are: - - - An integer, e.g. ``5``. - - A list or array of integers, e.g. ``[4, 3, 0]``. - - A slice object with ints, e.g. ``1:7``. - - A boolean array. - - A ``callable`` function with one argument (the calling Series or - DataFrame) and that returns valid output for indexing (one of the above). - This is useful in method chains, when you don't have a reference to the - calling object, but would like to base your selection on some value. - - ``.iloc`` will raise ``IndexError`` if a requested indexer is - out-of-bounds, except *slice* indexers which allow out-of-bounds - indexing (this conforms with python/numpy *slice* semantics). - - See more at :ref:`Selection by Position `. - - See Also - -------- - DataFrame.iat : Fast integer location scalar accessor. - DataFrame.loc : Purely label-location based indexer for selection by label. - Series.iloc : Purely integer-location based indexing for - selection by position. - - Examples - -------- - - >>> mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4}, - ... {'a': 100, 'b': 200, 'c': 300, 'd': 400}, - ... {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }] - >>> df = pd.DataFrame(mydict) - >>> df - a b c d - 0 1 2 3 4 - 1 100 200 300 400 - 2 1000 2000 3000 4000 - - **Indexing just the rows** - - With a scalar integer. - - >>> type(df.iloc[0]) - - >>> df.iloc[0] - a 1 - b 2 - c 3 - d 4 - Name: 0, dtype: int64 - - With a list of integers. - - >>> df.iloc[[0]] - a b c d - 0 1 2 3 4 - >>> type(df.iloc[[0]]) - - - >>> df.iloc[[0, 1]] - a b c d - 0 1 2 3 4 - 1 100 200 300 400 - - With a `slice` object. - - >>> df.iloc[:3] - a b c d - 0 1 2 3 4 - 1 100 200 300 400 - 2 1000 2000 3000 4000 - - With a boolean mask the same length as the index. - - >>> df.iloc[[True, False, True]] - a b c d - 0 1 2 3 4 - 2 1000 2000 3000 4000 - - With a callable, useful in method chains. The `x` passed - to the ``lambda`` is the DataFrame being sliced. This selects - the rows whose index label even. 
- - >>> df.iloc[lambda x: x.index % 2 == 0] - a b c d - 0 1 2 3 4 - 2 1000 2000 3000 4000 - - **Indexing both axes** - - You can mix the indexer types for the index and columns. Use ``:`` to - select the entire axis. - - With scalar integers. - - >>> df.iloc[0, 1] - 2 - - With lists of integers. - - >>> df.iloc[[0, 2], [1, 3]] - b d - 0 2 4 - 2 2000 4000 - - With `slice` objects. - - >>> df.iloc[1:3, 0:3] - a b c - 1 100 200 300 - 2 1000 2000 3000 - - With a boolean array whose length matches the columns. - - >>> df.iloc[:, [True, False, True, False]] - a c - 0 1 3 - 1 100 300 - 2 1000 3000 - - With a callable function that expects the Series or DataFrame. - - >>> df.iloc[:, lambda df: [0, 2]] - a c - 0 1 3 - 1 100 300 - 2 1000 3000 - """ - _valid_types = ( "integer, integer slice (START point is INCLUDED, END " "point is EXCLUDED), listlike of integers, boolean array" ) _get_slice_axis = _NDFrameIndexer._get_slice_axis + _takeable = True def _validate_key(self, key, axis: int): if com.is_bool_indexer(key): @@ -1933,12 +1951,6 @@ def _is_scalar_access(self, key: Tuple) -> bool: return True - def _getitem_scalar(self, key): - # a fast-path to scalar access - # if not, raise - values = self.obj._get_value(*key, takeable=True) - return values - def _validate_integer(self, key: int, axis: int) -> None: """ Check that 'key' is a valid position in the desired axis. @@ -2091,53 +2103,8 @@ def __setitem__(self, key, value): self.obj._set_value(*key, takeable=self._takeable) +@Appender(IndexingMixin.at.__doc__) class _AtIndexer(_ScalarAccessIndexer): - """ - Access a single value for a row/column label pair. - - Similar to ``loc``, in that both provide label-based lookups. Use - ``at`` if you only need to get or set a single value in a DataFrame - or Series. - - Raises - ------ - KeyError - If 'label' does not exist in DataFrame. - - See Also - -------- - DataFrame.iat : Access a single value for a row/column pair by integer - position. - DataFrame.loc : Access a group of rows and columns by label(s). - Series.at : Access a single value using a label. - - Examples - -------- - >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], - ... index=[4, 5, 6], columns=['A', 'B', 'C']) - >>> df - A B C - 4 0 2 3 - 5 0 4 1 - 6 10 20 30 - - Get value at specified row/column pair - - >>> df.at[4, 'B'] - 2 - - Set value at specified row/column pair - - >>> df.at[4, 'B'] = 10 - >>> df.at[4, 'B'] - 10 - - Get value within a Series - - >>> df.loc[5].at['B'] - 4 - """ - _takeable = False def _convert_key(self, key, is_setter: bool = False): @@ -2166,52 +2133,8 @@ def _convert_key(self, key, is_setter: bool = False): return key +@Appender(IndexingMixin.iat.__doc__) class _iAtIndexer(_ScalarAccessIndexer): - """ - Access a single value for a row/column pair by integer position. - - Similar to ``iloc``, in that both provide integer-based lookups. Use - ``iat`` if you only need to get or set a single value in a DataFrame - or Series. - - Raises - ------ - IndexError - When integer position is out of bounds. - - See Also - -------- - DataFrame.at : Access a single value for a row/column label pair. - DataFrame.loc : Access a group of rows and columns by label(s). - DataFrame.iloc : Access a group of rows and columns by integer position(s). - - Examples - -------- - >>> df = pd.DataFrame([[0, 2, 3], [0, 4, 1], [10, 20, 30]], - ... 
columns=['A', 'B', 'C']) - >>> df - A B C - 0 0 2 3 - 1 0 4 1 - 2 10 20 30 - - Get value at specified row/column pair - - >>> df.iat[1, 2] - 1 - - Set value at specified row/column pair - - >>> df.iat[1, 2] = 10 - >>> df.iat[1, 2] - 10 - - Get value within a series - - >>> df.loc[0].iat[1] - 2 - """ - _takeable = True def _convert_key(self, key, is_setter: bool = False): @@ -2224,7 +2147,7 @@ def _convert_key(self, key, is_setter: bool = False): return key -def _tuplify(ndim: int, loc) -> tuple: +def _tuplify(ndim: int, loc: Hashable) -> Tuple[Union[Hashable, slice], ...]: """ Given an indexer for the first dimension, create an equivalent tuple for indexing over all dimensions. @@ -2238,9 +2161,10 @@ def _tuplify(ndim: int, loc) -> tuple: ------- tuple """ - tup = [slice(None, None) for _ in range(ndim)] - tup[0] = loc - return tuple(tup) + _tup: List[Union[Hashable, slice]] + _tup = [slice(None, None) for _ in range(ndim)] + _tup[0] = loc + return tuple(_tup) def convert_to_index_sliceable(obj, key): @@ -2308,13 +2232,7 @@ def check_bool_indexer(index: Index, key) -> np.ndarray: else: if is_sparse(result): result = result.to_dense() - result = np.asarray(result, dtype=bool) - - # GH26658 - if len(result) != len(index): - raise IndexError( - f"Item wrong length {len(result)} instead of {len(index)}." - ) + result = check_bool_array_indexer(index, result) return result diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index 8ac0df2fa4e0a..37a3405554745 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,4 +1,4 @@ -from .blocks import ( # noqa: F401 +from pandas.core.internals.blocks import ( # io.pytables, io.packers Block, BoolBlock, CategoricalBlock, @@ -10,19 +10,38 @@ IntBlock, ObjectBlock, TimeDeltaBlock, + _block_shape, + _safe_reshape, + make_block, ) -from .managers import ( # noqa: F401 +from pandas.core.internals.managers import ( BlockManager, SingleBlockManager, - create_block_manager_from_arrays, - create_block_manager_from_blocks, -) - -from .blocks import _safe_reshape # noqa: F401; io.packers -from .blocks import make_block # noqa: F401; io.pytables, io.packers -from .managers import ( # noqa: F401; reshape.concat, reshape.merge _transform_index, concatenate_block_managers, + create_block_manager_from_arrays, + create_block_manager_from_blocks, ) -from .blocks import _block_shape # noqa:F401; io.pytables +__all__ = [ + "Block", + "BoolBlock", + "CategoricalBlock", + "ComplexBlock", + "DatetimeBlock", + "DatetimeTZBlock", + "ExtensionBlock", + "FloatBlock", + "IntBlock", + "ObjectBlock", + "TimeDeltaBlock", + "_safe_reshape", + "make_block", + "_block_shape", + "BlockManager", + "SingleBlockManager", + "_transform_index", + "concatenate_block_managers", + "create_block_manager_from_arrays", + "create_block_manager_from_blocks", +] diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index eb5b5181d894d..cb702a81d2bde 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -192,13 +192,19 @@ def is_categorical_astype(self, dtype): return False - def external_values(self, dtype=None): - """ return an outside world format, currently just the ndarray """ + def external_values(self): + """ + The array that Series.values returns (public attribute). + This has some historical constraints, and is overridden in block + subclasses to return the correct array (e.g. 
period returns + object ndarray and datetimetz a datetime64[ns] ndarray instead of + proper extension array). + """ return self.values - def internal_values(self, dtype=None): - """ return an internal format, currently just the ndarray - this should be the pure internal API format + def internal_values(self): + """ + The array that Series._values returns (internal values). """ return self.values @@ -242,7 +248,7 @@ def array_dtype(self): """ return self.dtype - def make_block(self, values, placement=None): + def make_block(self, values, placement=None) -> "Block": """ Create a new block, with type inference propagate any values that are not specified @@ -362,16 +368,31 @@ def delete(self, loc): self.values = np.delete(self.values, loc, 0) self.mgr_locs = self.mgr_locs.delete(loc) - def apply(self, func, **kwargs): + def apply(self, func, **kwargs) -> List["Block"]: """ apply the function to my values; return a block if we are not one """ with np.errstate(all="ignore"): result = func(self.values, **kwargs) + + return self._split_op_result(result) + + def _split_op_result(self, result) -> List["Block"]: + # See also: split_and_operate + if is_extension_array_dtype(result) and result.ndim > 1: + # if we get a 2D ExtensionArray, we need to split it into 1D pieces + nbs = [] + for i, loc in enumerate(self.mgr_locs): + vals = result[i] + nv = _block_shape(vals, ndim=self.ndim) + block = self.make_block(values=nv, placement=[loc]) + nbs.append(block) + return nbs + if not isinstance(result, Block): result = self.make_block(values=_block_shape(result, ndim=self.ndim)) - return result + return [result] def fillna(self, value, limit=None, inplace=False, downcast=None): """ fillna on the block with the value. If we fail, then convert to @@ -646,9 +667,9 @@ def to_native_types(self, slicer=None, na_rep="nan", quoting=None, **kwargs): if slicer is not None: values = values[:, slicer] mask = isna(values) + itemsize = writers.word_len(na_rep) - if not self.is_object and not quoting: - itemsize = writers.word_len(na_rep) + if not self.is_object and not quoting and itemsize: values = values.astype(f" 1: + # GH#12513 a EA dtype passed with a 2D array, split into + # multiple EAs that view the values + values = [values[:, n] for n in range(values.shape[1])] + else: + values = [values] + if columns is None: - columns = [0] - return arrays_to_mgr([values], columns, index, columns, dtype=dtype) + columns = list(range(len(values))) + return arrays_to_mgr(values, columns, index, columns, dtype=dtype) # by definition an array here # the dtypes will be coerced to a single dtype diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a81209229a3b8..847f543ebca4d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -34,10 +34,7 @@ from pandas.core.base import PandasObject from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api import Index, MultiIndex, ensure_index - -from pandas.io.formats.printing import pprint_thing - -from .blocks import ( +from pandas.core.internals.blocks import ( Block, CategoricalBlock, DatetimeTZBlock, @@ -49,13 +46,15 @@ get_block_type, make_block, ) -from .concat import ( # all for concatenate_block_managers +from pandas.core.internals.concat import ( # all for concatenate_block_managers combine_concat_plans, concatenate_join_units, get_mgr_concatenation_plan, is_uniform_join_units, ) +from pandas.io.formats.printing import pprint_thing + # TODO: flexible with index=None and/or items=None @@ 
-280,30 +279,7 @@ def unpickle_block(values, mgr_locs): unpickle_block(b["values"], b["mgr_locs"]) for b in state["blocks"] ) else: - # discard anything after 3rd, support beta pickling format for a - # little while longer - ax_arrays, bvalues, bitems = state[:3] - - self.axes = [ensure_index(ax) for ax in ax_arrays] - - if len(bitems) == 1 and self.axes[0].equals(bitems[0]): - # This is a workaround for pre-0.14.1 pickles that didn't - # support unpickling multi-block frames/panels with non-unique - # columns/items, because given a manager with items ["a", "b", - # "a"] there's no way of knowing which block's "a" is where. - # - # Single-block case can be supported under the assumption that - # block items corresponded to manager items 1-to-1. - all_mgr_locs = [slice(0, len(bitems[0]))] - else: - all_mgr_locs = [ - self.axes[0].get_indexer(blk_items) for blk_items in bitems - ] - - self.blocks = tuple( - unpickle_block(values, mgr_locs) - for values, mgr_locs in zip(bvalues, all_mgr_locs) - ) + raise NotImplementedError("pre-0.14.1 pickles are no longer supported") self._post_setstate() @@ -340,13 +316,39 @@ def _verify_integrity(self): f"tot_items: {tot_items}" ) - def apply(self, f: str, filter=None, **kwargs): + def reduce(self, func, *args, **kwargs): + # If 2D, we assume that we're operating column-wise + if self.ndim == 1: + # we'll be returning a scalar + blk = self.blocks[0] + return func(blk.values, *args, **kwargs) + + res = {} + for blk in self.blocks: + bres = func(blk.values, *args, **kwargs) + + if np.ndim(bres) == 0: + # EA + assert blk.shape[0] == 1 + new_res = zip(blk.mgr_locs.as_array, [bres]) + else: + assert bres.ndim == 1, bres.shape + assert blk.shape[0] == len(bres), (blk.shape, bres.shape, args, kwargs) + new_res = zip(blk.mgr_locs.as_array, bres) + + nr = dict(new_res) + assert not any(key in res for key in nr) + res.update(nr) + + return res + + def apply(self, f, filter=None, **kwargs): """ Iterate over the blocks, collect and create a new BlockManager. Parameters ---------- - f : str + f : str or callable Name of the Block method to apply. 
filter : list, if supplied, only call the block if the filter is in the block @@ -411,7 +413,10 @@ def apply(self, f: str, filter=None, **kwargs): axis = obj._info_axis_number kwargs[k] = obj.reindex(b_items, axis=axis, copy=align_copy) - applied = getattr(b, f)(**kwargs) + if callable(f): + applied = b.apply(f, **kwargs) + else: + applied = getattr(b, f)(**kwargs) result_blocks = _extend_blocks(applied, result_blocks) if len(result_blocks) == 0: @@ -641,12 +646,6 @@ def is_numeric_mixed_type(self): self._consolidate_inplace() return all(block.is_numeric for block in self.blocks) - @property - def is_datelike_mixed_type(self): - # Warning, consolidation needs to get checked upstairs - self._consolidate_inplace() - return any(block.is_datelike for block in self.blocks) - @property def any_extension_types(self): """Whether any of the blocks in this manager are extension blocks""" @@ -709,16 +708,16 @@ def combine(self, blocks, copy=True): return type(self)(new_blocks, axes, do_integrity_check=False) - def get_slice(self, slobj, axis=0): + def get_slice(self, slobj: slice, axis: int = 0): if axis >= self.ndim: raise IndexError("Requested axis not found in manager") if axis == 0: new_blocks = self._slice_take_blocks_ax0(slobj) else: - slicer = [slice(None)] * (axis + 1) - slicer[axis] = slobj - slicer = tuple(slicer) + _slicer = [slice(None)] * (axis + 1) + _slicer[axis] = slobj + slicer = tuple(_slicer) new_blocks = [blk.getitem_block(slicer) for blk in self.blocks] new_axes = list(self.axes) @@ -728,11 +727,11 @@ def get_slice(self, slobj, axis=0): bm._consolidate_inplace() return bm - def __contains__(self, item): + def __contains__(self, item) -> bool: return item in self.items @property - def nblocks(self): + def nblocks(self) -> int: return len(self.blocks) def copy(self, deep=True): @@ -741,16 +740,17 @@ def copy(self, deep=True): Parameters ---------- - deep : boolean o rstring, default True + deep : bool or string, default True If False, return shallow copy (do not copy data) If 'all', copy data and a deep copy of the index Returns ------- - copy : BlockManager + BlockManager """ # this preserves the notion of view copying of axes if deep: + # hit in e.g. tests.io.json.test_pandas def copy_func(ax): if deep == "all": @@ -761,20 +761,19 @@ def copy_func(ax): new_axes = [copy_func(ax) for ax in self.axes] else: new_axes = list(self.axes) + res = self.apply("copy", deep=deep) res.axes = new_axes return res - def as_array(self, transpose=False, items=None): - """Convert the blockmanager data into an numpy array. + def as_array(self, transpose: bool = False) -> np.ndarray: + """ + Convert the blockmanager data into an numpy array. Parameters ---------- transpose : boolean, default False If True, transpose the return array - items : list of strings or None - Names of block items that will be included in the returned - array. ``None`` means that all block items will be used Returns ------- @@ -784,10 +783,7 @@ def as_array(self, transpose=False, items=None): arr = np.empty(self.shape, dtype=float) return arr.transpose() if transpose else arr - if items is not None: - mgr = self.reindex_axis(items, axis=0) - else: - mgr = self + mgr = self if self._is_single_block and mgr.blocks[0].is_datetimetz: # TODO(Block.get_values): Make DatetimeTZBlock.get_values @@ -1311,7 +1307,7 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): # only one item and each mgr loc is a copy of that single # item. 
for mgr_loc in mgr_locs: - newblk = blk.copy(deep=True) + newblk = blk.copy(deep=False) newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1) blocks.append(newblk) @@ -1530,9 +1526,11 @@ def get_dtypes(self): return np.array([self._block.dtype]) def external_values(self): + """The array that Series.values returns""" return self._block.external_values() def internal_values(self): + """The array that Series._values returns""" return self._block.internal_values() def get_values(self): diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index f27e3d4527921..2bf2be082f639 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -8,6 +8,7 @@ from pandas._config import get_option from pandas._libs import NaT, Timedelta, Timestamp, iNaT, lib +from pandas._typing import Dtype, Scalar from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask @@ -37,7 +38,7 @@ _USE_BOTTLENECK = False -def set_use_bottleneck(v=True): +def set_use_bottleneck(v: bool = True) -> None: # set/unset to use bottleneck global _USE_BOTTLENECK if _BOTTLENECK_INSTALLED: @@ -93,7 +94,9 @@ def __call__(self, alt): bn_func = None @functools.wraps(alt) - def f(values, axis=None, skipna=True, **kwds): + def f( + values: np.ndarray, axis: Optional[int] = None, skipna: bool = True, **kwds + ): if len(self.kwargs) > 0: for k, v in self.kwargs.items(): if k not in kwds: @@ -129,10 +132,10 @@ def f(values, axis=None, skipna=True, **kwds): return f -def _bn_ok_dtype(dt, name: str) -> bool: +def _bn_ok_dtype(dtype: Dtype, name: str) -> bool: # Bottleneck chokes on datetime64 - if not is_object_dtype(dt) and not ( - is_datetime_or_timedelta_dtype(dt) or is_datetime64tz_dtype(dt) + if not is_object_dtype(dtype) and not ( + is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype) ): # GH 15507 @@ -163,7 +166,9 @@ def _has_infs(result) -> bool: return False -def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): +def _get_fill_value( + dtype: Dtype, fill_value: Optional[Scalar] = None, fill_value_typ=None +): """ return the correct fill value for the dtype of the values """ if fill_value is not None: return fill_value @@ -326,12 +331,12 @@ def _get_values( return values, mask, dtype, dtype_max, fill_value -def _na_ok_dtype(dtype): +def _na_ok_dtype(dtype) -> bool: # TODO: what about datetime64tz? PeriodDtype? return not issubclass(dtype.type, (np.integer, np.timedelta64, np.datetime64)) -def _wrap_results(result, dtype, fill_value=None): +def _wrap_results(result, dtype: Dtype, fill_value=None): """ wrap our results if needed """ if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): @@ -362,7 +367,9 @@ def _wrap_results(result, dtype, fill_value=None): return result -def _na_for_min_count(values, axis: Optional[int]): +def _na_for_min_count( + values: np.ndarray, axis: Optional[int] +) -> Union[Scalar, np.ndarray]: """ Return the missing value for `values`. @@ -393,7 +400,12 @@ def _na_for_min_count(values, axis: Optional[int]): return result -def nanany(values, axis=None, skipna: bool = True, mask=None): +def nanany( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, +) -> bool: """ Check if any elements along an axis evaluate to True. 
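An illustrative sketch (not part of the patch) of the nan-aware reductions whose signatures are annotated above; ``pandas.core.nanops`` is a private module, so the import is shown only to make the skipna/mask semantics concrete and may change without notice.

import numpy as np

from pandas.core import nanops

values = np.array([1.0, np.nan, 3.0])

# NaNs are skipped by default (skipna=True)
assert nanops.nanany(values)         # 1.0 and 3.0 are truthy
assert nanops.nansum(values) == 4.0  # NaN contributes nothing to the sum

# callers may pass a precomputed mask of missing values instead
mask = np.isnan(values)
assert nanops.nansum(values, mask=mask) == 4.0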
@@ -425,7 +437,12 @@ def nanany(values, axis=None, skipna: bool = True, mask=None): return values.any(axis) -def nanall(values, axis=None, skipna: bool = True, mask=None): +def nanall( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, +) -> bool: """ Check if all elements along an axis evaluate to True. @@ -458,7 +475,13 @@ def nanall(values, axis=None, skipna: bool = True, mask=None): @disallow("M8") -def nansum(values, axis=None, skipna=True, min_count=0, mask=None): +def nansum( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + min_count: int = 0, + mask: Optional[np.ndarray] = None, +) -> float: """ Sum the elements along an axis ignoring NaNs @@ -629,7 +652,7 @@ def _get_counts_nanvar( mask: Optional[np.ndarray], axis: Optional[int], ddof: int, - dtype=float, + dtype: Dtype = float, ) -> Tuple[Union[int, np.ndarray], Union[int, np.ndarray]]: """ Get the count of non-null values along an axis, accounting for degrees of freedom. @@ -776,7 +799,13 @@ def nanvar(values, axis=None, skipna=True, ddof=1, mask=None): @disallow("M8", "m8") -def nansem(values, axis=None, skipna=True, ddof=1, mask=None): +def nansem( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + ddof: int = 1, + mask: Optional[np.ndarray] = None, +) -> float: """ Compute the standard error in the mean along given axis while ignoring NaNs @@ -821,7 +850,12 @@ def nansem(values, axis=None, skipna=True, ddof=1, mask=None): def _nanminmax(meth, fill_value_typ): @bottleneck_switch(name="nan" + meth) - def reduction(values, axis=None, skipna=True, mask=None): + def reduction( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, + ) -> Dtype: values, mask, dtype, dtype_max, fill_value = _get_values( values, skipna, fill_value_typ=fill_value_typ, mask=mask @@ -831,7 +865,7 @@ def reduction(values, axis=None, skipna=True, mask=None): try: result = getattr(values, meth)(axis, dtype=dtype_max) result.fill(np.nan) - except (AttributeError, TypeError, ValueError, np.core._internal.AxisError): + except (AttributeError, TypeError, ValueError): result = np.nan else: result = getattr(values, meth)(axis) @@ -847,7 +881,12 @@ def reduction(values, axis=None, skipna=True, mask=None): @disallow("O") -def nanargmax(values, axis=None, skipna=True, mask=None): +def nanargmax( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, +) -> int: """ Parameters ---------- @@ -878,7 +917,12 @@ def nanargmax(values, axis=None, skipna=True, mask=None): @disallow("O") -def nanargmin(values, axis=None, skipna=True, mask=None): +def nanargmin( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, +) -> int: """ Parameters ---------- @@ -909,7 +953,12 @@ def nanargmin(values, axis=None, skipna=True, mask=None): @disallow("M8", "m8") -def nanskew(values, axis=None, skipna=True, mask=None): +def nanskew( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, +) -> float: """ Compute the sample skewness. 
The statistic computed here is the adjusted Fisher-Pearson standardized @@ -987,7 +1036,12 @@ def nanskew(values, axis=None, skipna=True, mask=None): @disallow("M8", "m8") -def nankurt(values, axis=None, skipna=True, mask=None): +def nankurt( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, +) -> float: """ Compute the sample excess kurtosis @@ -1075,7 +1129,13 @@ def nankurt(values, axis=None, skipna=True, mask=None): @disallow("M8", "m8") -def nanprod(values, axis=None, skipna=True, min_count=0, mask=None): +def nanprod( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + min_count: int = 0, + mask: Optional[np.ndarray] = None, +) -> float: """ Parameters ---------- @@ -1088,7 +1148,8 @@ def nanprod(values, axis=None, skipna=True, min_count=0, mask=None): Returns ------- - result : dtype + Dtype + The product of all elements on a given axis. ( NaNs are treated as 1) Examples -------- @@ -1096,10 +1157,6 @@ def nanprod(values, axis=None, skipna=True, min_count=0, mask=None): >>> s = pd.Series([1, 2, 3, np.nan]) >>> nanops.nanprod(s) 6.0 - - Returns - ------- - The product of all elements on a given axis. ( NaNs are treated as 1) """ mask = _maybe_get_mask(values, skipna, mask) @@ -1138,7 +1195,7 @@ def _get_counts( values_shape: Tuple[int], mask: Optional[np.ndarray], axis: Optional[int], - dtype=float, + dtype: Dtype = float, ) -> Union[int, np.ndarray]: """ Get the count of non-null values along an axis @@ -1184,7 +1241,13 @@ def _maybe_null_out( mask: Optional[np.ndarray], shape: Tuple, min_count: int = 1, -) -> np.ndarray: +) -> float: + """ + Returns + ------- + Dtype + The product of all elements on a given axis. ( NaNs are treated as 1) + """ if mask is not None and axis is not None and getattr(result, "ndim", False): null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 if np.any(null_mask): @@ -1218,7 +1281,9 @@ def _zero_out_fperr(arg): @disallow("M8", "m8") -def nancorr(a, b, method="pearson", min_periods=None): +def nancorr( + a: np.ndarray, b: np.ndarray, method="pearson", min_periods: Optional[int] = None, +): """ a, b: ndarrays """ @@ -1243,8 +1308,14 @@ def nancorr(a, b, method="pearson", min_periods=None): def get_corr_func(method): if method in ["kendall", "spearman"]: from scipy.stats import kendalltau, spearmanr + elif method in ["pearson"]: + pass elif callable(method): return method + else: + raise ValueError( + f"Unkown method '{method}', expected one of 'kendall', 'spearman'" + ) def _pearson(a, b): return np.corrcoef(a, b)[0, 1] @@ -1262,7 +1333,7 @@ def _spearman(a, b): @disallow("M8", "m8") -def nancov(a, b, min_periods=None): +def nancov(a: np.ndarray, b: np.ndarray, min_periods: Optional[int] = None): if len(a) != len(b): raise AssertionError("Operands to nancov must have same size") @@ -1335,9 +1406,11 @@ def f(x, y): nanne = make_nancomp(operator.ne) -def _nanpercentile_1d(values, mask, q, na_value, interpolation): +def _nanpercentile_1d( + values: np.ndarray, mask: np.ndarray, q, na_value: Scalar, interpolation +) -> Union[Scalar, np.ndarray]: """ - Wraper for np.percentile that skips missing values, specialized to + Wrapper for np.percentile that skips missing values, specialized to 1-dimensional case. 
Parameters @@ -1366,9 +1439,17 @@ def _nanpercentile_1d(values, mask, q, na_value, interpolation): return np.percentile(values, q, interpolation=interpolation) -def nanpercentile(values, q, axis, na_value, mask, ndim, interpolation): +def nanpercentile( + values: np.ndarray, + q, + axis: int, + na_value, + mask: np.ndarray, + ndim: int, + interpolation, +): """ - Wraper for np.percentile that skips missing values. + Wrapper for np.percentile that skips missing values. Parameters ---------- diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 14705f4d22e9b..1355060efd097 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -10,6 +10,7 @@ import numpy as np from pandas._libs import Timedelta, Timestamp, lib +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 from pandas.util._decorators import Appender from pandas.core.dtypes.common import is_list_like, is_timedelta64_dtype @@ -26,11 +27,11 @@ arithmetic_op, comparison_op, define_na_arithmetic_op, + get_array_op, logical_op, ) from pandas.core.ops.array_ops import comp_method_OBJECT_ARRAY # noqa:F401 from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.core.ops.dispatch import maybe_dispatch_ufunc_to_dunder_op # noqa:F401 from pandas.core.ops.dispatch import should_series_dispatch from pandas.core.ops.docstrings import ( _arith_doc_FRAME, @@ -301,7 +302,7 @@ def _get_op_name(op, special): """ opname = op.__name__.strip("_") if special: - opname = "__{opname}__".format(opname=opname) + opname = f"__{opname}__" return opname @@ -372,8 +373,10 @@ def dispatch_to_series(left, right, func, str_rep=None, axis=None): right = lib.item_from_zerodim(right) if lib.is_scalar(right) or np.ndim(right) == 0: - def column_op(a, b): - return {i: func(a.iloc[:, i], b) for i in range(len(a.columns))} + # Get the appropriate array-op to apply to each block's values. 
+ array_op = get_array_op(func, str_rep=str_rep) + bm = left._data.apply(array_op, right=right) + return type(left)(bm) elif isinstance(right, ABCDataFrame): assert right._indexed_same(left) @@ -382,7 +385,7 @@ def column_op(a, b): return {i: func(a.iloc[:, i], b.iloc[:, i]) for i in range(len(a.columns))} elif isinstance(right, ABCSeries) and axis == "columns": - # We only get here if called via _combine_frame_series, + # We only get here if called via _combine_series_frame, # in which case we specifically want to operate row-by-row assert right.index.equals(left.columns) @@ -600,9 +603,7 @@ def _combine_series_frame(self, other, func, fill_value=None, axis=None, level=N result : DataFrame """ if fill_value is not None: - raise NotImplementedError( - "fill_value {fill} not supported.".format(fill=fill_value) - ) + raise NotImplementedError(f"fill_value {fill_value} not supported.") if axis is None: # default axis is columns @@ -658,15 +659,12 @@ def to_series(right): else: raise ValueError( "Unable to coerce to DataFrame, shape " - "must be {req_shape}: given {given_shape}".format( - req_shape=left.shape, given_shape=right.shape - ) + f"must be {left.shape}: given {right.shape}" ) elif right.ndim > 2: raise ValueError( - "Unable to coerce to Series/DataFrame, dim " - "must be <= 2: {dim}".format(dim=right.shape) + f"Unable to coerce to Series/DataFrame, dim must be <= 2: {right.shape}" ) elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): @@ -699,7 +697,11 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): # Another DataFrame pass_op = op if should_series_dispatch(self, other, op) else na_op pass_op = pass_op if not is_logical else op - return self._combine_frame(other, pass_op, fill_value, level) + + left, right = self.align(other, join="outer", level=level, copy=False) + new_data = left._combine_frame(right, pass_op, fill_value) + return left._construct_result(new_data) + elif isinstance(other, ABCSeries): # For these values of `axis`, we end up dispatching to Series op, # so do not want the masked op. @@ -713,7 +715,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): if fill_value is not None: self = self.fillna(fill_value) - new_data = dispatch_to_series(self, other, op) + new_data = dispatch_to_series(self, other, op, str_rep) return self._construct_result(new_data) f.__name__ = op_name @@ -760,7 +762,7 @@ def _comp_method_FRAME(cls, op, special): str_rep = _get_opstr(op) op_name = _get_op_name(op, special) - @Appender("Wrapper for comparison method {name}".format(name=op_name)) + @Appender(f"Wrapper for comparison method {op_name}") def f(self, other): other = _align_method_FRAME(self, other, axis=None) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 40bf19c60e144..b84d468fff736 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -2,8 +2,9 @@ Functions for arithmetic and comparison operations on NumPy arrays and ExtensionArrays. 
""" +from functools import partial import operator -from typing import Any, Union +from typing import Any, Optional, Union import numpy as np @@ -51,10 +52,10 @@ def comp_method_OBJECT_ARRAY(op, x, y): if isinstance(y, (ABCSeries, ABCIndex)): y = y.values - result = libops.vec_compare(x, y, op) + result = libops.vec_compare(x.ravel(), y, op) else: - result = libops.scalar_compare(x, y, op) - return result + result = libops.scalar_compare(x.ravel(), y, op) + return result.reshape(x.shape) def masked_arith_op(x, y, op): @@ -237,15 +238,15 @@ def comparison_op( elif is_scalar(rvalues) and isna(rvalues): # numpy does not like comparisons vs None if op is operator.ne: - res_values = np.ones(len(lvalues), dtype=bool) + res_values = np.ones(lvalues.shape, dtype=bool) else: - res_values = np.zeros(len(lvalues), dtype=bool) + res_values = np.zeros(lvalues.shape, dtype=bool) elif is_object_dtype(lvalues.dtype): res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) else: - op_name = "__{op}__".format(op=op.__name__) + op_name = f"__{op.__name__}__" method = getattr(lvalues, op_name) with np.errstate(all="ignore"): res_values = method(rvalues) @@ -253,9 +254,8 @@ def comparison_op( if res_values is NotImplemented: res_values = invalid_comparison(lvalues, rvalues, op) if is_scalar(res_values): - raise TypeError( - "Could not compare {typ} type with Series".format(typ=type(rvalues)) - ) + typ = type(rvalues) + raise TypeError(f"Could not compare {typ} type with Series") return res_values @@ -292,11 +292,10 @@ def na_logical_op(x: np.ndarray, y, op): OverflowError, NotImplementedError, ): + typ = type(y).__name__ raise TypeError( - "Cannot perform '{op}' with a dtyped [{dtype}] array " - "and scalar of type [{typ}]".format( - op=op.__name__, dtype=x.dtype, typ=type(y).__name__ - ) + f"Cannot perform '{op.__name__}' with a dtyped [{x.dtype}] array " + f"and scalar of type [{typ}]" ) return result @@ -367,3 +366,27 @@ def fill_bool(x, left=None): res_values = filler(res_values) # type: ignore return res_values + + +def get_array_op(op, str_rep: Optional[str] = None): + """ + Return a binary array operation corresponding to the given operator op. + + Parameters + ---------- + op : function + Binary operator from operator or roperator module. + str_rep : str or None, default None + str_rep to pass to arithmetic_op + + Returns + ------- + function + """ + op_name = op.__name__.strip("_") + if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}: + return partial(comparison_op, op=op) + elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}: + return partial(logical_op, op=op) + else: + return partial(arithmetic_op, op=op, str_rep=str_rep) diff --git a/pandas/core/ops/dispatch.py b/pandas/core/ops/dispatch.py index 1eb952c1394ac..61a3032c7a02c 100644 --- a/pandas/core/ops/dispatch.py +++ b/pandas/core/ops/dispatch.py @@ -1,12 +1,10 @@ """ Functions for defining unary operations. """ -from typing import Any, Callable, Union +from typing import Any, Union import numpy as np -from pandas._typing import ArrayLike - from pandas.core.dtypes.common import ( is_datetime64_dtype, is_extension_array_dtype, @@ -126,94 +124,3 @@ def dispatch_to_extension_op( # on the ExtensionArray res_values = op(left, right) return res_values - - -def maybe_dispatch_ufunc_to_dunder_op( - self: ArrayLike, ufunc: Callable, method: str, *inputs: ArrayLike, **kwargs: Any -): - """ - Dispatch a ufunc to the equivalent dunder method. 
- - Parameters - ---------- - self : ArrayLike - The array whose dunder method we dispatch to - ufunc : Callable - A NumPy ufunc - method : {'reduce', 'accumulate', 'reduceat', 'outer', 'at', '__call__'} - inputs : ArrayLike - The input arrays. - kwargs : Any - The additional keyword arguments, e.g. ``out``. - - Returns - ------- - result : Any - The result of applying the ufunc - """ - # special has the ufuncs we dispatch to the dunder op on - special = { - "add", - "sub", - "mul", - "pow", - "mod", - "floordiv", - "truediv", - "divmod", - "eq", - "ne", - "lt", - "gt", - "le", - "ge", - "remainder", - "matmul", - "or", - "xor", - "and", - } - aliases = { - "subtract": "sub", - "multiply": "mul", - "floor_divide": "floordiv", - "true_divide": "truediv", - "power": "pow", - "remainder": "mod", - "divide": "div", - "equal": "eq", - "not_equal": "ne", - "less": "lt", - "less_equal": "le", - "greater": "gt", - "greater_equal": "ge", - "bitwise_or": "or", - "bitwise_and": "and", - "bitwise_xor": "xor", - } - - # For op(., Array) -> Array.__r{op}__ - flipped = { - "lt": "__gt__", - "le": "__ge__", - "gt": "__lt__", - "ge": "__le__", - "eq": "__eq__", - "ne": "__ne__", - } - - op_name = ufunc.__name__ - op_name = aliases.get(op_name, op_name) - - def not_implemented(*args, **kwargs): - return NotImplemented - - if method == "__call__" and op_name in special and kwargs.get("out") is None: - if isinstance(inputs[0], type(self)): - name = "__{}__".format(op_name) - return getattr(self, name, not_implemented)(inputs[1]) - else: - name = flipped.get(op_name, "__r{}__".format(op_name)) - return getattr(self, name, not_implemented)(inputs[0]) - else: - return NotImplemented diff --git a/pandas/core/ops/invalid.py b/pandas/core/ops/invalid.py index 013ff7689b221..cc4a1f11edd2b 100644 --- a/pandas/core/ops/invalid.py +++ b/pandas/core/ops/invalid.py @@ -30,11 +30,8 @@ def invalid_comparison(left, right, op): elif op is operator.ne: res_values = np.ones(left.shape, dtype=bool) else: - raise TypeError( - "Invalid comparison between dtype={dtype} and {typ}".format( - dtype=left.dtype, typ=type(right).__name__ - ) - ) + typ = type(right).__name__ + raise TypeError(f"Invalid comparison between dtype={left.dtype} and {typ}") return res_values @@ -52,10 +49,8 @@ def make_invalid_op(name: str): """ def invalid_op(self, other=None): - raise TypeError( - "cannot perform {name} with this index type: " - "{typ}".format(name=name, typ=type(self).__name__) - ) + typ = type(self).__name__ + raise TypeError(f"cannot perform {name} with this index type: {typ}") invalid_op.__name__ = name return invalid_op diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py index 8c66eea270c76..c04658565f235 100644 --- a/pandas/core/ops/methods.py +++ b/pandas/core/ops/methods.py @@ -102,7 +102,8 @@ def f(self, other): return self - f.__name__ = "__i{name}__".format(name=method.__name__.strip("__")) + name = method.__name__.strip("__") + f.__name__ = f"__i{name}__" return f new_methods.update( @@ -214,7 +215,7 @@ def _create_methods(cls, arith_method, comp_method, bool_method, special): ) if special: - dunderize = lambda x: "__{name}__".format(name=x.strip("_")) + dunderize = lambda x: f"__{x.strip('_')}__" else: dunderize = lambda x: x new_methods = {dunderize(k): v for k, v in new_methods.items()} diff --git a/pandas/core/ops/missing.py b/pandas/core/ops/missing.py index 96a615d488bf2..5039ffab33fbd 100644 --- a/pandas/core/ops/missing.py +++ b/pandas/core/ops/missing.py @@ -27,7 +27,7 @@ from pandas.core.dtypes.common 
import is_float_dtype, is_integer_dtype, is_scalar -from .roperator import rdivmod, rfloordiv, rmod +from pandas.core.ops.roperator import rdivmod, rfloordiv, rmod def fill_zeros(result, x, y): diff --git a/pandas/core/ops/roperator.py b/pandas/core/ops/roperator.py index 4cb02238aea16..e6691ddf8984e 100644 --- a/pandas/core/ops/roperator.py +++ b/pandas/core/ops/roperator.py @@ -34,9 +34,8 @@ def rmod(left, right): # formatting operation; this is a TypeError # otherwise perform the op if isinstance(right, str): - raise TypeError( - "{typ} cannot perform the operation mod".format(typ=type(left).__name__) - ) + typ = type(left).__name__ + raise TypeError(f"{typ} cannot perform the operation mod") return right % left diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 2294c846e81c7..fb837409a00f5 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -96,7 +96,7 @@ def __str__(self) -> str: ) return f"{type(self).__name__} [{', '.join(attrs)}]" - def __getattr__(self, attr): + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) if attr in self._attributes: @@ -131,7 +131,7 @@ def ax(self): return self.groupby.ax @property - def _typ(self): + def _typ(self) -> str: """ Masquerade for compat as a Series or a DataFrame. """ @@ -140,7 +140,7 @@ def _typ(self): return "dataframe" @property - def _from_selection(self): + def _from_selection(self) -> bool: """ Is the resampling from a DataFrame column or MultiIndex level. """ @@ -316,7 +316,7 @@ def _downsample(self, f): def _upsample(self, f, limit=None, fill_value=None): raise AbstractMethodError(self) - def _gotitem(self, key, ndim, subset=None): + def _gotitem(self, key, ndim: int, subset=None): """ Sub-classes to define. Return a sliced object. @@ -1025,8 +1025,7 @@ def _downsample(self, how, **kwargs): if not len(ax): # reset to the new freq obj = obj.copy() - # TODO: find a less code-smelly way to set this - obj.index._data._freq = self.freq + obj.index._set_freq(self.freq) return obj # do we have a regular frequency @@ -1077,10 +1076,9 @@ def _upsample(self, method, limit=None, fill_value=None): raise AssertionError("axis must be 0") if self._from_selection: raise ValueError( - "Upsampling from level= or on= selection" - " is not supported, use .set_index(...)" - " to explicitly set index to" - " datetime-like" + "Upsampling from level= or on= selection " + "is not supported, use .set_index(...) " + "to explicitly set index to datetime-like" ) ax = self.ax @@ -1136,9 +1134,9 @@ def _convert_obj(self, obj): if self._from_selection: # see GH 14008, GH 12871 msg = ( - "Resampling from level= or on= selection" - " with a PeriodIndex is not currently supported," - " use .set_index(...) to explicitly set index" + "Resampling from level= or on= selection " + "with a PeriodIndex is not currently supported, " + "use .set_index(...) to explicitly set index" ) raise NotImplementedError(msg) @@ -1264,7 +1262,7 @@ def _constructor(self): return TimedeltaIndexResampler -def resample(obj, kind=None, **kwds): +def get_resampler(obj, kind=None, **kwds): """ Create a TimeGrouper and return our resampler. 
""" @@ -1272,7 +1270,7 @@ def resample(obj, kind=None, **kwds): return tg._get_resampler(obj, kind=kind) -resample.__doc__ = Resampler.__doc__ +get_resampler.__doc__ = Resampler.__doc__ def get_resampler_for_grouping( @@ -1408,7 +1406,7 @@ def _get_resampler(self, obj, kind=None): f"but got an instance of '{type(ax).__name__}'" ) - def _get_grouper(self, obj, validate=True): + def _get_grouper(self, obj, validate: bool = True): # create the resampler and return our binner r = self._get_resampler(obj) r._set_binner() @@ -1588,7 +1586,10 @@ def _get_period_bins(self, ax): rng += freq_mult # adjust bin edge indexes to account for base rng -= bin_shift - bins = memb.searchsorted(rng, side="left") + + # Wrap in PeriodArray for PeriodArray.searchsorted + prng = type(memb._data)(rng, dtype=memb.dtype) + bins = memb.searchsorted(prng, side="left") if nat_count > 0: # NaT handling as in pandas._lib.lib.generate_bins_dt64() @@ -1700,8 +1701,8 @@ def _get_period_range_edges(first, last, offset, closed="left", base=0): # GH 23882 first = first.to_timestamp() last = last.to_timestamp() - adjust_first = not offset.onOffset(first) - adjust_last = offset.onOffset(last) + adjust_first = not offset.is_on_offset(first) + adjust_last = offset.is_on_offset(last) first, last = _get_timestamp_range_edges( first, last, offset, closed=closed, base=base diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index cea70012b47ea..9528de36a3664 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -2,10 +2,12 @@ concat routines """ -from typing import List +from typing import Hashable, Iterable, List, Mapping, Optional, Union, overload import numpy as np +from pandas._typing import FrameOrSeriesUnion + from pandas import DataFrame, Index, MultiIndex, Series from pandas.core.arrays.categorical import ( factorize_from_iterable, @@ -26,8 +28,27 @@ # Concatenate DataFrame objects +@overload +def concat( + objs: Union[Iterable["DataFrame"], Mapping[Optional[Hashable], "DataFrame"]], + axis=0, + join: str = "outer", + ignore_index: bool = False, + keys=None, + levels=None, + names=None, + verify_integrity: bool = False, + sort: bool = False, + copy: bool = True, +) -> "DataFrame": + ... + + +@overload def concat( - objs, + objs: Union[ + Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] + ], axis=0, join: str = "outer", ignore_index: bool = False, @@ -37,7 +58,24 @@ def concat( verify_integrity: bool = False, sort: bool = False, copy: bool = True, -): +) -> FrameOrSeriesUnion: + ... + + +def concat( + objs: Union[ + Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] + ], + axis=0, + join="outer", + ignore_index: bool = False, + keys=None, + levels=None, + names=None, + verify_integrity: bool = False, + sort: bool = False, + copy: bool = True, +) -> FrameOrSeriesUnion: """ Concatenate pandas objects along a particular axis with optional set logic along the other axes. @@ -109,7 +147,7 @@ def concat( A walkthrough of how this method fits in with other tools for combining pandas objects can be found `here - `__. + `__. 
Examples -------- @@ -267,8 +305,7 @@ def __init__( if isinstance(objs, (NDFrame, str)): raise TypeError( "first argument must be an iterable of pandas " - "objects, you passed an object of type " - '"{name}"'.format(name=type(objs).__name__) + f'objects, you passed an object of type "{type(objs).__name__}"' ) if join == "outer": @@ -313,8 +350,8 @@ def __init__( for obj in objs: if not isinstance(obj, (Series, DataFrame)): msg = ( - "cannot concatenate object of type '{typ}';" - " only Series and DataFrame objs are valid".format(typ=type(obj)) + "cannot concatenate object of type '{typ}'; " + "only Series and DataFrame objs are valid".format(typ=type(obj)) ) raise TypeError(msg) @@ -364,8 +401,8 @@ def __init__( self._is_series = isinstance(sample, Series) if not 0 <= axis <= sample.ndim: raise AssertionError( - "axis must be between 0 and {ndim}, input was" - " {axis}".format(ndim=sample.ndim, axis=axis) + "axis must be between 0 and {ndim}, input was " + "{axis}".format(ndim=sample.ndim, axis=axis) ) # if we have mixed ndims, then convert to highest ndim @@ -462,9 +499,7 @@ def get_result(self): new_data._consolidate_inplace() cons = self.objs[0]._constructor - return cons._from_axes(new_data, self.new_axes).__finalize__( - self, method="concat" - ) + return cons(new_data).__finalize__(self, method="concat") def _get_result_dim(self) -> int: if self._is_series and self.axis == 1: @@ -472,17 +507,12 @@ def _get_result_dim(self) -> int: else: return self.objs[0].ndim - def _get_new_axes(self): + def _get_new_axes(self) -> List[Index]: ndim = self._get_result_dim() - new_axes = [None] * ndim - - for i in range(ndim): - if i == self.axis: - continue - new_axes[i] = self._get_comb_axis(i) - - new_axes[self.axis] = self._get_concat_axis() - return new_axes + return [ + self._get_concat_axis() if i == self.axis else self._get_comb_axis(i) + for i in range(ndim) + ] def _get_comb_axis(self, i: int) -> Index: data_axis = self.objs[0]._get_block_manager_axis(i) @@ -501,7 +531,7 @@ def _get_concat_axis(self) -> Index: idx = ibase.default_index(len(self.objs)) return idx elif self.keys is None: - names: List = [None] * len(self.objs) + names: List[Optional[Hashable]] = [None] * len(self.objs) num = 0 has_names = False for i, x in enumerate(self.objs): @@ -544,10 +574,7 @@ def _maybe_check_integrity(self, concat_index: Index): if self.verify_integrity: if not concat_index.is_unique: overlap = concat_index[concat_index.duplicated()].unique() - raise ValueError( - "Indexes have overlapping values: " - "{overlap!s}".format(overlap=overlap) - ) + raise ValueError(f"Indexes have overlapping values: {overlap}") def _concat_indexes(indexes) -> Index: @@ -615,8 +642,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde # make sure that all of the passed indices have the same nlevels if not len({idx.nlevels for idx in indexes}) == 1: raise AssertionError( - "Cannot concat indices that do" - " not have the same number of levels" + "Cannot concat indices that do not have the same number of levels" ) # also copies diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 38bda94489d01..d04287e1e9088 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -51,9 +51,8 @@ def melt( missing = Index(com.flatten(id_vars)).difference(cols) if not missing.empty: raise KeyError( - "The following 'id_vars' are not present" - " in the DataFrame: {missing}" - "".format(missing=list(missing)) + "The following 'id_vars' are not present " + f"in the 
DataFrame: {list(missing)}" ) else: id_vars = [] @@ -73,9 +72,8 @@ def melt( missing = Index(com.flatten(value_vars)).difference(cols) if not missing.empty: raise KeyError( - "The following 'value_vars' are not present in" - " the DataFrame: {missing}" - "".format(missing=list(missing)) + "The following 'value_vars' are not present in " + f"the DataFrame: {list(missing)}" ) frame = frame.loc[:, id_vars + value_vars] else: @@ -192,7 +190,9 @@ def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFr return data._constructor(mdata, columns=id_cols + pivot_cols) -def wide_to_long(df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"): +def wide_to_long( + df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+" +) -> DataFrame: r""" Wide panel to long format. Less flexible but more user-friendly than melt. diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 37ec05c40940e..ceee2f66dba42 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -41,6 +41,7 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype from pandas import Categorical, Index, MultiIndex +from pandas.core import groupby import pandas.core.algorithms as algos from pandas.core.arrays.categorical import _recode_for_categories import pandas.core.common as com @@ -68,7 +69,7 @@ def merge( copy: bool = True, indicator: bool = False, validate=None, -): +) -> "DataFrame": op = _MergeOperation( left, right, @@ -113,6 +114,7 @@ def _groupby_and_merge( by = [by] lby = left.groupby(by, sort=False) + rby: Optional[groupby.DataFrameGroupBy] = None # if we can groupby the rhs # then we can get vastly better perf @@ -132,7 +134,7 @@ def _groupby_and_merge( try: rby = right.groupby(by, sort=False) except KeyError: - rby = None + pass for key, lhs in lby: @@ -183,7 +185,7 @@ def merge_ordered( fill_method=None, suffixes=("_x", "_y"), how: str = "outer", -): +) -> "DataFrame": """ Perform merge with optional filling/interpolation. @@ -317,7 +319,7 @@ def merge_asof( tolerance=None, allow_exact_matches: bool = True, direction: str = "backward", -): +) -> "DataFrame": """ Perform an asof merge. This is similar to a left-join except that we match on nearest key rather than equal keys. @@ -598,13 +600,11 @@ def __init__( if not is_bool(left_index): raise ValueError( - "left_index parameter must be of type bool, not " - "{left_index}".format(left_index=type(left_index)) + f"left_index parameter must be of type bool, not {type(left_index)}" ) if not is_bool(right_index): raise ValueError( - "right_index parameter must be of type bool, not " - "{right_index}".format(right_index=type(right_index)) + f"right_index parameter must be of type bool, not {type(right_index)}" ) # warn user when merging between different levels @@ -1071,9 +1071,8 @@ def _maybe_coerce_merge_keys(self): continue msg = ( - "You are trying to merge on {lk_dtype} and " - "{rk_dtype} columns. If you wish to proceed " - "you should use pd.concat".format(lk_dtype=lk.dtype, rk_dtype=rk.dtype) + f"You are trying to merge on {lk.dtype} and " + f"{rk.dtype} columns. 
If you wish to proceed you should use pd.concat" ) # if we are numeric, then allow differing @@ -1090,8 +1089,7 @@ def _maybe_coerce_merge_keys(self): warnings.warn( "You are merging on int and float " "columns where the float values " - "are not equal to their int " - "representation", + "are not equal to their int representation", UserWarning, ) continue @@ -1101,8 +1099,7 @@ def _maybe_coerce_merge_keys(self): warnings.warn( "You are merging on int and float " "columns where the float values " - "are not equal to their int " - "representation", + "are not equal to their int representation", UserWarning, ) continue @@ -1244,32 +1241,29 @@ def _validate(self, validate: str): if validate in ["one_to_one", "1:1"]: if not left_unique and not right_unique: raise MergeError( - "Merge keys are not unique in either left" - " or right dataset; not a one-to-one merge" + "Merge keys are not unique in either left " + "or right dataset; not a one-to-one merge" ) elif not left_unique: raise MergeError( - "Merge keys are not unique in left dataset;" - " not a one-to-one merge" + "Merge keys are not unique in left dataset; not a one-to-one merge" ) elif not right_unique: raise MergeError( - "Merge keys are not unique in right dataset;" - " not a one-to-one merge" + "Merge keys are not unique in right dataset; not a one-to-one merge" ) elif validate in ["one_to_many", "1:m"]: if not left_unique: raise MergeError( - "Merge keys are not unique in left dataset;" - " not a one-to-many merge" + "Merge keys are not unique in left dataset; not a one-to-many merge" ) elif validate in ["many_to_one", "m:1"]: if not right_unique: raise MergeError( - "Merge keys are not unique in right dataset;" - " not a many-to-one merge" + "Merge keys are not unique in right dataset; " + "not a many-to-one merge" ) elif validate in ["many_to_many", "m:m"]: @@ -1831,8 +1825,7 @@ def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = raise AssertionError( "If more than one join key is given then " "'right_ax' must be a MultiIndex and the " - "number of join keys must be the number of " - "levels in right_ax" + "number of join keys must be the number of levels in right_ax" ) left_indexer, right_indexer = _get_multiindex_indexer( @@ -2002,8 +1995,7 @@ def _validate_operand(obj: FrameOrSeries) -> "DataFrame": return obj.to_frame() else: raise TypeError( - "Can only merge Series or DataFrame objects, " - "a {obj} was passed".format(obj=type(obj)) + f"Can only merge Series or DataFrame objects, a {type(obj)} was passed" ) @@ -2019,10 +2011,7 @@ def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix): return left, right if not lsuffix and not rsuffix: - raise ValueError( - "columns overlap but no suffix specified: " - "{rename}".format(rename=to_rename) - ) + raise ValueError(f"columns overlap but no suffix specified: {to_rename}") def renamer(x, suffix): """ @@ -2041,7 +2030,7 @@ def renamer(x, suffix): x : renamed column name """ if x in to_rename and suffix is not None: - return "{x}{suffix}".format(x=x, suffix=suffix) + return f"{x}{suffix}" return x lrenamer = partial(renamer, suffix=lsuffix) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 4b21045cd0217..930ff5f454a7b 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Callable, Dict, Tuple, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Tuple, Union import numpy as np @@ -35,12 +35,12 @@ def pivot_table( 
dropna=True, margins_name="All", observed=False, -): +) -> "DataFrame": index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): - pieces = [] + pieces: List[DataFrame] = [] keys = [] for func in aggfunc: table = pivot_table( @@ -117,7 +117,9 @@ def pivot_table( agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged - if table.index.nlevels > 1: + + # GH17038, this check should only happen if index is defined (not None) + if table.index.nlevels > 1 and index: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. @@ -148,7 +150,7 @@ def pivot_table( table = table.sort_index(axis=1) if fill_value is not None: - table = table.fillna(value=fill_value, downcast="infer") + table = table._ensure_type(table.fillna(fill_value, downcast="infer")) if margins: if dropna: @@ -426,7 +428,10 @@ def _convert_by(by): @Substitution("\ndata : DataFrame") @Appender(_shared_docs["pivot"], indents=1) -def pivot(data: "DataFrame", index=None, columns=None, values=None): +def pivot(data: "DataFrame", index=None, columns=None, values=None) -> "DataFrame": + if columns is None: + raise TypeError("pivot() missing 1 required argument: 'columns'") + if values is None: cols = [columns] if index is None else [index, columns] append = index is None @@ -459,7 +464,7 @@ def crosstab( margins_name: str = "All", dropna: bool = True, normalize=False, -): +) -> "DataFrame": """ Compute a simple cross tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an @@ -579,6 +584,8 @@ def crosstab( from pandas import DataFrame df = DataFrame(data, index=common_idx) + original_df_cols = df.columns + if values is None: df["__dummy__"] = 0 kwargs = {"aggfunc": len, "fill_value": 0} @@ -587,7 +594,7 @@ def crosstab( kwargs = {"aggfunc": aggfunc} table = df.pivot_table( - "__dummy__", + ["__dummy__"], index=rownames, columns=colnames, margins=margins, @@ -596,6 +603,12 @@ def crosstab( **kwargs, ) + # GH18321, after pivoting, an extra top level of column index of `__dummy__` is + # created, and this extra level should not be included in the further steps + if not table.empty: + cols_diff = df.columns.difference(original_df_cols)[0] + table = table[cols_diff] + # Post-process if normalize is not False: table = _normalize( diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 004bd0199eb58..97f416e32d07b 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,5 +1,6 @@ from functools import partial import itertools +from typing import List import numpy as np @@ -357,7 +358,7 @@ def _unstack_multiple(data, clocs, fill_value=None): result = data for i in range(len(clocs)): val = clocs[i] - result = result.unstack(val) + result = result.unstack(val, fill_value=fill_value) clocs = [v if i > v else v - 1 for v in clocs] return result @@ -755,7 +756,7 @@ def get_dummies( sparse=False, drop_first=False, dtype=None, -): +) -> "DataFrame": """ Convert categorical variable into dummy/indicator variables. @@ -899,7 +900,7 @@ def check_len(item, name): if data_to_encode.shape == data.shape: # Encoding the entire df, do not prepend any dropped columns - with_dummies = [] + with_dummies: List[DataFrame] = [] elif columns is not None: # Encoding only cols specified in columns. Get all cols not in # columns to prepend to result. 
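A minimal sketch of the guard added to pivot() above; the frame and column names are illustrative, not part of the patch:

import pandas as pd

df = pd.DataFrame({"foo": ["one", "one", "two"],
                   "bar": ["A", "B", "A"],
                   "baz": [1, 2, 3]})

# Omitting `columns` now fails up front with the message added in this change.
try:
    df.pivot(index="foo", values="baz")
except TypeError as err:
    print(err)  # pivot() missing 1 required argument: 'columns'

# The supported call is unchanged.
print(df.pivot(index="foo", columns="bar", values="baz"))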
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index ceb4e3290ff75..5a444d908b786 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -4,7 +4,6 @@ import numpy as np from pandas._libs import Timedelta, Timestamp -from pandas._libs.interval import Interval from pandas._libs.lib import infer_dtype from pandas.core.dtypes.common import ( @@ -16,6 +15,7 @@ is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_integer, + is_list_like, is_scalar, is_timedelta64_dtype, ) @@ -66,11 +66,12 @@ def cut( ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` indicate (1,2], (2,3], (3,4]. This argument is ignored when `bins` is an IntervalIndex. - labels : array or bool, optional + labels : array or False, default None Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the bins. This affects the type of the output container (see below). - This argument is ignored when `bins` is an IntervalIndex. + This argument is ignored when `bins` is an IntervalIndex. If True, + raises an error. retbins : bool, default False Whether to return the bins or not. Useful when bins is provided as a scalar. @@ -287,10 +288,10 @@ def qcut( q : int or list-like of int Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles. - labels : array or bool, default None + labels : array or False, default None Used as labels for the resulting bins. Must be of the same length as the resulting bins. If False, return only integer indicators of the - bins. + bins. If True, raises an error. retbins : bool, optional Whether to return the (bins, labels) or not. Can be useful if bins is given as a scalar. 
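The stricter handling of `labels` described in the cut/qcut docstrings above can be sketched as follows; the data is illustrative:

import pandas as pd

ages = [6, 12, 18, 35, 60]

# List-like labels are still accepted and must be one fewer than the bin edges.
print(pd.cut(ages, bins=3, labels=["young", "middle", "old"]))

# Anything other than False, None, or a list-like (e.g. labels=True) is now
# rejected with a ValueError instead of failing in a less obvious way.
try:
    pd.cut(ages, bins=3, labels=True)
except ValueError as err:
    print(err)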
@@ -362,8 +363,7 @@ def _bins_to_cuts( if duplicates not in ["raise", "drop"]: raise ValueError( - "invalid value for 'duplicates' parameter, " - "valid options are: raise, drop" + "invalid value for 'duplicates' parameter, valid options are: raise, drop" ) if isinstance(bins, IntervalIndex): @@ -392,15 +392,23 @@ def _bins_to_cuts( has_nas = na_mask.any() if labels is not False: - if labels is None: + if not (labels is None or is_list_like(labels)): + raise ValueError( + "Bin labels must either be False, None or passed in as a " + "list-like argument" + ) + + elif labels is None: labels = _format_labels( bins, precision, right=right, include_lowest=include_lowest, dtype=dtype ) + else: if len(labels) != len(bins) - 1: raise ValueError( "Bin labels must be one fewer than the number of bin edges" ) + if not is_categorical_dtype(labels): labels = Categorical(labels, categories=labels, ordered=True) @@ -516,17 +524,11 @@ def _format_labels( adjust = lambda x: x - 10 ** (-precision) breaks = [formatter(b) for b in bins] - labels = IntervalIndex.from_breaks(breaks, closed=closed) - if right and include_lowest: - # we will adjust the left hand side by precision to - # account that we are all right closed - v = adjust(labels[0].left) - - i = IntervalIndex([Interval(v, labels[0].right, closed="right")]) - labels = i.append(labels[1:]) + # adjust lhs of first interval by precision to account for being right closed + breaks[0] = adjust(breaks[0]) - return labels + return IntervalIndex.from_breaks(breaks, closed=closed) def _preprocess_for_cut(x): diff --git a/pandas/core/series.py b/pandas/core/series.py index 54c163330e6ee..ffe0642f799fa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4,7 +4,18 @@ from io import StringIO from shutil import get_terminal_size from textwrap import dedent -from typing import Any, Callable, Hashable, List, Optional +from typing import ( + IO, + TYPE_CHECKING, + Any, + Callable, + Hashable, + Iterable, + List, + Optional, + Tuple, + Type, +) import warnings import numpy as np @@ -12,6 +23,7 @@ from pandas._config import get_option from pandas._libs import index as libindex, lib, reshape, tslibs +from pandas._typing import Label from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution from pandas.util._validators import validate_bool_kwarg, validate_percentile @@ -33,7 +45,6 @@ ) from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCDatetimeArray, ABCDatetimeIndex, ABCSeries, ABCSparseArray, @@ -80,6 +91,10 @@ import pandas.io.formats.format as fmt import pandas.plotting +if TYPE_CHECKING: + from pandas.core.frame import DataFrame + from pandas.core.groupby.generic import SeriesGroupBy + __all__ = ["Series"] _shared_doc_kwargs = dict( @@ -159,7 +174,8 @@ class Series(base.IndexOpsMixin, generic.NDFrame): _typ = "series" - _metadata: List[str] = [] + _name: Optional[Hashable] + _metadata: List[str] = ["name"] _accessors = {"dt", "cat", "str", "sparse"} _deprecations = ( base.IndexOpsMixin._deprecations @@ -181,6 +197,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): def __init__( self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False ): + # we are called internally, so short-circuit if fastpath: @@ -194,6 +211,8 @@ def __init__( else: + name = ibase.maybe_extract_name(name, data, type(self)) + if is_empty_data(data) and dtype is None: # gh-17261 warnings.warn( @@ -219,8 +238,6 @@ def __init__( "initializing a Series from a MultiIndex is not supported" ) elif 
isinstance(data, Index): - if name is None: - name = data.name if dtype is not None: # astype copies @@ -235,16 +252,21 @@ def __init__( copy = False elif isinstance(data, np.ndarray): + if len(data.dtype): + # GH#13296 we are dealing with a compound dtype, which + # should be treated as 2D + raise ValueError( + "Cannot construct a Series from an ndarray with " + "compound dtype. Use DataFrame instead." + ) pass elif isinstance(data, ABCSeries): - if name is None: - name = data.name if index is None: index = data.index else: data = data.reindex(index, copy=copy) data = data._data - elif isinstance(data, dict): + elif is_dict_like(data): data, index = self._init_dict(data, index, dtype) dtype = None copy = False @@ -349,11 +371,11 @@ def _init_dict(self, data, index=None, dtype=None): # ---------------------------------------------------------------------- @property - def _constructor(self): + def _constructor(self) -> Type["Series"]: return Series @property - def _constructor_expanddim(self): + def _constructor_expanddim(self) -> Type["DataFrame"]: from pandas.core.frame import DataFrame return DataFrame @@ -365,7 +387,7 @@ def _can_hold_na(self): _index = None - def _set_axis(self, axis, labels, fastpath=False): + def _set_axis(self, axis, labels, fastpath=False) -> None: """ Override generic, we want to set the _typ here. """ @@ -419,13 +441,13 @@ def dtypes(self): @property def name(self) -> Optional[Hashable]: - return self.attrs.get("name", None) + return self._name @name.setter def name(self, value: Optional[Hashable]) -> None: if not is_hashable(value): raise TypeError("Series.name must be a hashable type") - self.attrs["name"] = value + object.__setattr__(self, "_name", value) @property def values(self): @@ -504,37 +526,13 @@ def ravel(self, order="C"): """ return self._values.ravel(order=order) - def compress(self, condition, *args, **kwargs): - """ - Return selected slices of an array along given axis as a Series. - - .. deprecated:: 0.24.0 - - Returns - ------- - Series - Series without the slices for which condition is false. - - See Also - -------- - numpy.ndarray.compress - """ - msg = ( - "Series.compress(condition) is deprecated. " - "Use 'Series[condition]' or " - "'np.asarray(series).compress(condition)' instead." - ) - warnings.warn(msg, FutureWarning, stacklevel=2) - nv.validate_compress(args, kwargs) - return self[condition] - def __len__(self) -> int: """ Return the length of the Series. """ return len(self._data) - def view(self, dtype=None): + def view(self, dtype=None) -> "Series": """ Create a new view of the Series. @@ -687,7 +685,7 @@ def construct_return(result): else: return construct_return(result) - def __array__(self, dtype=None): + def __array__(self, dtype=None) -> np.ndarray: """ Return the values as a NumPy array. @@ -703,7 +701,7 @@ def __array__(self, dtype=None): Returns ------- numpy.ndarray - The values in the series converted to a :class:`numpy.ndarary` + The values in the series converted to a :class:`numpy.ndarray` with the specified `dtype`. 
See Also @@ -727,28 +725,13 @@ def __array__(self, dtype=None): Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')], dtype=object) - Or the values may be localized to UTC and the tzinfo discared with + Or the values may be localized to UTC and the tzinfo discarded with ``dtype='datetime64[ns]'`` >>> np.asarray(tzser, dtype="datetime64[ns]") # doctest: +ELLIPSIS array(['1999-12-31T23:00:00.000000000', ...], dtype='datetime64[ns]') """ - if ( - dtype is None - and isinstance(self.array, ABCDatetimeArray) - and getattr(self.dtype, "tz", None) - ): - msg = ( - "Converting timezone-aware DatetimeArray to timezone-naive " - "ndarray with 'datetime64[ns]' dtype. In the future, this " - "will return an ndarray with 'object' dtype where each " - "element is a 'pandas.Timestamp' with the correct 'tz'.\n\t" - "To accept the future behavior, pass 'dtype=object'.\n\t" - "To keep the old behavior, pass 'dtype=\"datetime64[ns]\"'." - ) - warnings.warn(msg, FutureWarning, stacklevel=3) - dtype = "M8[ns]" return np.asarray(self.array, dtype) # ---------------------------------------------------------------------- @@ -761,38 +744,9 @@ def __array__(self, dtype=None): # ---------------------------------------------------------------------- - def _unpickle_series_compat(self, state): - if isinstance(state, dict): - self._data = state["_data"] - self.name = state["name"] - self.index = self._data.index - - elif isinstance(state, tuple): - - # < 0.12 series pickle - - nd_state, own_state = state - - # recreate the ndarray - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - - # backwards compat - index, name = own_state[0], None - if len(own_state) > 1: - name = own_state[1] - - # recreate - self._data = SingleBlockManager(data, index, fastpath=True) - self._index = index - self.name = name - - else: - raise Exception(f"cannot unpickle legacy formats -> [{state}]") - # indexers @property - def axes(self): + def axes(self) -> List[Index]: """ Return a list of the row axis labels. 
""" @@ -802,7 +756,7 @@ def axes(self): # Indexing Methods @Appender(generic.NDFrame.take.__doc__) - def take(self, indices, axis=0, is_copy=False, **kwargs): + def take(self, indices, axis=0, is_copy=False, **kwargs) -> "Series": nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) @@ -848,27 +802,19 @@ def _ixs(self, i: int, axis: int = 0): else: return values[i] - def _slice(self, slobj: slice, axis: int = 0, kind=None): + def _slice(self, slobj: slice, axis: int = 0, kind=None) -> "Series": slobj = self.index._convert_slice_indexer(slobj, kind=kind or "getitem") return self._get_values(slobj) def __getitem__(self, key): key = com.apply_if_callable(key, self) + + if key is Ellipsis: + return self + try: result = self.index.get_value(self, key) - if not is_scalar(result): - if is_list_like(result) and not isinstance(result, Series): - - # we need to box if loc of the key isn't scalar here - # otherwise have inline ndarray/lists - try: - if not is_scalar(self.index.get_loc(key)): - result = self._constructor( - result, index=[key] * len(result), dtype=self.dtype - ).__finalize__(self) - except KeyError: - pass return result except InvalidIndexError: pass @@ -876,8 +822,6 @@ def __getitem__(self, key): if isinstance(key, tuple) and isinstance(self.index, MultiIndex): # kludge pass - elif key is Ellipsis: - return self elif com.is_bool_indexer(key): pass else: @@ -985,7 +929,7 @@ def _get_value(self, label, takeable: bool = False): """ if takeable: return com.maybe_box_datetimelike(self._values[label]) - return self.index.get_value(self._values, label) + return self.index.get_value(self, label) def __setitem__(self, key, value): key = com.apply_if_callable(key, self) @@ -1132,7 +1076,7 @@ def _set_value(self, label, value, takeable: bool = False): def _is_mixed_type(self): return False - def repeat(self, repeats, axis=None): + def repeat(self, repeats, axis=None) -> "Series": """ Repeat elements of a Series. @@ -1421,8 +1365,8 @@ def to_string( # catch contract violations if not isinstance(result, str): raise AssertionError( - "result must be of type str, type" - f" of result is {repr(type(result).__name__)}" + "result must be of type str, type " + f"of result is {repr(type(result).__name__)}" ) if buf is None: @@ -1434,9 +1378,30 @@ def to_string( with open(buf, "w") as f: f.write(result) + @Appender( + """ + Examples + -------- + >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal") + >>> print(s.to_markdown()) + | | animal | + |---:|:---------| + | 0 | elk | + | 1 | pig | + | 2 | dog | + | 3 | quetzal | + """ + ) + @Substitution(klass="Series") + @Appender(generic._shared_docs["to_markdown"]) + def to_markdown( + self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs + ) -> Optional[str]: + return self.to_frame().to_markdown(buf, mode, **kwargs) + # ---------------------------------------------------------------------- - def items(self): + def items(self) -> Iterable[Tuple[Label, Any]]: """ Lazily iterate over (index, value) tuples. @@ -1466,13 +1431,13 @@ def items(self): return zip(iter(self.index), iter(self)) @Appender(items.__doc__) - def iteritems(self): + def iteritems(self) -> Iterable[Tuple[Label, Any]]: return self.items() # ---------------------------------------------------------------------- # Misc public methods - def keys(self): + def keys(self) -> Index: """ Return alias for index. 
@@ -1518,7 +1483,7 @@ def to_dict(self, into=dict): into_c = com.standardize_mapping(into) return into_c(self.items()) - def to_frame(self, name=None): + def to_frame(self, name=None) -> "DataFrame": """ Convert Series to DataFrame. @@ -1550,7 +1515,7 @@ def to_frame(self, name=None): return df - def _set_name(self, name, inplace=False): + def _set_name(self, name, inplace=False) -> "Series": """ Set the Series name. @@ -1565,6 +1530,90 @@ def _set_name(self, name, inplace=False): ser.name = name return ser + @Appender( + """ +Examples +-------- +>>> ser = pd.Series([390., 350., 30., 20.], +... index=['Falcon', 'Falcon', 'Parrot', 'Parrot'], name="Max Speed") +>>> ser +Falcon 390.0 +Falcon 350.0 +Parrot 30.0 +Parrot 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(["a", "b", "a", "b"]).mean() +a 210.0 +b 185.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(ser > 100).mean() +Max Speed +False 25.0 +True 370.0 +Name: Max Speed, dtype: float64 + +**Grouping by Indexes** + +We can groupby different levels of a hierarchical index +using the `level` parameter: + +>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], +... ['Captive', 'Wild', 'Captive', 'Wild']] +>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) +>>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed") +>>> ser +Animal Type +Falcon Captive 390.0 + Wild 350.0 +Parrot Captive 30.0 + Wild 20.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level=0).mean() +Animal +Falcon 370.0 +Parrot 25.0 +Name: Max Speed, dtype: float64 +>>> ser.groupby(level="Type").mean() +Type +Captive 210.0 +Wild 185.0 +Name: Max Speed, dtype: float64 +""" + ) + @Appender(generic._shared_docs["groupby"] % _shared_doc_kwargs) + def groupby( + self, + by=None, + axis=0, + level=None, + as_index: bool = True, + sort: bool = True, + group_keys: bool = True, + squeeze: bool = False, + observed: bool = False, + ) -> "SeriesGroupBy": + from pandas.core.groupby.generic import SeriesGroupBy + + if level is None and by is None: + raise TypeError("You have to supply one of 'by' and 'level'") + axis = self._get_axis_number(axis) + + return SeriesGroupBy( + obj=self, + keys=by, + axis=axis, + level=level, + as_index=as_index, + sort=sort, + group_keys=group_keys, + squeeze=squeeze, + observed=observed, + ) + # ---------------------------------------------------------------------- # Statistics, overridden ndarray methods @@ -1609,7 +1658,7 @@ def count(self, level=None): out = np.bincount(obs, minlength=len(lev) or None) return self._constructor(out, index=lev, dtype="int64").__finalize__(self) - def mode(self, dropna=True): + def mode(self, dropna=True) -> "Series": """ Return the mode(s) of the dataset. @@ -1694,7 +1743,7 @@ def unique(self): result = super().unique() return result - def drop_duplicates(self, keep="first", inplace=False): + def drop_duplicates(self, keep="first", inplace=False) -> "Series": """ Return Series with duplicate values removed. @@ -1771,7 +1820,7 @@ def drop_duplicates(self, keep="first", inplace=False): """ return super().drop_duplicates(keep=keep, inplace=inplace) - def duplicated(self, keep="first"): + def duplicated(self, keep="first") -> "Series": """ Indicate duplicate Series values. 
@@ -1990,7 +2039,7 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): return np.nan return self.index[i] - def round(self, decimals=0, *args, **kwargs): + def round(self, decimals=0, *args, **kwargs) -> "Series": """ Round each value in a Series to the given number of decimals. @@ -2085,7 +2134,7 @@ def quantile(self, q=0.5, interpolation="linear"): # scalar return result.iloc[0] - def corr(self, other, method="pearson", min_periods=None): + def corr(self, other, method="pearson", min_periods=None) -> float: """ Compute correlation with `other` Series, excluding missing values. @@ -2138,7 +2187,7 @@ def corr(self, other, method="pearson", min_periods=None): f"'{method}' was supplied" ) - def cov(self, other, min_periods=None): + def cov(self, other, min_periods=None) -> float: """ Compute covariance with Series, excluding missing values. @@ -2167,7 +2216,7 @@ def cov(self, other, min_periods=None): return np.nan return nanops.nancov(this.values, other.values, min_periods=min_periods) - def diff(self, periods=1): + def diff(self, periods=1) -> "Series": """ First discrete difference of element. @@ -2231,7 +2280,7 @@ def diff(self, periods=1): result = algorithms.diff(com.values_from_object(self), periods) return self._constructor(result, index=self.index).__finalize__(self) - def autocorr(self, lag=1): + def autocorr(self, lag=1) -> float: """ Compute the lag-N autocorrelation. @@ -2374,7 +2423,7 @@ def searchsorted(self, value, side="left", sorter=None): # ------------------------------------------------------------------- # Combination - def append(self, to_append, ignore_index=False, verify_integrity=False): + def append(self, to_append, ignore_index=False, verify_integrity=False) -> "Series": """ Concatenate two or more Series. @@ -2451,8 +2500,10 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): to_concat.extend(to_append) else: to_concat = [self, to_append] - return concat( - to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity + return self._ensure_type( + concat( + to_concat, ignore_index=ignore_index, verify_integrity=verify_integrity + ) ) def _binop(self, other, func, level=None, fill_value=None): @@ -2494,7 +2545,7 @@ def _binop(self, other, func, level=None, fill_value=None): ret = ops._construct_result(self, result, new_index, name) return ret - def combine(self, other, func, fill_value=None): + def combine(self, other, func, fill_value=None) -> "Series": """ Combine the Series with a Series or scalar according to `func`. @@ -2591,7 +2642,7 @@ def combine(self, other, func, fill_value=None): new_values = try_cast_to_ea(self._values, new_values) return self._constructor(new_values, index=new_index, name=new_name) - def combine_first(self, other): + def combine_first(self, other) -> "Series": """ Combine Series values, choosing the calling Series's values first. @@ -2631,7 +2682,7 @@ def combine_first(self, other): return this.where(notna(this), other) - def update(self, other): + def update(self, other) -> None: """ Modify Series in place using non-NA values from passed Series. Aligns on index. @@ -2690,9 +2741,10 @@ def sort_values( self, axis=0, ascending=True, - inplace=False, - kind="quicksort", - na_position="last", + inplace: bool = False, + kind: str = "quicksort", + na_position: str = "last", + ignore_index: bool = False, ): """ Sort by the values. @@ -2715,6 +2767,10 @@ def sort_values( na_position : {'first' or 'last'}, default 'last' Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at the end. 
+ ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 Returns ------- @@ -2820,7 +2876,7 @@ def _try_kind_sort(arr): return arr.argsort(kind="quicksort") arr = self._values - sortedIdx = np.empty(len(self), dtype=np.int32) + sorted_index = np.empty(len(self), dtype=np.int32) bad = isna(arr) @@ -2844,16 +2900,19 @@ def _try_kind_sort(arr): if na_position == "last": n = good.sum() - sortedIdx[:n] = idx[good][argsorted] - sortedIdx[n:] = idx[bad] + sorted_index[:n] = idx[good][argsorted] + sorted_index[n:] = idx[bad] elif na_position == "first": n = bad.sum() - sortedIdx[n:] = idx[good][argsorted] - sortedIdx[:n] = idx[bad] + sorted_index[n:] = idx[good][argsorted] + sorted_index[:n] = idx[bad] else: raise ValueError(f"invalid na_position: {na_position}") - result = self._constructor(arr[sortedIdx], index=self.index[sortedIdx]) + result = self._constructor(arr[sorted_index], index=self.index[sorted_index]) + + if ignore_index: + result.index = ibase.default_index(len(sorted_index)) if inplace: self._update_inplace(result) @@ -2869,6 +2928,7 @@ def sort_index( kind="quicksort", na_position="last", sort_remaining=True, + ignore_index: bool = False, ): """ Sort Series by index labels. @@ -2897,6 +2957,10 @@ def sort_index( sort_remaining : bool, default True If True and sorting by level and index is multilevel, sort by other levels too (in order) after sorting by specified level. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.0.0 Returns ------- @@ -3024,19 +3088,22 @@ def sort_index( new_values = self._values.take(indexer) result = self._constructor(new_values, index=new_index) + if ignore_index: + result.index = ibase.default_index(len(result)) + if inplace: self._update_inplace(result) else: return result.__finalize__(self) - def argsort(self, axis=0, kind="quicksort", order=None): + def argsort(self, axis=0, kind="quicksort", order=None) -> "Series": """ Override ndarray.argsort. Argsorts the value, omitting NA/null values, and places the result in the same locations as the non-NA values. Parameters ---------- - axis : int + axis : {0 or "index"} Has no effect but is accepted for compatibility with numpy. kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' Choice of sorting algorithm. See np.sort for more @@ -3067,7 +3134,7 @@ def argsort(self, axis=0, kind="quicksort", order=None): np.argsort(values, kind=kind), index=self.index, dtype="int64" ).__finalize__(self) - def nlargest(self, n=5, keep="first"): + def nlargest(self, n=5, keep="first") -> "Series": """ Return the largest `n` elements. @@ -3165,7 +3232,7 @@ def nlargest(self, n=5, keep="first"): """ return algorithms.SelectNSeries(self, n=n, keep=keep).nlargest() - def nsmallest(self, n=5, keep="first"): + def nsmallest(self, n=5, keep="first") -> "Series": """ Return the smallest `n` elements. @@ -3262,7 +3329,7 @@ def nsmallest(self, n=5, keep="first"): """ return algorithms.SelectNSeries(self, n=n, keep=keep).nsmallest() - def swaplevel(self, i=-2, j=-1, copy=True): + def swaplevel(self, i=-2, j=-1, copy=True) -> "Series": """ Swap levels i and j in a :class:`MultiIndex`. @@ -3285,7 +3352,7 @@ def swaplevel(self, i=-2, j=-1, copy=True): self ) - def reorder_levels(self, order): + def reorder_levels(self, order) -> "Series": """ Rearrange index levels using input order. 
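The new ignore_index option threaded through sort_values and sort_index above behaves like this; the Series is illustrative:

import pandas as pd

s = pd.Series([30, 10, 20], index=["c", "a", "b"])

# Default behaviour keeps the original labels on the sorted result.
print(s.sort_values())

# With ignore_index=True the result is relabelled 0, 1, ..., n - 1.
print(s.sort_values(ignore_index=True))
print(s.sort_index(ignore_index=True))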
@@ -3409,7 +3476,7 @@ def unstack(self, level=-1, fill_value=None): # ---------------------------------------------------------------------- # function application - def map(self, arg, na_action=None): + def map(self, arg, na_action=None) -> "Series": """ Map values of Series according to input correspondence. @@ -3419,7 +3486,7 @@ def map(self, arg, na_action=None): Parameters ---------- - arg : function, dict, or Series + arg : function, collections.abc.Mapping subclass or Series Mapping correspondence. na_action : {None, 'ignore'}, default None If 'ignore', propagate NaN values, without passing them to the @@ -3487,7 +3554,7 @@ def map(self, arg, na_action=None): new_values = super()._map_values(arg, na_action=na_action) return self._constructor(new_values, index=self.index).__finalize__(self) - def _gotitem(self, key, ndim, subset=None): + def _gotitem(self, key, ndim, subset=None) -> "Series": """ Sub-classes to define. Return a sliced object. @@ -3805,7 +3872,16 @@ def align( broadcast_axis=broadcast_axis, ) - def rename(self, index=None, **kwargs): + def rename( + self, + index=None, + *, + axis=None, + copy=True, + inplace=False, + level=None, + errors="ignore", + ): """ Alter Series index labels or name. @@ -3819,6 +3895,8 @@ def rename(self, index=None, **kwargs): Parameters ---------- + axis : {0 or "index"} + Unused. Accepted for compatability with DataFrame method only. index : scalar, hashable sequence, dict-like or function, optional Functions or dict-like are transformations to apply to the index. @@ -3836,6 +3914,7 @@ def rename(self, index=None, **kwargs): See Also -------- + DataFrame.rename : Corresponding DataFrame method. Series.rename_axis : Set the name of the axis. Examples @@ -3862,12 +3941,38 @@ def rename(self, index=None, **kwargs): 5 3 dtype: int64 """ - kwargs["inplace"] = validate_bool_kwarg(kwargs.get("inplace", False), "inplace") - if callable(index) or is_dict_like(index): - return super().rename(index=index, **kwargs) + return super().rename( + index, copy=copy, inplace=inplace, level=level, errors=errors + ) else: - return self._set_name(index, inplace=kwargs.get("inplace")) + return self._set_name(index, inplace=inplace) + + @Appender( + """ + >>> s = pd.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + + >>> s.set_axis(['a', 'b', 'c'], axis=0) + a 1 + b 2 + c 3 + dtype: int64 + """ + ) + @Substitution( + **_shared_doc_kwargs, + extended_summary_sub="", + axis_description_sub="", + see_also_sub="", + ) + @Appender(generic.NDFrame.set_axis.__doc__) + def set_axis(self, labels, axis=0, inplace=False): + return super().set_axis(labels, axis=axis, inplace=inplace) @Substitution(**_shared_doc_kwargs) @Appender(generic.NDFrame.reindex.__doc__) @@ -3883,7 +3988,7 @@ def drop( level=None, inplace=False, errors="raise", - ): + ) -> "Series": """ Return Series with specified index labels removed. 
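A short sketch of the two call patterns the reworked, keyword-only Series.rename above distinguishes; the values are illustrative:

import pandas as pd

s = pd.Series([1, 2, 3])

# A scalar or other hashable sets the Series name.
print(s.rename("my_name").name)

# A function or dict-like is applied to the index labels instead.
print(s.rename(lambda x: x ** 2).index.tolist())  # [0, 1, 4]

# The remaining options must now be passed by keyword, e.g. inplace=True.
s.rename({1: "one"}, inplace=True)
print(s.index.tolist())  # [0, 'one', 2]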
@@ -3994,8 +4099,7 @@ def fillna( inplace=False, limit=None, downcast=None, - **kwargs, - ): + ) -> Optional["Series"]: return super().fillna( value=value, method=method, @@ -4003,7 +4107,6 @@ def fillna( inplace=inplace, limit=limit, downcast=downcast, - **kwargs, ) @Appender(generic._shared_docs["replace"] % _shared_doc_kwargs) @@ -4026,7 +4129,7 @@ def replace( ) @Appender(generic._shared_docs["shift"] % _shared_doc_kwargs) - def shift(self, periods=1, freq=None, axis=0, fill_value=None): + def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": return super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value ) @@ -4085,7 +4188,7 @@ def memory_usage(self, index=True, deep=False): v += self.index.memory_usage(deep=deep) return v - def isin(self, values): + def isin(self, values) -> "Series": """ Check whether `values` are contained in Series. @@ -4141,7 +4244,7 @@ def isin(self, values): result = algorithms.isin(self, values) return self._constructor(result, index=self.index).__finalize__(self) - def between(self, left, right, inclusive=True): + def between(self, left, right, inclusive=True) -> "Series": """ Return boolean Series equivalent to left <= series <= right. @@ -4217,19 +4320,19 @@ def between(self, left, right, inclusive=True): return lmask & rmask @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) - def isna(self): + def isna(self) -> "Series": return super().isna() @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) - def isnull(self): + def isnull(self) -> "Series": return super().isnull() @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) - def notna(self): + def notna(self) -> "Series": return super().notna() @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) - def notnull(self): + def notnull(self) -> "Series": return super().notnull() def dropna(self, axis=0, inplace=False, how=None): @@ -4323,7 +4426,7 @@ def dropna(self, axis=0, inplace=False, how=None): # ---------------------------------------------------------------------- # Time series-oriented methods - def to_timestamp(self, freq=None, how="start", copy=True): + def to_timestamp(self, freq=None, how="start", copy=True) -> "Series": """ Cast to DatetimeIndex of Timestamps, at *beginning* of period. @@ -4348,7 +4451,7 @@ def to_timestamp(self, freq=None, how="start", copy=True): new_index = self.index.to_timestamp(freq=freq, how=how) return self._constructor(new_values, index=new_index).__finalize__(self) - def to_period(self, freq=None, copy=True): + def to_period(self, freq=None, copy=True) -> "Series": """ Convert Series from DatetimeIndex to PeriodIndex with desired frequency (inferred from index if not passed). @@ -4386,11 +4489,8 @@ def to_period(self, freq=None, copy=True): hist = pandas.plotting.hist_series -Series._setup_axes( - ["index"], docs={"index": "The index (axis labels) of the Series."}, -) +Series._setup_axes(["index"], docs={"index": "The index (axis labels) of the Series."}) Series._add_numeric_operations() -Series._add_series_only_operations() Series._add_series_or_dataframe_operations() # Add arithmetic! 
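With **kwargs dropped from the Series.fillna signature above, unexpected keywords now fail at the call site; `bogus_option` below is a made-up name used only to illustrate that:

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 3.0])

# The documented arguments are unchanged.
print(s.fillna(0.0))

# A stray keyword is now rejected by Python itself as a TypeError, where the
# old signature would have accepted it into **kwargs.
try:
    s.fillna(0.0, bogus_option=True)
except TypeError as err:
    print(err)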
diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 02f4eb47ba914..4bcf2943e3d6e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -8,6 +8,7 @@ import numpy as np import pandas._libs.lib as lib +import pandas._libs.missing as libmissing import pandas._libs.ops as libops from pandas._typing import ArrayLike, Dtype from pandas.util._decorators import Appender @@ -118,12 +119,15 @@ def cat_safe(list_of_columns: List, sep: str): return result -def _na_map(f, arr, na_result=np.nan, dtype=object): - # should really _check_ for NA +def _na_map(f, arr, na_result=None, dtype=object): if is_extension_array_dtype(arr.dtype): + if na_result is None: + na_result = libmissing.NA # just StringDtype arr = extract_array(arr) return _map_stringarray(f, arr, na_value=na_result, dtype=dtype) + if na_result is None: + na_result = np.nan return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype) @@ -438,8 +442,8 @@ def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): if regex.groups > 0: warnings.warn( - "This pattern has match groups. To actually get the" - " groups, use str.extract.", + "This pattern has match groups. To actually get the " + "groups, use str.extract.", UserWarning, stacklevel=3, ) @@ -880,11 +884,12 @@ def _str_extract_noexpand(arr, pat, flags=0): if arr.empty: result = DataFrame(columns=columns, dtype=object) else: + dtype = _result_dtype(arr) result = DataFrame( [groups_or_na(val) for val in arr], columns=columns, index=arr.index, - dtype=object, + dtype=dtype, ) return result, name diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index f193865d90b71..3a9d0623ff4a6 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -12,7 +12,6 @@ DateParseError, _format_is_iso, _guess_datetime_format, - parse_time_string, ) from pandas._libs.tslibs.strptime import array_strptime from pandas._typing import ArrayLike @@ -38,8 +37,10 @@ ) from pandas.core.dtypes.missing import notna +from pandas.arrays import DatetimeArray, IntegerArray from pandas.core import algorithms from pandas.core.algorithms import unique +from pandas.core.arrays.datetimes import tz_to_dtype # --------------------------------------------------------------------- # types used in annotations @@ -230,9 +231,7 @@ def _return_parsed_timezone_results(result, timezones, tz, name): """ if tz is not None: raise ValueError( - "Cannot pass a tz argument when " - "parsing strings with timezone " - "information." + "Cannot pass a tz argument when parsing strings with timezone information." 
) tz_results = np.array( [Timestamp(res).tz_localize(zone) for res, zone in zip(result, timezones)] @@ -284,7 +283,6 @@ def _convert_listlike_datetimes( Index-like of parsed dates """ from pandas import DatetimeIndex - from pandas.core.arrays import DatetimeArray from pandas.core.arrays.datetimes import ( maybe_convert_dtype, objects_to_datetime64ns, @@ -316,8 +314,21 @@ def _convert_listlike_datetimes( elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") - arg = getattr(arg, "values", arg) - result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) + arg = getattr(arg, "_values", arg) + + # GH 30050 pass an ndarray to tslib.array_with_unit_to_datetime + # because it expects an ndarray argument + if isinstance(arg, IntegerArray): + # Explicitly pass NaT mask to array_with_unit_to_datetime + mask = arg.isna() + arg = arg._ndarray_values + else: + mask = None + + result, tz_parsed = tslib.array_with_unit_to_datetime( + arg, mask, unit, errors=errors + ) + if errors == "ignore": from pandas import Index @@ -416,7 +427,8 @@ def _convert_listlike_datetimes( # datetime objects are found without passing `utc=True` try: values, tz = conversion.datetime_to_datetime64(arg) - return DatetimeIndex._simple_new(values, name=name, tz=tz) + dta = DatetimeArray(values, dtype=tz_to_dtype(tz)) + return DatetimeIndex._simple_new(dta, name=name) except (ValueError, TypeError): raise e @@ -436,7 +448,8 @@ def _convert_listlike_datetimes( if tz_parsed is not None: # We can take a shortcut since the datetime64 numpy array # is in UTC - return DatetimeIndex._simple_new(result, name=name, tz=tz_parsed) + dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) + return DatetimeIndex._simple_new(dta, name=name) utc = tz == "utc" return _box_as_indexlike(result, utc=utc, name=name) @@ -631,7 +644,7 @@ def to_datetime( dtype: datetime64[ns] If a date does not meet the `timestamp limitations - `_, passing errors='ignore' will return the original input instead of raising any exception. 
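The IntegerArray branch added to _convert_listlike_datetimes above is aimed at nullable integer input combined with a unit; a rough sketch under that assumption, with illustrative epoch values:

import pandas as pd

ser = pd.Series([1_577_836_800, None, 1_577_923_200], dtype="Int64")

# The NA mask is passed through to the low-level converter, so the missing
# entry should come back as NaT rather than raising.
print(pd.to_datetime(ser, unit="s"))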
@@ -803,8 +816,7 @@ def f(value): required = ",".join(req) raise ValueError( "to assemble mappings requires at least that " - f"[year, month, day] be specified: [{required}] " - "is missing" + f"[year, month, day] be specified: [{required}] is missing" ) # keys we don't recognize diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index fa3582755a202..3366f10b92604 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -2,6 +2,7 @@ data hash pandas / numpy objects """ import itertools +from typing import Optional import numpy as np @@ -58,7 +59,7 @@ def hash_pandas_object( obj, index: bool = True, encoding: str = "utf8", - hash_key: str = _default_hash_key, + hash_key: Optional[str] = _default_hash_key, categorize: bool = True, ): """ @@ -82,14 +83,18 @@ def hash_pandas_object( """ from pandas import Series + if hash_key is None: + hash_key = _default_hash_key + if isinstance(obj, ABCMultiIndex): return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False) - if isinstance(obj, ABCIndexClass): + elif isinstance(obj, ABCIndexClass): h = hash_array(obj.values, encoding, hash_key, categorize).astype( "uint64", copy=False ) h = Series(h, index=obj, dtype="uint64", copy=False) + elif isinstance(obj, ABCSeries): h = hash_array(obj.values, encoding, hash_key, categorize).astype( "uint64", copy=False diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index c7d856e9a1e88..ed0b816f64800 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -70,6 +70,7 @@ def _apply( floor: int = 1, is_weighted: bool = False, name: Optional[str] = None, + use_numba_cache: bool = False, **kwargs, ): """ @@ -97,14 +98,13 @@ def _flex_binary_moment(arg1, arg2, f, pairwise=False): and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame)) ): raise TypeError( - "arguments to moment function must be of type " - "np.ndarray/Series/DataFrame" + "arguments to moment function must be of type np.ndarray/Series/DataFrame" ) if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( arg2, (np.ndarray, ABCSeries) ): - X, Y = _prep_binary(arg1, arg2) + X, Y = prep_binary(arg1, arg2) return f(X, Y) elif isinstance(arg1, ABCDataFrame): @@ -151,7 +151,7 @@ def dataframe_from_int_dict(data, frame_template): results[i][j] = results[j][i] else: results[i][j] = f( - *_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) + *prep_binary(arg1.iloc[:, i], arg2.iloc[:, j]) ) from pandas import concat @@ -212,7 +212,7 @@ def dataframe_from_int_dict(data, frame_template): raise ValueError("'pairwise' is not True/False") else: results = { - i: f(*_prep_binary(arg1.iloc[:, i], arg2)) + i: f(*prep_binary(arg1.iloc[:, i], arg2)) for i, col in enumerate(arg1.columns) } return dataframe_from_int_dict(results, arg1) @@ -249,31 +249,10 @@ def _get_center_of_mass(comass, span, halflife, alpha): return float(comass) -def _offset(window, center): +def calculate_center_offset(window): if not is_integer(window): window = len(window) - offset = (window - 1) / 2.0 if center else 0 - try: - return int(offset) - except TypeError: - return offset.astype(int) - - -def _require_min_periods(p): - def _check_func(minp, window): - if minp is None: - return window - else: - return max(p, minp) - - return _check_func - - -def _use_window(minp, window): - if minp is None: - return window - else: - return minp + return int((window - 1) / 2.0) def calculate_min_periods( @@ -311,7 +290,7 @@ def calculate_min_periods( return max(min_periods, floor) -def _zsqrt(x): +def 
zsqrt(x): with np.errstate(all="ignore"): result = np.sqrt(x) mask = x < 0 @@ -326,7 +305,7 @@ def _zsqrt(x): return result -def _prep_binary(arg1, arg2): +def prep_binary(arg1, arg2): if not isinstance(arg2, type(arg1)): raise Exception("Input arrays must be of the same type!") @@ -335,3 +314,12 @@ def _prep_binary(arg1, arg2): Y = arg2 + 0 * arg1 return X, Y + + +def get_weighted_roll_func(cfunc: Callable) -> Callable: + def func(arg, window, min_periods=None): + if min_periods is None: + min_periods = len(window) + return cfunc(arg, window, min_periods) + + return func diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index baecba7e78384..37e3cd42f2115 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -9,8 +9,13 @@ from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.base import DataError -from pandas.core.window.common import _doc_template, _get_center_of_mass, _shared_docs -from pandas.core.window.rolling import _flex_binary_moment, _Rolling, _zsqrt +from pandas.core.window.common import ( + _doc_template, + _get_center_of_mass, + _shared_docs, + zsqrt, +) +from pandas.core.window.rolling import _flex_binary_moment, _Rolling _bias_template = """ Parameters @@ -89,7 +94,7 @@ class EWM(_Rolling): (if adjust is True), and 1-alpha and alpha (if adjust is False). More details can be found at - http://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows + https://pandas.pydata.org/pandas-docs/stable/user_guide/computation.html#exponentially-weighted-windows Examples -------- @@ -269,7 +274,7 @@ def std(self, bias=False, *args, **kwargs): Exponential weighted moving stddev. """ nv.validate_window_func("std", args, kwargs) - return _zsqrt(self.var(bias=bias, **kwargs)) + return zsqrt(self.var(bias=bias, **kwargs)) vol = std @@ -314,7 +319,7 @@ def cov(self, other=None, pairwise=None, bias=False, **kwargs): inputs. In the case of missing elements, only complete pairwise observations will be used. bias : bool, default False - Use a standard estimation bias correction + Use a standard estimation bias correction. **kwargs Keyword arguments to be passed into func. 
""" @@ -390,7 +395,7 @@ def _cov(x, y): cov = _cov(x_values, y_values) x_var = _cov(x_values, x_values) y_var = _cov(y_values, y_values) - corr = cov / _zsqrt(x_var * y_var) + corr = cov / zsqrt(x_var * y_var) return X._wrap_result(corr) return _flex_binary_moment( diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 68c3514308cbc..a0bf3376d2352 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -1,4 +1,5 @@ from textwrap import dedent +from typing import Dict, Optional from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution @@ -148,8 +149,23 @@ def count(self, **kwargs): @Substitution(name="expanding") @Appender(_shared_docs["apply"]) - def apply(self, func, raw=False, args=(), kwargs={}): - return super().apply(func, raw=raw, args=args, kwargs=kwargs) + def apply( + self, + func, + raw: bool = False, + engine: str = "cython", + engine_kwargs: Optional[Dict[str, bool]] = None, + args=None, + kwargs=None, + ): + return super().apply( + func, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) @Substitution(name="expanding") @Appender(_shared_docs["sum"]) diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 0fa24a0ba1b5a..921cdb3c2523f 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -32,7 +32,7 @@ class BaseIndexer: - """Base class for window bounds calculations""" + """Base class for window bounds calculations.""" def __init__( self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs, diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py new file mode 100644 index 0000000000000..127957943d2ff --- /dev/null +++ b/pandas/core/window/numba_.py @@ -0,0 +1,127 @@ +import types +from typing import Any, Callable, Dict, Optional, Tuple + +import numpy as np + +from pandas._typing import Scalar +from pandas.compat._optional import import_optional_dependency + + +def make_rolling_apply( + func: Callable[..., Scalar], + args: Tuple, + nogil: bool, + parallel: bool, + nopython: bool, +): + """ + Creates a JITted rolling apply function with a JITted version of + the user's function. 
+ + Parameters + ---------- + func : function + function to be applied to each window and will be JITed + args : tuple + *args to be passed into the function + nogil : bool + nogil parameter from engine_kwargs for numba.jit + parallel : bool + parallel parameter from engine_kwargs for numba.jit + nopython : bool + nopython parameter from engine_kwargs for numba.jit + + Returns + ------- + Numba function + """ + numba = import_optional_dependency("numba") + + if parallel: + loop_range = numba.prange + else: + loop_range = range + + if isinstance(func, numba.targets.registry.CPUDispatcher): + # Don't jit a user passed jitted function + numba_func = func + else: + + @numba.generated_jit(nopython=nopython, nogil=nogil, parallel=parallel) + def numba_func(window, *_args): + if getattr(np, func.__name__, False) is func or isinstance( + func, types.BuiltinFunctionType + ): + jf = func + else: + jf = numba.jit(func, nopython=nopython, nogil=nogil) + + def impl(window, *_args): + return jf(window, *_args) + + return impl + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def roll_apply( + values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int, + ) -> np.ndarray: + result = np.empty(len(begin)) + for i in loop_range(len(result)): + start = begin[i] + stop = end[i] + window = values[start:stop] + count_nan = np.sum(np.isnan(window)) + if len(window) - count_nan >= minimum_periods: + result[i] = numba_func(window, *args) + else: + result[i] = np.nan + return result + + return roll_apply + + +def generate_numba_apply_func( + args: Tuple, + kwargs: Dict[str, Any], + func: Callable[..., Scalar], + engine_kwargs: Optional[Dict[str, bool]], +): + """ + Generate a numba jitted apply function specified by values from engine_kwargs. + + 1. jit the user's function + 2. Return a rolling apply function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the rolling apply function. 
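A usage sketch of the new engine="numba" path that generate_numba_apply_func backs, assuming numba is installed; the engine_kwargs shown are the defaults documented in this patch, and raw=True is required for the numba engine:

import numpy as np
import pandas as pd

def window_mean(values):
    # receives each window as an ndarray because raw=True
    return np.mean(values)

s = pd.Series(np.arange(10, dtype="float64"))
result = s.rolling(3).apply(
    window_mean,
    raw=True,
    engine="numba",
    engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
)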
+ + Parameters + ---------- + args : tuple + *args to be passed into the function + kwargs : dict + **kwargs to be passed into the function + func : function + function to be applied to each window and will be JITed + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit + + Returns + ------- + Numba function + """ + + if engine_kwargs is None: + engine_kwargs = {} + + nopython = engine_kwargs.get("nopython", True) + nogil = engine_kwargs.get("nogil", False) + parallel = engine_kwargs.get("parallel", False) + + if kwargs and nopython: + raise ValueError( + "numba does not support kwargs with nopython=True: " + "https://github.com/numba/numba/issues/2916" + ) + + return make_rolling_apply(func, args, nogil, parallel, nopython) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 07b484321a665..f7efa69778c44 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -24,7 +24,6 @@ is_integer_dtype, is_list_like, is_scalar, - is_timedelta64_dtype, needs_i8_conversion, ) from pandas.core.dtypes.generic import ( @@ -43,17 +42,18 @@ WindowGroupByMixin, _doc_template, _flex_binary_moment, - _offset, _shared_docs, - _use_window, - _zsqrt, + calculate_center_offset, calculate_min_periods, + get_weighted_roll_func, + zsqrt, ) from pandas.core.window.indexers import ( BaseIndexer, FixedWindowIndexer, VariableWindowIndexer, ) +from pandas.core.window.numba_ import generate_numba_apply_func class _Window(PandasObject, ShallowMixin, SelectionMixin): @@ -92,6 +92,7 @@ def __init__( self.win_freq = None self.axis = obj._get_axis_number(axis) if axis is not None else None self.validate() + self._numba_func_cache: Dict[Optional[str], Callable] = dict() @property def _constructor(self): @@ -182,7 +183,7 @@ def _gotitem(self, key, ndim, subset=None): self._selection = key return self - def __getattr__(self, attr): + def __getattr__(self, attr: str): if attr in self._internal_names_set: return object.__getattribute__(self, attr) if attr in self.obj: @@ -250,19 +251,6 @@ def __iter__(self): url = "https://github.com/pandas-dev/pandas/issues/11704" raise NotImplementedError(f"See issue #11704 {url}") - def _get_index(self) -> Optional[np.ndarray]: - """ - Return integer representations as an ndarray if index is frequency. - - Returns - ------- - None or ndarray - """ - - if self.is_freq_type: - return self._on.asi8 - return None - def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray: """Convert input to numpy arrays for Cython routines""" if values is None: @@ -303,17 +291,6 @@ def _wrap_result(self, result, block=None, obj=None): if isinstance(result, np.ndarray): - # coerce if necessary - if block is not None: - if is_timedelta64_dtype(block.values.dtype): - # TODO: do we know what result.dtype is at this point? - # i.e. can we just do an astype? - from pandas import to_timedelta - - result = to_timedelta(result.ravel(), unit="ns").values.reshape( - result.shape - ) - if result.ndim == 1: from pandas import Series @@ -382,14 +359,11 @@ def _center_window(self, result, window) -> np.ndarray: if self.axis > result.ndim - 1: raise ValueError("Requested axis is larger then no. 
of argument dimensions") - offset = _offset(window, True) + offset = calculate_center_offset(window) if offset > 0: - if isinstance(result, (ABCSeries, ABCDataFrame)): - result = result.slice_shift(-offset, axis=self.axis) - else: - lead_indexer = [slice(None)] * result.ndim - lead_indexer[self.axis] = slice(offset, None) - result = np.copy(result[tuple(lead_indexer)]) + lead_indexer = [slice(None)] * result.ndim + lead_indexer[self.axis] = slice(offset, None) + result = np.copy(result[tuple(lead_indexer)]) return result def _get_roll_func(self, func_name: str) -> Callable: @@ -422,17 +396,15 @@ def _get_cython_func_type(self, func: str) -> Callable: return self._get_roll_func(f"{func}_variable") return partial(self._get_roll_func(f"{func}_fixed"), win=self._get_window()) - def _get_window_indexer( - self, index_as_array: Optional[np.ndarray], window: int - ) -> BaseIndexer: + def _get_window_indexer(self, window: int) -> BaseIndexer: """ Return an indexer class that will compute the window start and end bounds """ if isinstance(self.window, BaseIndexer): return self.window if self.is_freq_type: - return VariableWindowIndexer(index_array=index_as_array, window_size=window) - return FixedWindowIndexer(index_array=index_as_array, window_size=window) + return VariableWindowIndexer(index_array=self._on.asi8, window_size=window) + return FixedWindowIndexer(window_size=window) def _apply( self, @@ -442,6 +414,7 @@ def _apply( floor: int = 1, is_weighted: bool = False, name: Optional[str] = None, + use_numba_cache: bool = False, **kwargs, ): """ @@ -454,10 +427,13 @@ def _apply( func : callable function to apply center : bool require_min_periods : int - floor: int - is_weighted - name: str, + floor : int + is_weighted : bool + name : str, compatibility with groupby.rolling + use_numba_cache : bool + whether to cache a numba compiled function. Only available for numba + enabled methods (so far only apply) **kwargs additional arguments for rolling function and window function @@ -470,8 +446,7 @@ def _apply( blocks, obj = self._create_blocks() block_list = list(blocks) - index_as_array = self._get_index() - window_indexer = self._get_window_indexer(index_as_array, window) + window_indexer = self._get_window_indexer(window) results = [] exclude: List[Scalar] = [] @@ -492,7 +467,7 @@ def _apply( continue # calculation function - offset = _offset(window, center) if center else 0 + offset = calculate_center_offset(window) if center else 0 additional_nans = np.array([np.nan] * offset) if not is_weighted: @@ -532,6 +507,9 @@ def calc(x): result = calc(values) result = np.asarray(result) + if use_numba_cache: + self._numba_func_cache[name] = func + if center: result = self._center_window(result, window) @@ -847,7 +825,7 @@ class Window(_Window): changed to the center of the window by setting ``center=True``. To learn more about the offsets & frequency strings, please see `this link - `__. + `__. The recognized win_types are: @@ -894,6 +872,17 @@ class Window(_Window): 3 NaN 4 NaN + Rolling sum with a window length of 2, using the 'gaussian' + window type (note how we need to specify std). + + >>> df.rolling(2, win_type='gaussian').sum(std=3) + B + 0 NaN + 1 0.986207 + 2 2.958621 + 3 NaN + 4 NaN + Rolling sum with a window length of 2, min_periods defaults to the window length. @@ -1042,15 +1031,6 @@ def _get_window( # GH #15662. `False` makes symmetric window, rather than periodic. 
return sig.get_window(win_type, window, False).astype(float) - def _get_weighted_roll_func( - self, cfunc: Callable, check_minp: Callable, **kwargs - ) -> Callable: - def func(arg, window, min_periods=None, closed=None): - minp = check_minp(min_periods, len(window)) - return cfunc(arg, window, minp, **kwargs) - - return func - _agg_see_also_doc = dedent( """ See Also @@ -1118,7 +1098,7 @@ def aggregate(self, func, *args, **kwargs): def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) window_func = self._get_roll_func("roll_weighted_sum") - window_func = self._get_weighted_roll_func(window_func, _use_window) + window_func = get_weighted_roll_func(window_func) return self._apply( window_func, center=self.center, is_weighted=True, name="sum", **kwargs ) @@ -1128,7 +1108,7 @@ def sum(self, *args, **kwargs): def mean(self, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) window_func = self._get_roll_func("roll_weighted_mean") - window_func = self._get_weighted_roll_func(window_func, _use_window) + window_func = get_weighted_roll_func(window_func) return self._apply( window_func, center=self.center, is_weighted=True, name="mean", **kwargs ) @@ -1138,7 +1118,7 @@ def mean(self, *args, **kwargs): def var(self, ddof=1, *args, **kwargs): nv.validate_window_func("var", args, kwargs) window_func = partial(self._get_roll_func("roll_weighted_var"), ddof=ddof) - window_func = self._get_weighted_roll_func(window_func, _use_window) + window_func = get_weighted_roll_func(window_func) kwargs.pop("name", None) return self._apply( window_func, center=self.center, is_weighted=True, name="var", **kwargs @@ -1148,7 +1128,7 @@ def var(self, ddof=1, *args, **kwargs): @Appender(_shared_docs["std"]) def std(self, ddof=1, *args, **kwargs): nv.validate_window_func("std", args, kwargs) - return _zsqrt(self.var(ddof=ddof, name="std", **kwargs)) + return zsqrt(self.var(ddof=ddof, name="std", **kwargs)) class _Rolling(_Window): @@ -1202,8 +1182,6 @@ class _Rolling_and_Expanding(_Rolling): def count(self): blocks, obj = self._create_blocks() - # Validate the index - self._get_index() window = self._get_window() window = min(window, len(obj)) if not self.center else window @@ -1225,13 +1203,17 @@ def count(self): _shared_docs["apply"] = dedent( r""" - The %(name)s function's apply function. + Apply an arbitrary function to each %(name)s window. Parameters ---------- func : function Must produce a single value from an ndarray input if ``raw=True`` - or a single value from a Series if ``raw=False``. + or a single value from a Series if ``raw=False``. Can also accept a + Numba JIT function with ``engine='numba'`` specified. + + .. versionchanged:: 1.0.0 + raw : bool, default None * ``False`` : passes each row or column as a Series to the function. @@ -1239,9 +1221,27 @@ def count(self): objects instead. If you are just applying a NumPy reduction function this will achieve much better performance. - - *args, **kwargs - Arguments and keyword arguments to be passed into func. + engine : str, default 'cython' + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + + .. versionadded:: 1.0.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. 
The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to both the ``func`` and the ``apply`` rolling aggregation. + + .. versionadded:: 1.0.0 + + args : tuple, default None + Positional arguments to be passed into func. + kwargs : dict, default None + Keyword arguments to be passed into func. Returns ------- @@ -1252,19 +1252,66 @@ def count(self): -------- Series.%(name)s : Series %(name)s. DataFrame.%(name)s : DataFrame %(name)s. + + Notes + ----- + See :ref:`stats.rolling_apply` for extended documentation and performance + considerations for the Numba engine. """ ) - def apply(self, func, raw=False, args=(), kwargs={}): - from pandas import Series - + def apply( + self, + func, + raw: bool = False, + engine: str = "cython", + engine_kwargs: Optional[Dict] = None, + args: Optional[Tuple] = None, + kwargs: Optional[Dict] = None, + ): + if args is None: + args = () + if kwargs is None: + kwargs = {} kwargs.pop("_level", None) kwargs.pop("floor", None) window = self._get_window() - offset = _offset(window, self.center) + offset = calculate_center_offset(window) if self.center else 0 if not is_bool(raw): raise ValueError("raw parameter must be `True` or `False`") + if engine == "cython": + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + apply_func = self._generate_cython_apply_func( + args, kwargs, raw, offset, func + ) + elif engine == "numba": + if raw is False: + raise ValueError("raw must be `True` when using the numba engine") + if func in self._numba_func_cache: + # Return an already compiled version of roll_apply if available + apply_func = self._numba_func_cache[func] + else: + apply_func = generate_numba_apply_func( + args, kwargs, func, engine_kwargs + ) + else: + raise ValueError("engine must be either 'numba' or 'cython'") + + # TODO: Why do we always pass center=False? + # name=func for WindowGroupByMixin._apply + return self._apply( + apply_func, + center=False, + floor=0, + name=func, + use_numba_cache=engine == "numba", + ) + + def _generate_cython_apply_func(self, args, kwargs, raw, offset, func): + from pandas import Series + window_func = partial( self._get_cython_func_type("roll_generic"), args=args, @@ -1279,9 +1326,7 @@ def apply_func(values, begin, end, min_periods, raw=raw): values = Series(values, index=self.obj.index) return window_func(values, begin, end, min_periods) - # TODO: Why do we always pass center=False? 
- # name=func for WindowGroupByMixin._apply - return self._apply(apply_func, center=False, floor=0, name=func) + return apply_func def sum(self, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) @@ -1402,7 +1447,7 @@ def std(self, ddof=1, *args, **kwargs): window_func = self._get_cython_func_type("roll_var") def zsqrt_func(values, begin, end, min_periods): - return _zsqrt(window_func(values, begin, end, min_periods, ddof=ddof)) + return zsqrt(window_func(values, begin, end, min_periods, ddof=ddof)) # ddof passed again for compat with groupby.rolling return self._apply( @@ -1775,8 +1820,7 @@ def _on(self) -> Index: else: raise ValueError( f"invalid on specified as {self.on}, " - "must be a column (of DataFrame), an Index " - "or None" + "must be a column (of DataFrame), an Index or None" ) def validate(self): @@ -1793,9 +1837,8 @@ def validate(self): # we don't allow center if self.center: raise NotImplementedError( - "center is not implemented " - "for datetimelike and offset " - "based windows" + "center is not implemented for " + "datetimelike and offset based windows" ) # this will raise ValueError on non-fixed freqs @@ -1841,8 +1884,7 @@ def _validate_freq(self): except (TypeError, ValueError): raise ValueError( f"passed window {self.window} is not " - "compatible with a datetimelike " - "index" + "compatible with a datetimelike index" ) _agg_see_also_doc = dedent( @@ -1927,8 +1969,23 @@ def count(self): @Substitution(name="rolling") @Appender(_shared_docs["apply"]) - def apply(self, func, raw=False, args=(), kwargs={}): - return super().apply(func, raw=raw, args=args, kwargs=kwargs) + def apply( + self, + func, + raw=False, + engine="cython", + engine_kwargs=None, + args=None, + kwargs=None, + ): + return super().apply( + func, + raw=raw, + engine=engine, + engine_kwargs=engine_kwargs, + args=args, + kwargs=kwargs, + ) @Substitution(name="rolling") @Appender(_shared_docs["sum"]) diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 518b940ec5da3..97178261bdf72 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -69,8 +69,7 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover kwargs["engine"] = "python" elif len(sep) > 1 and kwargs.get("engine") == "c": warnings.warn( - "read_clipboard with regex separator does not work" - " properly with c engine" + "read_clipboard with regex separator does not work properly with c engine" ) return read_csv(StringIO(text), sep=sep, **kwargs) diff --git a/pandas/io/common.py b/pandas/io/common.py index e165f8baef3e6..cf19169214c35 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1,25 +1,13 @@ """Common IO api utilities""" import bz2 -import codecs -from collections.abc import Iterator +from collections import abc import gzip from io import BufferedIOBase, BytesIO import mmap import os import pathlib -from typing import ( - IO, - Any, - AnyStr, - BinaryIO, - Dict, - List, - Mapping, - Optional, - Tuple, - Union, -) +from typing import IO, Any, AnyStr, Dict, List, Mapping, Optional, Tuple, Union from urllib.parse import ( # noqa urlencode, urljoin, @@ -90,8 +78,7 @@ def _expand_user( def validate_header_arg(header) -> None: if isinstance(header, bool): raise TypeError( - "Passing a bool to header is invalid. " - "Use header=None for no header or " + "Passing a bool to header is invalid. 
Use header=None for no header or " "header=int or list-like of ints to specify " "the row(s) making up the column names" ) @@ -419,8 +406,8 @@ def get_handle( raise ValueError(f"Zero files found in ZIP file {path_or_buf}") else: raise ValueError( - "Multiple files found in ZIP file." - f" Only one file per ZIP: {zip_names}" + "Multiple files found in ZIP file. " + f"Only one file per ZIP: {zip_names}" ) # XZ Compression @@ -503,7 +490,7 @@ def closed(self): return self.fp is None -class _MMapWrapper(Iterator): +class _MMapWrapper(abc.Iterator): """ Wrapper for the Python's mmap class so that it can be properly read in by Python's csv.reader class. @@ -538,24 +525,3 @@ def __next__(self) -> str: if newline == "": raise StopIteration return newline - - -class UTF8Recoder(Iterator): - """ - Iterator that reads an encoded stream and re-encodes the input to UTF-8 - """ - - def __init__(self, f: BinaryIO, encoding: str): - self.reader = codecs.getreader(encoding)(f) - - def read(self, bytes: int = -1) -> bytes: - return self.reader.read(bytes).encode("utf-8") - - def readline(self) -> bytes: - return self.reader.readline().encode("utf-8") - - def __next__(self) -> bytes: - return next(self.reader).encode("utf-8") - - def close(self): - self.reader.close() diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index 7fdca2d65b05d..07919dbda63ae 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -57,8 +57,7 @@ def _check_columns(cols): for i, n in enumerate(map(len, tail)): if n != N: raise AssertionError( - f"All columns must have the same length: {N}; " - f"column {i} has length {n}" + f"All columns must have the same length: {N}; column {i} has length {n}" ) return N diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index cdc7715b68f22..bf2b55ad04f82 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -41,7 +41,7 @@ Parameters ---------- -io : str, ExcelFile, xlrd.Book, path object or file-like object +io : str, bytes, ExcelFile, xlrd.Book, path object, or file-like object Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.xlsx``. @@ -298,9 +298,7 @@ def read_excel( for arg in ("sheet", "sheetname", "parse_cols"): if arg in kwds: - raise TypeError( - "read_excel() got an unexpected keyword argument `{}`".format(arg) - ) + raise TypeError(f"read_excel() got an unexpected keyword argument `{arg}`") if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine) @@ -353,6 +351,8 @@ def __init__(self, filepath_or_buffer): self.book = self.load_workbook(filepath_or_buffer) elif isinstance(filepath_or_buffer, str): self.book = self.load_workbook(filepath_or_buffer) + elif isinstance(filepath_or_buffer, bytes): + self.book = self.load_workbook(BytesIO(filepath_or_buffer)) else: raise ValueError( "Must explicitly set engine if not passing in buffer or path for io." @@ -430,7 +430,7 @@ def parse( for asheetname in sheets: if verbose: - print("Reading sheet {sheet}".format(sheet=asheetname)) + print(f"Reading sheet {asheetname}") if isinstance(asheetname, str): sheet = self.get_sheet_by_name(asheetname) @@ -529,8 +529,10 @@ def parse( class ExcelWriter(metaclass=abc.ABCMeta): """ - Class for writing DataFrame objects into excel sheets, default is to use - xlwt for xls, openpyxl for xlsx. See DataFrame.to_excel for typical usage. 
+ Class for writing DataFrame objects into excel sheets. + + Default is to use xlwt for xls, openpyxl for xlsx. + See DataFrame.to_excel for typical usage. Parameters ---------- @@ -544,7 +546,7 @@ class ExcelWriter(metaclass=abc.ABCMeta): Format string for dates written into Excel files (e.g. 'YYYY-MM-DD'). datetime_format : str, default None Format string for datetime objects written into Excel files. - (e.g. 'YYYY-MM-DD HH:MM:SS') + (e.g. 'YYYY-MM-DD HH:MM:SS'). mode : {'w', 'a'}, default 'w' File mode to use (write or append). @@ -623,11 +625,11 @@ def __new__(cls, path, engine=None, **kwargs): ext = "xlsx" try: - engine = config.get_option("io.excel.{ext}.writer".format(ext=ext)) + engine = config.get_option(f"io.excel.{ext}.writer") if engine == "auto": engine = _get_default_writer(ext) except KeyError: - raise ValueError("No engine for filetype: '{ext}'".format(ext=ext)) + raise ValueError(f"No engine for filetype: '{ext}'") cls = get_writer(engine) return object.__new__(cls) @@ -758,9 +760,8 @@ def check_extension(cls, ext): if ext.startswith("."): ext = ext[1:] if not any(ext in extension for extension in cls.supported_extensions): - msg = "Invalid extension for engine '{engine}': '{ext}'".format( - engine=pprint_thing(cls.engine), ext=pprint_thing(ext) - ) + msg = "Invalid extension for engine" + f"'{pprint_thing(cls.engine)}': '{pprint_thing(ext)}'" raise ValueError(msg) else: return True @@ -808,7 +809,7 @@ def __init__(self, io, engine=None): FutureWarning, ) if engine not in self._engines: - raise ValueError("Unknown engine: {engine}".format(engine=engine)) + raise ValueError(f"Unknown engine: {engine}") self.engine = engine # could be a str, ExcelFile, Book, etc. diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 6b9943136664a..ec5f6fcb17ff8 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -156,7 +156,7 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: # GH5394 cell_value = float(cell.attributes.get((OFFICENS, "value"))) - if cell_value == 0.0 and str(cell) != cell_value: # NA handling + if cell_value == 0.0: # NA handling return str(cell) if convert_float: @@ -178,4 +178,4 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: elif cell_type == "time": return pd.to_datetime(str(cell)).time() else: - raise ValueError("Unrecognized type {}".format(cell_type)) + raise ValueError(f"Unrecognized type {cell_type}") diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 7a264ed2b0850..be52523e486af 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -99,7 +99,7 @@ def _convert_to_style_kwargs(cls, style_dict): for k, v in style_dict.items(): if k in _style_key_map: k = _style_key_map[k] - _conv_to_x = getattr(cls, "_convert_to_{k}".format(k=k), lambda x: None) + _conv_to_x = getattr(cls, f"_convert_to_{k}", lambda x: None) new_v = _conv_to_x(v) if new_v: style_kwargs[k] = new_v diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index ee617d2013136..9d284c8031840 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -48,7 +48,7 @@ def get_writer(engine_name): try: return _writers[engine_name] except KeyError: - raise ValueError("No Excel writer '{engine}'".format(engine=engine_name)) + raise ValueError(f"No Excel writer '{engine_name}'") def _excel2num(x): @@ -76,7 +76,7 @@ def _excel2num(x): cp = ord(c) if cp < ord("A") or cp > ord("Z"): - raise ValueError("Invalid column name: {x}".format(x=x)) + raise 
ValueError(f"Invalid column name: {x}") index = index * 26 + cp - ord("A") + 1 @@ -136,8 +136,7 @@ def _maybe_convert_usecols(usecols): if is_integer(usecols): raise ValueError( "Passing an integer for `usecols` is no longer supported. " - "Please pass in a list of int from 0 to `usecols` " - "inclusive instead." + "Please pass in a list of int from 0 to `usecols` inclusive instead." ) if isinstance(usecols, str): @@ -154,8 +153,8 @@ def _validate_freeze_panes(freeze_panes): return True raise ValueError( - "freeze_panes must be of form (row, column)" - " where row and column are integers" + "freeze_panes must be of form (row, column) " + "where row and column are integers" ) # freeze_panes wasn't specified, return False so it won't be applied diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 996ae1caa14c8..d102a885cef0a 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -97,20 +97,20 @@ def _style_to_xlwt( if hasattr(item, "items"): if firstlevel: it = [ - "{key}: {val}".format(key=key, val=cls._style_to_xlwt(value, False)) + f"{key}: {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = "{sep} ".format(sep=(line_sep).join(it)) + out = f"{(line_sep).join(it)} " return out else: it = [ - "{key} {val}".format(key=key, val=cls._style_to_xlwt(value, False)) + f"{key} {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = "{sep} ".format(sep=(field_sep).join(it)) + out = f"{(field_sep).join(it)} " return out else: - item = "{item}".format(item=item) + item = f"{item}" item = item.replace("True", "on") item = item.replace("False", "off") return item diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index eb05004d9137c..5d4925620e75f 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -37,16 +37,13 @@ def to_feather(df: DataFrame, path): typ = type(df.index) raise ValueError( f"feather does not support serializing {typ} " - "for the index; you can .reset_index() " - "to make the index into column(s)" + "for the index; you can .reset_index() to make the index into column(s)" ) if not df.index.equals(RangeIndex.from_range(range(len(df)))): raise ValueError( - "feather does not support serializing a " - "non-default index for the index; you " - "can .reset_index() to make the index " - "into column(s)" + "feather does not support serializing a non-default index for the index; " + "you can .reset_index() to make the index into column(s)" ) if df.index.name is not None: diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index 583dd49d4c66a..b40d2a57b8106 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -1,4 +1,5 @@ -"""Utilities for interpreting CSS from Stylers for formatting non-HTML outputs +""" +Utilities for interpreting CSS from Stylers for formatting non-HTML outputs. """ import re @@ -6,13 +7,15 @@ class CSSWarning(UserWarning): - """This CSS syntax cannot currently be parsed""" + """ + This CSS syntax cannot currently be parsed. + """ pass def _side_expander(prop_fmt: str): - def expand(self, prop, value): + def expand(self, prop, value: str): tokens = value.split() try: mapping = self.SIDE_SHORTHANDS[len(tokens)] @@ -28,12 +31,13 @@ def expand(self, prop, value): class CSSResolver: - """A callable for parsing and resolving CSS to atomic properties - + """ + A callable for parsing and resolving CSS to atomic properties. 
""" def __call__(self, declarations_str, inherited=None): - """ the given declarations to atomic properties + """ + The given declarations to atomic properties. Parameters ---------- @@ -46,8 +50,8 @@ def __call__(self, declarations_str, inherited=None): Returns ------- - props : dict - Atomic CSS 2.2 properties + dict + Atomic CSS 2.2 properties. Examples -------- @@ -69,7 +73,6 @@ def __call__(self, declarations_str, inherited=None): ('font-size', '24pt'), ('font-weight', 'bold')] """ - props = dict(self.atomize(self.parse(declarations_str))) if inherited is None: inherited = {} @@ -235,10 +238,15 @@ def atomize(self, declarations): expand_margin = _side_expander("margin-{:s}") expand_padding = _side_expander("padding-{:s}") - def parse(self, declarations_str): - """Generates (prop, value) pairs from declarations + def parse(self, declarations_str: str): + """ + Generates (prop, value) pairs from declarations. In a future version may generate parsed tokens from tinycss/tinycss2 + + Parameters + ---------- + declarations_str : str """ for decl in declarations_str.split(";"): if not decl.strip(): diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 72ba1a892cb8f..0d581f30e50e7 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,13 +5,14 @@ import csv as csvlib from io import StringIO import os -from typing import List +from typing import Hashable, List, Mapping, Optional, Sequence, Union import warnings from zipfile import ZipFile import numpy as np from pandas._libs import writers as libwriters +from pandas._typing import FilePathOrBuffer from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -33,27 +34,26 @@ class CSVFormatter: def __init__( self, obj, - path_or_buf=None, - sep=",", - na_rep="", - float_format=None, + path_or_buf: Optional[FilePathOrBuffer[str]] = None, + sep: str = ",", + na_rep: str = "", + float_format: Optional[str] = None, cols=None, - header=True, - index=True, - index_label=None, - mode="w", - encoding=None, - compression="infer", - quoting=None, + header: Union[bool, Sequence[Hashable]] = True, + index: bool = True, + index_label: Optional[Union[bool, Hashable, Sequence[Hashable]]] = None, + mode: str = "w", + encoding: Optional[str] = None, + compression: Union[str, Mapping[str, str], None] = "infer", + quoting: Optional[int] = None, line_terminator="\n", - chunksize=None, + chunksize: Optional[int] = None, quotechar='"', - date_format=None, - doublequote=True, - escapechar=None, + date_format: Optional[str] = None, + doublequote: bool = True, + escapechar: Optional[str] = None, decimal=".", ): - self.obj = obj if path_or_buf is None: @@ -154,14 +154,17 @@ def __init__( if not index: self.nlevels = 0 - def save(self): + def save(self) -> None: """ - Create the writer & save + Create the writer & save. """ # GH21227 internal compression is not used when file-like passed. if self.compression and hasattr(self.path_or_buf, "write"): - msg = "compression has no effect when passing file-like object as input." - warnings.warn(msg, RuntimeWarning, stacklevel=2) + warnings.warn( + "compression has no effect when passing file-like object as input.", + RuntimeWarning, + stacklevel=2, + ) # when zip compression is called. 
is_zip = isinstance(self.path_or_buf, ZipFile) or ( @@ -223,7 +226,6 @@ def save(self): _fh.close() def _save_header(self): - writer = self.writer obj = self.obj index_label = self.index_label @@ -303,8 +305,7 @@ def _save_header(self): encoded_labels.extend([""] * len(columns)) writer.writerow(encoded_labels) - def _save(self): - + def _save(self) -> None: self._save_header() nrows = len(self.data_index) @@ -321,8 +322,7 @@ def _save(self): self._save_chunk(start_i, end_i) - def _save_chunk(self, start_i: int, end_i: int): - + def _save_chunk(self, start_i: int, end_i: int) -> None: data_index = self.data_index # create the data for a chunk diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 18340bc702378..b0e8e4033edf2 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -1,13 +1,17 @@ -"""Utilities for conversion to writer-agnostic Excel representation +""" +Utilities for conversion to writer-agnostic Excel representation. """ from functools import reduce import itertools import re +from typing import Callable, Dict, Optional, Sequence, Union import warnings import numpy as np +from pandas._typing import Label + from pandas.core.dtypes import missing from pandas.core.dtypes.common import is_float, is_scalar from pandas.core.dtypes.generic import ABCMultiIndex, ABCPeriodIndex @@ -25,7 +29,9 @@ class ExcelCell: __fields__ = ("row", "col", "val", "style", "mergestart", "mergeend") __slots__ = __fields__ - def __init__(self, row, col, val, style=None, mergestart=None, mergeend=None): + def __init__( + self, row: int, col: int, val, style=None, mergestart=None, mergeend=None + ): self.row = row self.col = col self.val = val @@ -56,7 +62,7 @@ class CSSToExcelConverter: # instancemethods so that users can easily experiment with extensions # without monkey-patching. - def __init__(self, inherited=None): + def __init__(self, inherited: Optional[str] = None): if inherited is not None: inherited = self.compute_css(inherited) @@ -64,7 +70,7 @@ def __init__(self, inherited=None): compute_css = CSSResolver() - def __call__(self, declarations_str: str): + def __call__(self, declarations_str: str) -> Dict[str, Dict[str, str]]: """ Convert CSS declarations to ExcelWriter style. 
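Similarly, CSSToExcelConverter (also private, in pandas.io.formats.excel) turns a CSS declaration string into the nested style dict that ExcelWriter backends expect; a rough sketch:

from pandas.io.formats.excel import CSSToExcelConverter

convert = CSSToExcelConverter()
# roughly {'font': {'bold': True}, 'alignment': {'horizontal': 'center'}} once None entries are pruned
style = convert("font-weight: bold; text-align: center")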
@@ -84,7 +90,7 @@ def __call__(self, declarations_str: str): properties = self.compute_css(declarations_str, self.inherited) return self.build_xlstyle(properties) - def build_xlstyle(self, props): + def build_xlstyle(self, props: Dict[str, str]) -> Dict[str, Dict[str, str]]: out = { "alignment": self.build_alignment(props), "border": self.build_border(props), @@ -95,7 +101,7 @@ def build_xlstyle(self, props): # TODO: handle cell width and height: needs support in pandas.io.excel - def remove_none(d): + def remove_none(d: Dict[str, str]) -> None: """Remove key where value is None, through nested dicts""" for k, v in list(d.items()): if v is None: @@ -118,7 +124,7 @@ def remove_none(d): # OpenXML also has 'justify', 'distributed' } - def build_alignment(self, props): + def build_alignment(self, props) -> Dict[str, Optional[Union[bool, str]]]: # TODO: text-indent, padding-left -> alignment.indent return { "horizontal": props.get("text-align"), @@ -130,7 +136,7 @@ def build_alignment(self, props): ), } - def build_border(self, props): + def build_border(self, props: Dict) -> Dict[str, Dict[str, str]]: return { side: { "style": self._border_style( @@ -142,7 +148,7 @@ def build_border(self, props): for side in ["top", "right", "bottom", "left"] } - def _border_style(self, style, width): + def _border_style(self, style: Optional[str], width): # convert styles and widths to openxml, one of: # 'dashDot' # 'dashDotDot' @@ -191,7 +197,7 @@ def _border_style(self, style, width): return "dashed" return "mediumDashed" - def build_fill(self, props): + def build_fill(self, props: Dict[str, str]): # TODO: perhaps allow for special properties # -excel-pattern-bgcolor and -excel-pattern-type fill_color = props.get("background-color") @@ -215,7 +221,7 @@ def build_fill(self, props): } ITALIC_MAP = {"normal": False, "italic": True, "oblique": True} - def build_font(self, props): + def build_font(self, props) -> Dict[str, Optional[Union[bool, int, str]]]: size = props.get("font-size") if size is not None: assert size.endswith("pt") @@ -311,7 +317,7 @@ def build_font(self, props): "white": "FFFFFF", } - def color_to_excel(self, val): + def color_to_excel(self, val: Optional[str]): if val is None: return None if val.startswith("#") and len(val) == 7: @@ -323,7 +329,7 @@ def color_to_excel(self, val): except KeyError: warnings.warn(f"Unhandled color format: {repr(val)}", CSSWarning) - def build_number_format(self, props): + def build_number_format(self, props: Dict) -> Dict[str, Optional[str]]: return {"format_code": props.get("number-format")} @@ -366,15 +372,15 @@ class ExcelFormatter: def __init__( self, df, - na_rep="", - float_format=None, - cols=None, - header=True, - index=True, - index_label=None, - merge_cells=False, - inf_rep="inf", - style_converter=None, + na_rep: str = "", + float_format: Optional[str] = None, + cols: Optional[Sequence[Label]] = None, + header: Union[Sequence[Label], bool] = True, + index: bool = True, + index_label: Optional[Union[Label, Sequence[Label]]] = None, + merge_cells: bool = False, + inf_rep: str = "inf", + style_converter: Optional[Callable] = None, ): self.rowcounter = 0 self.na_rep = na_rep @@ -442,10 +448,8 @@ def _format_header_mi(self): if self.columns.nlevels > 1: if not self.index: raise NotImplementedError( - "Writing to Excel with MultiIndex" - " columns and no index " - "('index'=False) is not yet " - "implemented." + "Writing to Excel with MultiIndex columns and no " + "index ('index'=False) is not yet implemented." 
) has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index)) @@ -540,7 +544,6 @@ def _format_header(self): return itertools.chain(gen, gen2) def _format_body(self): - if isinstance(self.df.index, ABCMultiIndex): return self._format_hierarchical_rows() else: @@ -716,8 +719,7 @@ def write( num_rows, num_cols = self.df.shape if num_rows > self.max_rows or num_cols > self.max_cols: raise ValueError( - "This sheet is too large! Your sheet size is: " - f"{num_rows}, {num_cols} " + f"This sheet is too large! Your sheet size is: {num_rows}, {num_cols} " f"Max sheet size is: {self.max_rows}, {self.max_cols}" ) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 1b18e0fc3f0fa..296b305f41dd2 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -231,7 +231,7 @@ def __init__( self, series: "Series", buf: Optional[IO[str]] = None, - length: bool = True, + length: Union[bool, str] = True, header: bool = True, index: bool = True, na_rep: str = "NaN", @@ -281,7 +281,9 @@ def _chk_truncate(self) -> None: series = series.iloc[:max_rows] else: row_num = max_rows // 2 - series = concat((series.iloc[:row_num], series.iloc[-row_num:])) + series = series._ensure_type( + concat((series.iloc[:row_num], series.iloc[-row_num:])) + ) self.tr_row_num = row_num else: self.tr_row_num = None @@ -450,7 +452,7 @@ def _get_adjustment() -> TextAdjustment: class TableFormatter: - show_dimensions: bool + show_dimensions: Union[bool, str] is_truncated: bool formatters: formatters_type columns: Index @@ -554,7 +556,7 @@ def __init__( max_rows: Optional[int] = None, min_rows: Optional[int] = None, max_cols: Optional[int] = None, - show_dimensions: bool = False, + show_dimensions: Union[bool, str] = False, decimal: str = ".", table_id: Optional[str] = None, render_links: bool = False, @@ -577,8 +579,8 @@ def __init__( else: raise ValueError( ( - "Formatters length({flen}) should match" - " DataFrame number of columns({dlen})" + "Formatters length({flen}) should match " + "DataFrame number of columns({dlen})" ).format(flen=len(formatters), dlen=len(frame.columns)) ) self.na_rep = na_rep @@ -735,12 +737,8 @@ def _to_str_columns(self) -> List[List[str]]: self.header = cast(List[str], self.header) if len(self.header) != len(self.columns): raise ValueError( - ( - "Writing {ncols} cols but got {nalias} " - "aliases".format( - ncols=len(self.columns), nalias=len(self.header) - ) - ) + f"Writing {len(self.columns)} cols " + f"but got {len(self.header)} aliases" ) str_columns = [[label] for label in self.header] else: @@ -1228,7 +1226,7 @@ def _format(x): if x is None: return "None" elif x is NA: - return "NA" + return str(NA) elif x is NaT or np.isnat(x): return "NaT" except (TypeError, ValueError): @@ -1276,7 +1274,7 @@ class FloatArrayFormatter(GenericArrayFormatter): """ def __init__(self, *args, **kwargs): - GenericArrayFormatter.__init__(self, *args, **kwargs) + super().__init__(*args, **kwargs) # float_format is expected to be a string # formatter should be used to pass a function diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index b88478b3da181..e3161415fe2bc 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -2,12 +2,13 @@ Module for formatting output data in HTML. 
""" -from collections import OrderedDict from textwrap import dedent from typing import IO, Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union, cast from pandas._config import get_option +from pandas._libs import lib + from pandas.core.dtypes.generic import ABCMultiIndex from pandas import option_context @@ -138,10 +139,9 @@ def _write_cell( else: start_tag = "<{kind}>".format(kind=kind) - esc: Union[OrderedDict[str, str], Dict] if self.escape: # escape & first to prevent double escaping of & - esc = OrderedDict([("&", r"&"), ("<", r"<"), (">", r">")]) + esc = {"&": r"&", "<": r"<", ">": r">"} else: esc = {} @@ -216,8 +216,8 @@ def _write_table(self, indent: int = 0) -> None: self.classes = self.classes.split() if not isinstance(self.classes, (list, tuple)): raise TypeError( - "classes must be a string, list, or tuple, " - "not {typ}".format(typ=type(self.classes)) + "classes must be a string, list, " + f"or tuple, not {type(self.classes)}" ) _classes.extend(self.classes) @@ -247,7 +247,7 @@ def _write_col_header(self, indent: int) -> None: if self.fmt.sparsify: # GH3547 - sentinel = object() + sentinel = lib.no_default else: sentinel = False levels = self.columns.format(sparsify=sentinel, adjoin=False, names=False) @@ -453,7 +453,7 @@ def _write_hierarchical_rows( if self.fmt.sparsify: # GH3547 - sentinel = object() + sentinel = lib.no_default levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False) level_lengths = get_level_lengths(levels, sentinel) diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 008a99427f3c7..8ab56437d5c05 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -114,8 +114,7 @@ def pad_empties(x): column_format = index_format + column_format elif not isinstance(self.column_format, str): # pragma: no cover raise AssertionError( - "column_format must be str or unicode, " - "not {typ}".format(typ=type(column_format)) + f"column_format must be str or unicode, not {type(column_format)}" ) else: column_format = self.column_format diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index d3a12ccb77048..565752e269d79 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1,6 +1,5 @@ """ -Module for applying conditional formatting to -DataFrames and Series. +Module for applying conditional formatting to DataFrames and Series. 
""" from collections import defaultdict @@ -8,13 +7,25 @@ import copy from functools import partial from itertools import product -from typing import Any, Callable, DefaultDict, Dict, List, Optional, Sequence, Tuple +from typing import ( + Any, + Callable, + DefaultDict, + Dict, + List, + Optional, + Sequence, + Tuple, + Union, +) from uuid import uuid1 import numpy as np from pandas._config import get_option +from pandas._libs import lib +from pandas._typing import Axis, FrameOrSeries, FrameOrSeriesUnion, Label from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import Appender @@ -23,6 +34,7 @@ import pandas as pd from pandas.api.types import is_dict_like, is_list_like import pandas.core.common as com +from pandas.core.frame import DataFrame from pandas.core.generic import _shared_docs from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice @@ -40,7 +52,7 @@ @contextmanager -def _mpl(func): +def _mpl(func: Callable): if has_mpl: yield plt, colors else: @@ -124,13 +136,13 @@ class Styler: def __init__( self, - data, - precision=None, - table_styles=None, - uuid=None, - caption=None, - table_attributes=None, - cell_ids=True, + data: FrameOrSeriesUnion, + precision: Optional[int] = None, + table_styles: Optional[List[Dict[str, List[Tuple[str, str]]]]] = None, + uuid: Optional[str] = None, + caption: Optional[str] = None, + table_attributes: Optional[str] = None, + cell_ids: bool = True, na_rep: Optional[str] = None, ): self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list) @@ -174,7 +186,7 @@ def default_display_func(x): Tuple[int, int], Callable[[Any], str] ] = defaultdict(lambda: default_display_func) - def _repr_html_(self): + def _repr_html_(self) -> str: """ Hooks into Jupyter notebook rich display system. 
""" @@ -195,22 +207,22 @@ def _repr_html_(self): def to_excel( self, excel_writer, - sheet_name="Sheet1", - na_rep="", - float_format=None, - columns=None, - header=True, - index=True, - index_label=None, - startrow=0, - startcol=0, - engine=None, - merge_cells=True, - encoding=None, - inf_rep="inf", - verbose=True, - freeze_panes=None, - ): + sheet_name: str = "Sheet1", + na_rep: str = "", + float_format: Optional[str] = None, + columns: Optional[Sequence[Label]] = None, + header: Union[Sequence[Label], bool] = True, + index: bool = True, + index_label: Optional[Union[Label, Sequence[Label]]] = None, + startrow: int = 0, + startcol: int = 0, + engine: Optional[str] = None, + merge_cells: bool = True, + encoding: Optional[str] = None, + inf_rep: str = "inf", + verbose: bool = True, + freeze_panes: Optional[Tuple[int, int]] = None, + ) -> None: from pandas.io.formats.excel import ExcelFormatter @@ -255,7 +267,7 @@ def _translate(self): BLANK_VALUE = "" def format_attr(pair): - return "{key}={value}".format(**pair) + return f"{pair['key']}={pair['value']}" # for sparsifying a MultiIndex idx_lengths = _get_level_lengths(self.index) @@ -274,7 +286,7 @@ def format_attr(pair): clabels = [[x] for x in clabels] clabels = list(zip(*clabels)) - cellstyle = [] + cellstyle_map = defaultdict(list) head = [] for r in range(n_clvls): @@ -396,12 +408,17 @@ def format_attr(pair): for x in ctx[r, c]: # have to handle empty styles like [''] if x.count(":"): - props.append(x.split(":")) + props.append(tuple(x.split(":"))) else: - props.append(["", ""]) - cellstyle.append({"props": props, "selector": f"row{r}_col{c}"}) + props.append(("", "")) + cellstyle_map[tuple(props)].append(f"row{r}_col{c}") body.append(row_es) + cellstyle = [ + {"props": list(props), "selectors": selectors} + for props, selectors in cellstyle_map.items() + ] + table_attr = self.table_attributes use_mathjax = get_option("display.html.use_mathjax") if not use_mathjax: @@ -422,7 +439,7 @@ def format_attr(pair): table_attributes=table_attr, ) - def format(self, formatter, subset=None, na_rep: Optional[str] = None): + def format(self, formatter, subset=None, na_rep: Optional[str] = None) -> "Styler": """ Format the text display value of cells. @@ -495,7 +512,7 @@ def format(self, formatter, subset=None, na_rep: Optional[str] = None): self._display_funcs[(i, j)] = formatter return self - def render(self, **kwargs): + def render(self, **kwargs) -> str: """ Render the built up styles to HTML. @@ -544,16 +561,18 @@ def render(self, **kwargs): d.update(kwargs) return self.template.render(**d) - def _update_ctx(self, attrs): + def _update_ctx(self, attrs: DataFrame) -> None: """ Update the state of the Styler. Collects a mapping of {index_label: [': ']}. - attrs : Series or DataFrame - should contain strings of ': ;: ' - Whitespace shouldn't matter and the final trailing ';' shouldn't - matter. + Parameters + ---------- + attrs : DataFrame + should contain strings of ': ;: ' + Whitespace shouldn't matter and the final trailing ';' shouldn't + matter. 
""" for row_label, v in attrs.iterrows(): for col_label, col in v.items(): @@ -562,7 +581,7 @@ def _update_ctx(self, attrs): for pair in col.rstrip(";").split(";"): self.ctx[(i, j)].append(pair) - def _copy(self, deepcopy=False): + def _copy(self, deepcopy: bool = False) -> "Styler": styler = Styler( self.data, precision=self.precision, @@ -579,16 +598,16 @@ def _copy(self, deepcopy=False): styler._todo = self._todo return styler - def __copy__(self): + def __copy__(self) -> "Styler": """ Deep copy by default. """ return self._copy(deepcopy=False) - def __deepcopy__(self, memo): + def __deepcopy__(self, memo) -> "Styler": return self._copy(deepcopy=True) - def clear(self): + def clear(self) -> None: """ Reset the styler, removing any previously applied styles. @@ -611,7 +630,13 @@ def _compute(self): r = func(self)(*args, **kwargs) return r - def _apply(self, func, axis=0, subset=None, **kwargs): + def _apply( + self, + func: Callable[..., "Styler"], + axis: Optional[Axis] = 0, + subset=None, + **kwargs, + ) -> "Styler": subset = slice(None) if subset is None else subset subset = _non_reducing_slice(subset) data = self.data.loc[subset] @@ -644,7 +669,13 @@ def _apply(self, func, axis=0, subset=None, **kwargs): self._update_ctx(result) return self - def apply(self, func, axis=0, subset=None, **kwargs): + def apply( + self, + func: Callable[..., "Styler"], + axis: Optional[Axis] = 0, + subset=None, + **kwargs, + ) -> "Styler": """ Apply a function column-wise, row-wise, or table-wise. @@ -695,7 +726,7 @@ def apply(self, func, axis=0, subset=None, **kwargs): ) return self - def _applymap(self, func, subset=None, **kwargs): + def _applymap(self, func: Callable, subset=None, **kwargs) -> "Styler": func = partial(func, **kwargs) # applymap doesn't take kwargs? if subset is None: subset = pd.IndexSlice[:] @@ -704,7 +735,7 @@ def _applymap(self, func, subset=None, **kwargs): self._update_ctx(result) return self - def applymap(self, func, subset=None, **kwargs): + def applymap(self, func: Callable, subset=None, **kwargs) -> "Styler": """ Apply a function elementwise. @@ -733,7 +764,14 @@ def applymap(self, func, subset=None, **kwargs): ) return self - def where(self, cond, value, other=None, subset=None, **kwargs): + def where( + self, + cond: Callable, + value: str, + other: Optional[str] = None, + subset=None, + **kwargs, + ) -> "Styler": """ Apply a function elementwise. @@ -772,7 +810,7 @@ def where(self, cond, value, other=None, subset=None, **kwargs): lambda val: value if cond(val) else other, subset=subset, **kwargs ) - def set_precision(self, precision): + def set_precision(self, precision: int) -> "Styler": """ Set the precision used to render. @@ -787,7 +825,7 @@ def set_precision(self, precision): self.precision = precision return self - def set_table_attributes(self, attributes): + def set_table_attributes(self, attributes: str) -> "Styler": """ Set the table attributes. @@ -811,7 +849,7 @@ def set_table_attributes(self, attributes): self.table_attributes = attributes return self - def export(self): + def export(self) -> List[Tuple[Callable, Tuple, Dict]]: """ Export the styles to applied to the current Styler. @@ -827,7 +865,7 @@ def export(self): """ return self._todo - def use(self, styles): + def use(self, styles: List[Tuple[Callable, Tuple, Dict]]) -> "Styler": """ Set the styles on the current Styler. 
@@ -849,7 +887,7 @@ def use(self, styles): self._todo.extend(styles) return self - def set_uuid(self, uuid): + def set_uuid(self, uuid: str) -> "Styler": """ Set the uuid for a Styler. @@ -864,7 +902,7 @@ def set_uuid(self, uuid): self.uuid = uuid return self - def set_caption(self, caption): + def set_caption(self, caption: str) -> "Styler": """ Set the caption on a Styler. @@ -879,7 +917,7 @@ def set_caption(self, caption): self.caption = caption return self - def set_table_styles(self, table_styles): + def set_table_styles(self, table_styles) -> "Styler": """ Set the table styles on a Styler. @@ -926,7 +964,7 @@ def set_na_rep(self, na_rep: str) -> "Styler": self.na_rep = na_rep return self - def hide_index(self): + def hide_index(self) -> "Styler": """ Hide any indices from rendering. @@ -939,7 +977,7 @@ def hide_index(self): self.hidden_index = True return self - def hide_columns(self, subset): + def hide_columns(self, subset) -> "Styler": """ Hide columns from rendering. @@ -965,10 +1003,10 @@ def hide_columns(self, subset): # ----------------------------------------------------------------------- @staticmethod - def _highlight_null(v, null_color): + def _highlight_null(v, null_color: str) -> str: return f"background-color: {null_color}" if pd.isna(v) else "" - def highlight_null(self, null_color="red"): + def highlight_null(self, null_color: str = "red") -> "Styler": """ Shade the background ``null_color`` for missing values. @@ -986,14 +1024,14 @@ def highlight_null(self, null_color="red"): def background_gradient( self, cmap="PuBu", - low=0, - high=0, - axis=0, + low: float = 0, + high: float = 0, + axis: Optional[Axis] = 0, subset=None, - text_color_threshold=0.408, + text_color_threshold: float = 0.408, vmin: Optional[float] = None, vmax: Optional[float] = None, - ): + ) -> "Styler": """ Color the background in a gradient style. @@ -1068,9 +1106,9 @@ def background_gradient( def _background_gradient( s, cmap="PuBu", - low=0, - high=0, - text_color_threshold=0.408, + low: float = 0, + high: float = 0, + text_color_threshold: float = 0.408, vmin: Optional[float] = None, vmax: Optional[float] = None, ): @@ -1094,7 +1132,7 @@ def _background_gradient( # https://github.com/matplotlib/matplotlib/issues/5427 rgbas = plt.cm.get_cmap(cmap)(norm(s.to_numpy(dtype=float))) - def relative_luminance(rgba): + def relative_luminance(rgba) -> float: """ Calculate relative luminance of a color. @@ -1116,7 +1154,7 @@ def relative_luminance(rgba): ) return 0.2126 * r + 0.7152 * g + 0.0722 * b - def css(rgba): + def css(rgba) -> str: dark = relative_luminance(rgba) < text_color_threshold text_color = "#f1f1f1" if dark else "#000000" return f"background-color: {colors.rgb2hex(rgba)};color: {text_color};" @@ -1130,7 +1168,7 @@ def css(rgba): columns=s.columns, ) - def set_properties(self, subset=None, **kwargs): + def set_properties(self, subset=None, **kwargs) -> "Styler": """ Method to set one or more non-data dependent properties or each cell. @@ -1156,7 +1194,14 @@ def set_properties(self, subset=None, **kwargs): return self.applymap(f, subset=subset) @staticmethod - def _bar(s, align, colors, width=100, vmin=None, vmax=None): + def _bar( + s, + align: str, + colors: List[str], + width: float = 100, + vmin: Optional[float] = None, + vmax: Optional[float] = None, + ): """ Draw bar chart in dataframe cells. 
""" @@ -1174,7 +1219,7 @@ def _bar(s, align, colors, width=100, vmin=None, vmax=None): normed = width * (s.to_numpy(dtype=float) - smin) / (smax - smin + 1e-12) zero = -width * smin / (smax - smin + 1e-12) - def css_bar(start, end, color): + def css_bar(start: float, end: float, color: str) -> str: """ Generate CSS code to draw a bar from start to end. """ @@ -1211,13 +1256,13 @@ def css(x): def bar( self, subset=None, - axis=0, + axis: Optional[Axis] = 0, color="#d65f5f", - width=100, - align="left", - vmin=None, - vmax=None, - ): + width: float = 100, + align: str = "left", + vmin: Optional[float] = None, + vmax: Optional[float] = None, + ) -> "Styler": """ Draw bar chart in the cell backgrounds. @@ -1272,9 +1317,9 @@ def bar( color = [color[0], color[0]] elif len(color) > 2: raise ValueError( - "`color` must be string or a list-like" - " of length 2: [`color_neg`, `color_pos`]" - " (eg: color=['#d65f5f', '#5fba7d'])" + "`color` must be string or a list-like " + "of length 2: [`color_neg`, `color_pos`] " + "(eg: color=['#d65f5f', '#5fba7d'])" ) subset = _maybe_numeric_slice(self.data, subset) @@ -1292,7 +1337,9 @@ def bar( return self - def highlight_max(self, subset=None, color="yellow", axis=0): + def highlight_max( + self, subset=None, color: str = "yellow", axis: Optional[Axis] = 0 + ) -> "Styler": """ Highlight the maximum by shading the background. @@ -1312,7 +1359,9 @@ def highlight_max(self, subset=None, color="yellow", axis=0): """ return self._highlight_handler(subset=subset, color=color, axis=axis, max_=True) - def highlight_min(self, subset=None, color="yellow", axis=0): + def highlight_min( + self, subset=None, color: str = "yellow", axis: Optional[Axis] = 0 + ) -> "Styler": """ Highlight the minimum by shading the background. @@ -1334,7 +1383,13 @@ def highlight_min(self, subset=None, color="yellow", axis=0): subset=subset, color=color, axis=axis, max_=False ) - def _highlight_handler(self, subset=None, color="yellow", axis=None, max_=True): + def _highlight_handler( + self, + subset=None, + color: str = "yellow", + axis: Optional[Axis] = None, + max_: bool = True, + ) -> "Styler": subset = _non_reducing_slice(_maybe_numeric_slice(self.data, subset)) self.apply( self._highlight_extrema, color=color, axis=axis, subset=subset, max_=max_ @@ -1342,7 +1397,9 @@ def _highlight_handler(self, subset=None, color="yellow", axis=None, max_=True): return self @staticmethod - def _highlight_extrema(data, color="yellow", max_=True): + def _highlight_extrema( + data: FrameOrSeries, color: str = "yellow", max_: bool = True + ): """ Highlight the min or max in a Series or DataFrame. """ @@ -1387,7 +1444,7 @@ class MyStyler(cls): return MyStyler - def pipe(self, func, *args, **kwargs): + def pipe(self, func: Callable, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``, and return the result. @@ -1459,7 +1516,7 @@ def pipe(self, func, *args, **kwargs): return com.pipe(self, func, *args, **kwargs) -def _is_visible(idx_row, idx_col, lengths): +def _is_visible(idx_row, idx_col, lengths) -> bool: """ Index -> {(idx_row, idx_col): bool}). 
""" @@ -1475,8 +1532,7 @@ def _get_level_lengths(index, hidden_elements=None): Result is a dictionary of (level, initial_position): span """ - sentinel = object() - levels = index.format(sparsify=sentinel, adjoin=False, names=False) + levels = index.format(sparsify=lib.no_default, adjoin=False, names=False) if hidden_elements is None: hidden_elements = [] @@ -1492,10 +1548,10 @@ def _get_level_lengths(index, hidden_elements=None): for j, row in enumerate(lvl): if not get_option("display.multi_sparse"): lengths[(i, j)] = 1 - elif (row != sentinel) and (j not in hidden_elements): + elif (row is not lib.no_default) and (j not in hidden_elements): last_label = j lengths[(i, last_label)] = 1 - elif row != sentinel: + elif row is not lib.no_default: # even if its hidden, keep track of it in case # length >1 and later elements are visible last_label = j @@ -1510,7 +1566,9 @@ def _get_level_lengths(index, hidden_elements=None): return non_zero_lengths -def _maybe_wrap_formatter(formatter, na_rep: Optional[str]): +def _maybe_wrap_formatter( + formatter: Union[Callable, str], na_rep: Optional[str] +) -> Callable: if isinstance(formatter, str): formatter_func = lambda x: formatter.format(x) elif callable(formatter): diff --git a/pandas/io/formats/templates/html.tpl b/pandas/io/formats/templates/html.tpl index 15feafcea6864..97bfda9af089d 100644 --- a/pandas/io/formats/templates/html.tpl +++ b/pandas/io/formats/templates/html.tpl @@ -14,7 +14,7 @@ {% block before_cellstyle %}{% endblock before_cellstyle %} {% block cellstyle %} {%- for s in cellstyle %} - #T_{{uuid}}{{s.selector}} { + {%- for selector in s.selectors -%}{%- if not loop.first -%},{%- endif -%}#T_{{uuid}}{{selector}}{%- endfor -%} { {% for p,val in s.props %} {{p}}: {{val}}; {% endfor %} diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index d9711f4f4626a..69ebc470fba6f 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -1,6 +1,11 @@ """ Google BigQuery support """ +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union + from pandas.compat._optional import import_optional_dependency +if TYPE_CHECKING: + from pandas import DataFrame + def _try_import(): # since pandas is a dependency of pandas-gbq @@ -14,21 +19,21 @@ def _try_import(): def read_gbq( - query, - project_id=None, - index_col=None, - col_order=None, - reauth=False, - auth_local_webserver=False, - dialect=None, - location=None, - configuration=None, + query: str, + project_id: Optional[str] = None, + index_col: Optional[str] = None, + col_order: Optional[List[str]] = None, + reauth: bool = False, + auth_local_webserver: bool = False, + dialect: Optional[str] = None, + location: Optional[str] = None, + configuration: Optional[Dict[str, Any]] = None, credentials=None, - use_bqstorage_api=None, + use_bqstorage_api: Optional[bool] = None, private_key=None, verbose=None, - progress_bar_type=None, -): + progress_bar_type: Optional[str] = None, +) -> "DataFrame": """ Load data from Google BigQuery. @@ -157,7 +162,7 @@ def read_gbq( """ pandas_gbq = _try_import() - kwargs = {} + kwargs: Dict[str, Union[str, bool]] = {} # START: new kwargs. Don't populate unless explicitly set. 
if use_bqstorage_api is not None: @@ -183,20 +188,20 @@ def read_gbq( def to_gbq( - dataframe, - destination_table, - project_id=None, - chunksize=None, - reauth=False, - if_exists="fail", - auth_local_webserver=False, - table_schema=None, - location=None, - progress_bar=True, + dataframe: "DataFrame", + destination_table: str, + project_id: Optional[str] = None, + chunksize: Optional[int] = None, + reauth: bool = False, + if_exists: str = "fail", + auth_local_webserver: bool = False, + table_schema: Optional[List[Dict[str, str]]] = None, + location: Optional[str] = None, + progress_bar: bool = True, credentials=None, verbose=None, private_key=None, -): +) -> None: pandas_gbq = _try_import() pandas_gbq.to_gbq( dataframe, diff --git a/pandas/io/html.py b/pandas/io/html.py index eafcca0e85bb3..75cb0fafaa6b3 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -591,9 +591,14 @@ def _setup_build_doc(self): def _build_doc(self): from bs4 import BeautifulSoup - return BeautifulSoup( - self._setup_build_doc(), features="html5lib", from_encoding=self.encoding - ) + bdoc = self._setup_build_doc() + if isinstance(bdoc, bytes) and self.encoding is not None: + udoc = bdoc.decode(self.encoding) + from_encoding = None + else: + udoc = bdoc + from_encoding = self.encoding + return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding) def _build_xpath_expr(attrs) -> str: @@ -899,8 +904,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): f"The flavor {flav} failed to parse your input. " "Since you passed a non-rewindable file " "object, we can't rewind it to try " - "another parser. Try read_html() with a " - "different flavor." + "another parser. Try read_html() with a different flavor." ) retained = caught diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 086a486a2ec9a..ae6ae70cbac72 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1,5 +1,4 @@ -from collections import OrderedDict -from collections.abc import Iterator +from collections import abc import functools from io import StringIO from itertools import islice @@ -12,6 +11,7 @@ from pandas._libs.tslibs import iNaT from pandas._typing import JSONSerializable from pandas.errors import AbstractMethodError +from pandas.util._decorators import deprecate_kwarg from pandas.core.dtypes.common import ensure_str, is_period_dtype @@ -25,11 +25,10 @@ infer_compression, stringify_path, ) +from pandas.io.json._normalize import convert_to_line_delimits +from pandas.io.json._table_schema import build_table_schema, parse_table_schema from pandas.io.parsers import _validate_integer -from ._normalize import convert_to_line_delimits -from ._table_schema import build_table_schema, parse_table_schema - loads = json.loads dumps = json.dumps @@ -54,7 +53,7 @@ def to_json( if not index and orient not in ["split", "table"]: raise ValueError( - "'index=False' is only valid when 'orient' is " "'split' or 'table'" + "'index=False' is only valid when 'orient' is 'split' or 'table'" ) path_or_buf = stringify_path(path_or_buf) @@ -332,7 +331,7 @@ def _write( default_handler, indent, ): - table_obj = OrderedDict((("schema", self.schema), ("data", obj))) + table_obj = {"schema": self.schema, "data": obj} serialized = super()._write( table_obj, orient, @@ -347,6 +346,7 @@ def _write( return serialized +@deprecate_kwarg(old_arg_name="numpy", new_arg_name=None) def read_json( path_or_buf=None, orient=None, @@ -439,8 +439,17 @@ def read_json( Not applicable for ``orient='table'``. 
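# Sketch of the code path touched by the _build_doc change above: read_html with
# an explicit encoding and the BeautifulSoup flavor decodes byte input before
# handing it to html5lib. The file name is hypothetical, and bs4/html5lib must
# be installed.
import pandas as pd

tables = pd.read_html("latin1_table.html", flavor="bs4", encoding="latin-1")
df = tables[0]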
convert_dates : bool or list of str, default True - List of columns to parse for dates. If True, then try to parse - datelike columns. A column label is datelike if + If True then default datelike columns may be converted (depending on + keep_default_dates). + If False, no dates will be converted. + If a list of column names, then those columns will be converted and + default datelike columns may also be converted (depending on + keep_default_dates). + + keep_default_dates : bool, default True + If parsing dates (convert_dates is not False), then try to parse the + default datelike columns. + A column label is datelike if * it ends with ``'_at'``, @@ -452,14 +461,13 @@ def read_json( * it is ``'date'``. - keep_default_dates : bool, default True - If parsing dates, then parse the default datelike columns. - numpy : bool, default False Direct decoding to numpy arrays. Supports numeric data only, but non-numeric column and index labels are supported. Note also that the JSON ordering MUST be the same for each term if numpy=True. + .. deprecated:: 1.0.0 + precise_float : bool, default False Set to enable usage of higher precision (strtod) function when decoding string to double values. Default (False) is to use fast but @@ -480,7 +488,7 @@ def read_json( chunksize : int, optional Return JsonReader object for iteration. See the `line-delimited json docs - `_ + `_ for more information on ``chunksize``. This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. @@ -610,7 +618,7 @@ def read_json( return result -class JsonReader(Iterator): +class JsonReader(abc.Iterator): """ JsonReader provides an interface for reading in a JSON file. diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index aa14c3f3a63f3..cf292a13fed7f 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -3,13 +3,14 @@ from collections import defaultdict import copy -from typing import DefaultDict, Dict, List, Optional, Union +from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union import numpy as np from pandas._libs.writers import convert_json_to_lines from pandas.util._decorators import deprecate +import pandas as pd from pandas import DataFrame @@ -112,13 +113,13 @@ def nested_to_record( def _json_normalize( data: Union[Dict, List[Dict]], record_path: Optional[Union[str, List]] = None, - meta: Optional[Union[str, List]] = None, + meta: Optional[Union[str, List[Union[str, List[str]]]]] = None, meta_prefix: Optional[str] = None, record_prefix: Optional[str] = None, errors: Optional[str] = "raise", sep: str = ".", max_level: Optional[int] = None, -): +) -> "DataFrame": """ Normalize semi-structured JSON data into a flat table. @@ -229,14 +230,23 @@ def _json_normalize( Returns normalized data with columns prefixed with the given string. """ - def _pull_field(js, spec): - result = js + def _pull_field(js: Dict[str, Any], spec: Union[List, str]) -> Iterable: + result = js # type: ignore if isinstance(spec, list): for field in spec: result = result[field] else: result = result[spec] + if not isinstance(result, Iterable): + if pd.isnull(result): + result = [] # type: ignore + else: + raise TypeError( + f"{js} has non iterable value {result} for path {spec}. " + "Must be iterable or null." 
+ ) + return result if isinstance(data, list) and not data: @@ -265,21 +275,21 @@ def _pull_field(js, spec): elif not isinstance(meta, list): meta = [meta] - meta = [m if isinstance(m, list) else [m] for m in meta] + _meta = [m if isinstance(m, list) else [m] for m in meta] # Disastrously inefficient for now records: List = [] lengths = [] meta_vals: DefaultDict = defaultdict(list) - meta_keys = [sep.join(val) for val in meta] + meta_keys = [sep.join(val) for val in _meta] def _recursive_extract(data, path, seen_meta, level=0): if isinstance(data, dict): data = [data] if len(path) > 1: for obj in data: - for val, key in zip(meta, meta_keys): + for val, key in zip(_meta, meta_keys): if level + 1 == len(val): seen_meta[key] = _pull_field(obj, val[-1]) @@ -296,7 +306,7 @@ def _recursive_extract(data, path, seen_meta, level=0): # For repeating the metadata later lengths.append(len(recs)) - for val, key in zip(meta, meta_keys): + for val, key in zip(_meta, meta_keys): if level + 1 > len(val): meta_val = seen_meta[key] else: @@ -307,8 +317,7 @@ def _recursive_extract(data, path, seen_meta, level=0): meta_val = np.nan else: raise KeyError( - "Try running with " - "errors='ignore' as key " + "Try running with errors='ignore' as key " f"{e} is not always present" ) meta_vals[key].append(meta_val) diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index bc5a9783391a4..5f23b95c10f8e 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -18,9 +18,9 @@ is_string_dtype, is_timedelta64_dtype, ) +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas import DataFrame -from pandas.api.types import CategoricalDtype import pandas.core.common as com loads = json.loads @@ -81,9 +81,7 @@ def set_default_names(data): if len(nms) == 1 and data.index.name == "index": warnings.warn("Index name of 'index' is not round-trippable") elif len(nms) > 1 and any(x.startswith("level_") for x in nms): - warnings.warn( - "Index names beginning with 'level_' are not " "round-trippable" - ) + warnings.warn("Index names beginning with 'level_' are not round-trippable") return data data = data.copy() @@ -317,12 +315,12 @@ def parse_table_schema(json, precise_float): # Cannot directly use as_type with timezone data on object; raise for now if any(str(x).startswith("datetime64[ns, ") for x in dtypes.values()): - raise NotImplementedError('table="orient" can not yet read timezone ' "data") + raise NotImplementedError('table="orient" can not yet read timezone data') # No ISO constructor for Timedelta as of yet, so need to raise if "timedelta64" in dtypes.values(): raise NotImplementedError( - 'table="orient" can not yet read ' "ISO-formatted Timedelta data" + 'table="orient" can not yet read ISO-formatted Timedelta data' ) df = df.astype(dtypes) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index f68347f042086..98f2eb3929b59 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -32,8 +32,7 @@ def get_engine(engine: str) -> "BaseImpl": raise ImportError( "Unable to find a usable engine; " "tried using: 'pyarrow', 'fastparquet'.\n" - "pyarrow or fastparquet is required for parquet " - "support" + "pyarrow or fastparquet is required for parquet support" ) if engine == "pyarrow": @@ -52,7 +51,7 @@ def validate_dataframe(df: DataFrame): raise ValueError("to_parquet only supports IO with DataFrames") # must have value column names (strings only) - if df.columns.inferred_type not in {"string", "unicode", "empty"}: + if df.columns.inferred_type not 
in {"string", "empty"}: raise ValueError("parquet must have string column names") # index level names must be strings @@ -76,6 +75,9 @@ def __init__(self): ) import pyarrow.parquet + # import utils to register the pyarrow extension types + import pandas.core.arrays._arrow_utils # noqa + self.api = pyarrow def write( @@ -153,8 +155,7 @@ def write( if "partition_on" in kwargs and partition_cols is not None: raise ValueError( "Cannot use both partition_on and " - "partition_cols. Use partition_cols for " - "partitioning data" + "partition_cols. Use partition_cols for partitioning data" ) elif "partition_on" in kwargs: partition_cols = kwargs.pop("partition_on") diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 17e275b84f451..84a8b5b2a94fe 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2,11 +2,10 @@ Module contains tools for processing files into DataFrames or other objects """ -from collections import defaultdict -from collections.abc import Iterator +from collections import abc, defaultdict import csv import datetime -from io import StringIO +from io import BufferedIOBase, StringIO, TextIOWrapper import re import sys from textwrap import fill @@ -63,7 +62,6 @@ from pandas.core.tools import datetimes as tools from pandas.io.common import ( - UTF8Recoder, get_filepath_or_buffer, get_handle, infer_compression, @@ -85,7 +83,7 @@ into chunks. Additional help can be found in the online docs for -`IO Tools `_. +`IO Tools `_. Parameters ---------- @@ -273,7 +271,7 @@ chunksize : int, optional Return TextFileReader object for iteration. See the `IO Tools docs - `_ + `_ for more information on ``iterator`` and ``chunksize``. compression : {{'infer', 'gzip', 'bz2', 'zip', 'xz', None}}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer' and @@ -382,9 +380,7 @@ def _validate_integer(name, val, min_val=0): min_val : int Minimum allowed value (val < min_val will result in a ValueError) """ - msg = "'{name:s}' must be an integer >={min_val:d}".format( - name=name, min_val=min_val - ) + msg = f"'{name:s}' must be an integer >={min_val:d}" if val is not None: if is_float(val): @@ -615,9 +611,8 @@ def parser_f( if delim_whitespace and delimiter != default_sep: raise ValueError( - "Specified a delimiter with both sep and" - " delim_whitespace=True; you can only" - " specify one." + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." ) if engine is not None: @@ -688,7 +683,7 @@ def parser_f( read_csv = Appender( _doc_read_csv_and_table.format( func_name="read_csv", - summary=("Read a comma-separated values (csv) file into DataFrame."), + summary="Read a comma-separated values (csv) file into DataFrame.", _default_sep="','", ) )(read_csv) @@ -718,7 +713,7 @@ def read_fwf( into chunks. Additional help can be found in the `online docs for IO Tools - `_. + `_. 
Parameters ---------- @@ -786,7 +781,7 @@ def read_fwf( return _read(filepath_or_buffer, kwds) -class TextFileReader(Iterator): +class TextFileReader(abc.Iterator): """ Passed dialect overrides any of the related parser options @@ -823,11 +818,7 @@ def __init__(self, f, engine=None, **kwds): try: dialect_val = getattr(dialect, param) except AttributeError: - raise ValueError( - "Invalid dialect '{dialect}' provided".format( - dialect=kwds["dialect"] - ) - ) + raise ValueError(f"Invalid dialect {kwds['dialect']} provided") parser_default = _parser_defaults[param] provided = kwds.get(param, parser_default) @@ -839,11 +830,9 @@ def __init__(self, f, engine=None, **kwds): # even if it conflicts with the dialect (gh-23761). if provided != parser_default and provided != dialect_val: msg = ( - "Conflicting values for '{param}': '{val}' was " - "provided, but the dialect specifies '{diaval}'. " - "Using the dialect-specified value.".format( - param=param, val=provided, diaval=dialect_val - ) + f"Conflicting values for '{param}': '{provided}' was " + f"provided, but the dialect specifies '{dialect_val}'. " + "Using the dialect-specified value." ) # Annoying corner case for not warning about @@ -917,8 +906,8 @@ def _get_options_with_defaults(self, engine): pass else: raise ValueError( - f"The {repr(argname)} option is not supported with the" - f" {repr(engine)} engine" + f"The {repr(argname)} option is not supported with the " + f"{repr(engine)} engine" ) else: value = _deprecated_defaults.get(argname, default) @@ -965,8 +954,8 @@ def _clean_options(self, options, engine): if sep is None and not delim_whitespace: if engine == "c": fallback_reason = ( - "the 'c' engine does not support" - " sep=None with delim_whitespace=False" + "the 'c' engine does not support " + "sep=None with delim_whitespace=False" ) engine = "python" elif sep is not None and len(sep) > 1: @@ -978,8 +967,7 @@ def _clean_options(self, options, engine): fallback_reason = ( "the 'c' engine does not support " "regex separators (separators > 1 char and " - r"different from '\s+' are " - "interpreted as regex)" + r"different from '\s+' are interpreted as regex)" ) engine = "python" elif delim_whitespace: @@ -994,9 +982,9 @@ def _clean_options(self, options, engine): encodeable = False if not encodeable and engine not in ("python", "python-fwf"): fallback_reason = ( - "the separator encoded in {encoding} " + f"the separator encoded in {encoding} " "is > 1 char long, and the 'c' engine " - "does not support such separators".format(encoding=encoding) + "does not support such separators" ) engine = "python" @@ -1010,8 +998,7 @@ def _clean_options(self, options, engine): fallback_reason = ( "ord(quotechar) > 127, meaning the " "quotechar is larger than one byte, " - "and the 'c' engine does not support " - "such quotechars" + "and the 'c' engine does not support such quotechars" ) engine = "python" @@ -1026,9 +1013,9 @@ def _clean_options(self, options, engine): for arg in _python_unsupported: if fallback_reason and result[arg] != _c_parser_defaults[arg]: raise ValueError( - f"Falling back to the 'python' engine because " + "Falling back to the 'python' engine because " f"{fallback_reason}, but this causes {repr(arg)} to be " - f"ignored as it is not supported by the 'python' engine." + "ignored as it is not supported by the 'python' engine." 
) del result[arg] @@ -1036,9 +1023,9 @@ def _clean_options(self, options, engine): warnings.warn( ( "Falling back to the 'python' engine because " - "{0}; you can avoid this warning by specifying " + f"{fallback_reason}; you can avoid this warning by specifying " "engine='python'." - ).format(fallback_reason), + ), ParserWarning, stacklevel=5, ) @@ -1059,7 +1046,7 @@ def _clean_options(self, options, engine): msg = ( f"The {repr(arg)} argument has been deprecated and will be " - f"removed in a future version." + "removed in a future version." ) if result.get(arg, depr_default) != depr_default: @@ -1129,9 +1116,8 @@ def _make_engine(self, engine="c"): klass = FixedWidthFieldParser else: raise ValueError( - "Unknown engine: {engine} (valid options are" - ' "c", "python", or' - ' "python-fwf")'.format(engine=engine) + f"Unknown engine: {engine} (valid options " + 'are "c", "python", or "python-fwf")' ) self._engine = klass(self.f, **self.options) @@ -1240,8 +1226,7 @@ def _validate_usecols_names(usecols, names): missing = [c for c in usecols if c not in names] if len(missing) > 0: raise ValueError( - "Usecols do not match columns, " - "columns expected but not found: {missing}".format(missing=missing) + f"Usecols do not match columns, columns expected but not found: {missing}" ) return usecols @@ -1319,7 +1304,7 @@ def _validate_usecols_arg(usecols): usecols_dtype = lib.infer_dtype(usecols, skipna=False) - if usecols_dtype not in ("empty", "integer", "string", "unicode"): + if usecols_dtype not in ("empty", "integer", "string"): raise ValueError(msg) usecols = set(usecols) @@ -1335,8 +1320,7 @@ def _validate_parse_dates_arg(parse_dates): that is the case. """ msg = ( - "Only booleans, lists, and " - "dictionaries are accepted " + "Only booleans, lists, and dictionaries are accepted " "for the 'parse_dates' parameter" ) @@ -1542,11 +1526,9 @@ def _maybe_dedup_names(self, names): counts[col] = cur_count + 1 if is_potential_mi: - col = col[:-1] + ( - "{column}.{count}".format(column=col[-1], count=cur_count), - ) + col = col[:-1] + (f"{col[-1]}.{cur_count}",) else: - col = "{column}.{count}".format(column=col, count=cur_count) + col = f"{col}.{cur_count}" cur_count = counts[col] names[i] = col @@ -1592,7 +1574,7 @@ def _get_simple_index(self, data, columns): def ix(col): if not isinstance(col, str): return col - raise ValueError("Index {col} invalid".format(col=col)) + raise ValueError(f"Index {col} invalid") to_remove = [] index = [] @@ -1616,11 +1598,7 @@ def _get_name(icol): return icol if col_names is None: - raise ValueError( - ("Must supply column order to use {icol!s} as index").format( - icol=icol - ) - ) + raise ValueError(f"Must supply column order to use {icol!s} as index") for i, c in enumerate(col_names): if i == icol: @@ -1696,9 +1674,8 @@ def _convert_to_ndarrays( warnings.warn( ( "Both a converter and dtype were specified " - "for column {0} - only the converter will " - "be used" - ).format(c), + f"for column {c} - only the converter will be used" + ), ParserWarning, stacklevel=7, ) @@ -1736,10 +1713,7 @@ def _convert_to_ndarrays( and not is_categorical_dtype(cast_type) and na_count > 0 ): - raise ValueError( - "Bool column has NA values in " - "column {column}".format(column=c) - ) + raise ValueError(f"Bool column has NA values in column {c}") except (AttributeError, TypeError): # invalid input to is_bool_dtype pass @@ -1747,11 +1721,7 @@ def _convert_to_ndarrays( result[c] = cvals if verbose and na_count: - print( - "Filled {count} NA values in column {c!s}".format( - 
count=na_count, c=c - ) - ) + print(f"Filled {na_count} NA values in column {c!s}") return result def _infer_types(self, values, na_values, try_num_bool=True): @@ -1848,9 +1818,8 @@ def _cast_types(self, values, cast_type, column): return array_type._from_sequence_of_strings(values, dtype=cast_type) except NotImplementedError: raise NotImplementedError( - "Extension Array: {ea} must implement " - "_from_sequence_of_strings in order " - "to be used in parser methods".format(ea=array_type) + f"Extension Array: {array_type} must implement " + "_from_sequence_of_strings in order to be used in parser methods" ) else: @@ -1858,8 +1827,7 @@ def _cast_types(self, values, cast_type, column): values = astype_nansafe(values, cast_type, copy=True, skipna=True) except ValueError: raise ValueError( - "Unable to convert column {column} to type " - "{cast_type}".format(column=column, cast_type=cast_type) + f"Unable to convert column {column} to type {cast_type}" ) return values @@ -1891,12 +1859,18 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) - if kwds.get("compression") is None and "utf-16" in (kwds.get("encoding") or ""): - # if source is utf-16 plain text, convert source to utf-8 + encoding = kwds.get("encoding") + + if kwds.get("compression") is None and encoding: if isinstance(src, str): src = open(src, "rb") self.handles.append(src) - src = UTF8Recoder(src, kwds["encoding"]) + + # Handle the file object with universal line mode enabled. + # We will handle the newline character ourselves later on. + if isinstance(src, BufferedIOBase): + src = TextIOWrapper(src, encoding=encoding, newline="") + kwds["encoding"] = "utf-8" # #2442 @@ -1930,8 +1904,7 @@ def __init__(self, src, **kwds): if self.names is None: if self.prefix: self.names = [ - "{prefix}{i}".format(prefix=self.prefix, i=i) - for i in range(self._reader.table_width) + f"{self.prefix}{i}" for i in range(self._reader.table_width) ] else: self.names = list(range(self._reader.table_width)) @@ -2228,8 +2201,6 @@ class PythonParser(ParserBase): def __init__(self, f, **kwds): """ Workhorse function for processing nested list into DataFrame - - Should be replaced by np.genfromtxt eventually? 
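# Sketch of the CParserWrapper change above: an open binary handle passed to
# read_csv together with an encoding is wrapped in a TextIOWrapper rather than
# the removed UTF8Recoder. The file name is hypothetical.
import pandas as pd

with open("data_utf16.csv", "rb") as handle:
    df = pd.read_csv(handle, encoding="utf-16")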
""" ParserBase.__init__(self, kwds) @@ -2346,15 +2317,9 @@ def __init__(self, f, **kwds): raise ValueError("Only length-1 decimal markers supported") if self.thousands is None: - self.nonnum = re.compile( - r"[^-^0-9^{decimal}]+".format(decimal=self.decimal) - ) + self.nonnum = re.compile(fr"[^-^0-9^{self.decimal}]+") else: - self.nonnum = re.compile( - r"[^-^0-9^{thousands}^{decimal}]+".format( - thousands=self.thousands, decimal=self.decimal - ) - ) + self.nonnum = re.compile(fr"[^-^0-9^{self.thousands}^{self.decimal}]+") def _set_no_thousands_columns(self): # Create a set of column ids that are not to be stripped of thousands @@ -2590,8 +2555,8 @@ def _infer_columns(self): except StopIteration: if self.line_pos < hr: raise ValueError( - "Passed header={hr} but only {pos} lines in " - "file".format(hr=hr, pos=(self.line_pos + 1)) + f"Passed header={hr} but only {self.line_pos + 1} lines in " + "file" ) # We have an empty file, so check @@ -2614,11 +2579,9 @@ def _infer_columns(self): for i, c in enumerate(line): if c == "": if have_mi_columns: - col_name = "Unnamed: {i}_level_{level}".format( - i=i, level=level - ) + col_name = f"Unnamed: {i}_level_{level}" else: - col_name = "Unnamed: {i}".format(i=i) + col_name = f"Unnamed: {i}" this_unnamed_cols.append(i) this_columns.append(col_name) @@ -2633,7 +2596,7 @@ def _infer_columns(self): while cur_count > 0: counts[col] = cur_count + 1 - col = "{column}.{count}".format(column=col, count=cur_count) + col = f"{col}.{cur_count}" cur_count = counts[col] this_columns[i] = col @@ -2698,12 +2661,7 @@ def _infer_columns(self): if not names: if self.prefix: - columns = [ - [ - "{prefix}{idx}".format(prefix=self.prefix, idx=i) - for i in range(ncols) - ] - ] + columns = [[f"{self.prefix}{i}" for i in range(ncols)]] else: columns = [list(range(ncols))] columns = self._handle_usecols(columns, columns[0]) @@ -2905,7 +2863,7 @@ def _alert_malformed(self, msg, row_num): if self.error_bad_lines: raise ParserError(msg) elif self.warn_bad_lines: - base = "Skipping line {row_num}: ".format(row_num=row_num) + base = f"Skipping line {row_num}: " sys.stderr.write(base + msg + "\n") def _next_iter_line(self, row_num): @@ -3129,10 +3087,8 @@ def _rows_to_cols(self, content): for row_num, actual_len in bad_lines: msg = ( - "Expected {col_len} fields in line {line}, saw " - "{length}".format( - col_len=col_len, line=(row_num + 1), length=actual_len - ) + f"Expected {col_len} fields in line {row_num + 1}, saw " + f"{actual_len}" ) if ( self.delimiter @@ -3330,9 +3286,7 @@ def _isindex(colspec): converter, colspec, data_dict, orig_names ) if new_name in data_dict: - raise ValueError( - "New date column already in dict {name}".format(name=new_name) - ) + raise ValueError(f"New date column already in dict {new_name}") new_data[new_name] = col new_cols.append(new_name) date_cols.update(old_names) @@ -3341,9 +3295,7 @@ def _isindex(colspec): # dict of new name to column list for new_name, colspec in parse_spec.items(): if new_name in data_dict: - raise ValueError( - "Date column {name} already in dict".format(name=new_name) - ) + raise ValueError(f"Date column {new_name} already in dict") _, col, old_names = _try_convert_dates( converter, colspec, data_dict, orig_names @@ -3522,7 +3474,7 @@ def _stringify_na_values(na_values): # we are like 999 here if v == int(v): v = int(v) - result.append("{value}.0".format(value=v)) + result.append(f"{v}.0") result.append(str(v)) result.append(v) @@ -3582,7 +3534,7 @@ def _get_col_names(colspec, columns): return colnames -class 
FixedWidthReader(Iterator): +class FixedWidthReader(abc.Iterator): """ A reader of fixed-width lines. """ diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 6ce52da21b4e8..e51f24b551f31 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,13 +1,20 @@ """ pickle compat """ import pickle +from typing import Any, Optional import warnings +from pandas._typing import FilePathOrBuffer from pandas.compat import pickle_compat as pc -from pandas.io.common import get_handle, stringify_path +from pandas.io.common import get_filepath_or_buffer, get_handle -def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): +def to_pickle( + obj: Any, + filepath_or_buffer: FilePathOrBuffer, + compression: Optional[str] = "infer", + protocol: int = pickle.HIGHEST_PROTOCOL, +): """ Pickle (serialize) object to file. @@ -15,11 +22,17 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): ---------- obj : any object Any python object. - path : str - File path where the pickled object will be stored. + filepath_or_buffer : str, path object or file-like object + File path, URL, or buffer where the pickled object will be stored. + + .. versionchanged:: 1.0.0 + Accept URL. URL has to be of S3 or GCS. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' - A string representing the compression to use in the output file. By - default, infers from the file extension in specified path. + If 'infer' and 'path_or_url' is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + compression) If 'infer' and 'path_or_url' is not path-like, then use + None (= no decompression). protocol : int Int which indicates which protocol should be used by the pickler, default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible @@ -63,8 +76,12 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): >>> import os >>> os.remove("./dummy.pkl") """ - path = stringify_path(path) - f, fh = get_handle(path, "wb", compression=compression, is_text=False) + fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + filepath_or_buffer, compression=compression, mode="wb" + ) + if not isinstance(fp_or_buf, str) and compression == "infer": + compression = None + f, fh = get_handle(fp_or_buf, "wb", compression=compression, is_text=False) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: @@ -73,9 +90,16 @@ def to_pickle(obj, path, compression="infer", protocol=pickle.HIGHEST_PROTOCOL): f.close() for _f in fh: _f.close() + if should_close: + try: + fp_or_buf.close() + except ValueError: + pass -def read_pickle(path, compression="infer"): +def read_pickle( + filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer" +): """ Load pickled pandas object (or any object) from file. @@ -86,13 +110,17 @@ def read_pickle(path, compression="infer"): Parameters ---------- - path : str - File path where the pickled object will be loaded. + filepath_or_buffer : str, path object or file-like object + File path, URL, or buffer where the pickled object will be loaded from. + + .. versionchanged:: 1.0.0 + Accept URL. URL is not limited to S3 and GCS. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer', then use - gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz', - or '.zip' respectively, and no decompression otherwise. - Set to None for no decompression. 
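# Sketch of the URL support added to to_pickle/read_pickle above; the S3 bucket
# is hypothetical, and reading/writing S3 paths assumes s3fs is installed.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
df.to_pickle("s3://my-bucket/dummy.pkl")
same = pd.read_pickle("s3://my-bucket/dummy.pkl")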
+ If 'infer' and 'path_or_url' is path-like, then detect compression from + the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no + compression) If 'infer' and 'path_or_url' is not path-like, then use + None (= no decompression). Returns ------- @@ -134,8 +162,12 @@ def read_pickle(path, compression="infer"): >>> import os >>> os.remove("./dummy.pkl") """ - path = stringify_path(path) - f, fh = get_handle(path, "rb", compression=compression, is_text=False) + fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + filepath_or_buffer, compression=compression + ) + if not isinstance(fp_or_buf, str) and compression == "infer": + compression = None + f, fh = get_handle(fp_or_buf, "rb", compression=compression, is_text=False) # 1) try standard library Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes @@ -159,3 +191,8 @@ def read_pickle(path, compression="infer"): f.close() for _f in fh: _f.close() + if should_close: + try: + fp_or_buf.close() + except ValueError: + pass diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5f4636ac070bb..3e4673c890bef 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -58,7 +58,7 @@ concat, isna, ) -from pandas.core.arrays.categorical import Categorical +from pandas.core.arrays import Categorical, DatetimeArray, PeriodArray import pandas.core.common as com from pandas.core.computation.pytables import PyTablesExpr, maybe_expression from pandas.core.indexes.api import ensure_index @@ -413,8 +413,8 @@ def read_hdf( for group_to_check in groups[1:]: if not _is_metadata_of(group_to_check, candidate_only_group): raise ValueError( - "key must be provided when HDF5 file " - "contains multiple datasets." + "key must be provided when HDF5 " + "file contains multiple datasets." ) key = candidate_only_group._v_pathname return store.select( @@ -1018,7 +1018,7 @@ def put( data_columns : list, default None List of columns to create as data columns, or True to use all columns. See `here - `__. + `__. encoding : str, default None Provide an encoding for strings. dropna : bool, default False, do not write an ALL nan row to @@ -1138,7 +1138,7 @@ def append( List of columns to create as indexed data columns for on-disk queries, or True to use all columns. By default only the axes of the object are indexed. See `here - `__. + `__. 
min_itemsize : dict of columns that specify minimum string sizes nan_rep : string to use as string nan representation chunksize : size to chunk the writing @@ -1215,9 +1215,8 @@ def append_to_multiple( """ if axes is not None: raise TypeError( - "axes is currently not accepted as a parameter to" - " append_to_multiple; you can create the " - "tables independently instead" + "axes is currently not accepted as a parameter to append_to_multiple; " + "you can create the tables independently instead" ) if not isinstance(d, dict): @@ -1241,8 +1240,7 @@ def append_to_multiple( if v is None: if remain_key is not None: raise ValueError( - "append_to_multiple can only have one value in d that " - "is None" + "append_to_multiple can only have one value in d that is None" ) remain_key = k else: @@ -1459,7 +1457,7 @@ def copy( data = self.select(k) if isinstance(s, Table): - index: Union[bool, list] = False + index: Union[bool, List[str]] = False if propindexes: index = [a.name for a in s.axes if a.is_indexed] new_store.append( @@ -2314,8 +2312,7 @@ def validate_attr(self, append): existing_dtype = getattr(self.attrs, self.dtype_attr, None) if existing_dtype is not None and existing_dtype != self.dtype: raise ValueError( - "appended items dtype do not match existing " - "items dtype in table!" + "appended items dtype do not match existing items dtype in table!" ) def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str): @@ -2659,7 +2656,8 @@ def _get_index_factory(self, klass): def f(values, freq=None, tz=None): # data are already in UTC, localize and convert if tz present - result = DatetimeIndex._simple_new(values.values, name=None, freq=freq) + dta = DatetimeArray._simple_new(values.values, freq=freq) + result = DatetimeIndex._simple_new(dta, name=None) if tz is not None: result = result.tz_localize("UTC").tz_convert(tz) return result @@ -2668,7 +2666,8 @@ def f(values, freq=None, tz=None): elif klass == PeriodIndex: def f(values, freq=None, tz=None): - return PeriodIndex._simple_new(values, name=None, freq=freq) + parr = PeriodArray._simple_new(values, freq=freq) + return PeriodIndex._simple_new(parr, name=None) return f @@ -2681,14 +2680,12 @@ def validate_read(self, columns, where): if columns is not None: raise TypeError( "cannot pass a column specification when reading " - "a Fixed format store. this store must be " - "selected in its entirety" + "a Fixed format store. this store must be selected in its entirety" ) if where is not None: raise TypeError( "cannot pass a where specification when reading " - "from a Fixed format store. this store must be " - "selected in its entirety" + "from a Fixed format store. this store must be selected in its entirety" ) @property @@ -2909,8 +2906,7 @@ def write_array(self, key: str, value: ArrayLike, items: Optional[Index] = None) if is_categorical_dtype(value): raise NotImplementedError( - "Cannot store a category dtype in " - "a HDF5 dataset that uses format=" + "Cannot store a category dtype in a HDF5 dataset that uses format=" '"fixed". Use format="table".' 
) if not empty_array: @@ -3145,15 +3141,25 @@ class Table(Fixed): info: Dict def __init__( - self, parent: HDFStore, group: "Node", encoding=None, errors: str = "strict" + self, + parent: HDFStore, + group: "Node", + encoding=None, + errors: str = "strict", + index_axes=None, + non_index_axes=None, + values_axes=None, + data_columns=None, + info=None, + nan_rep=None, ): super().__init__(parent, group, encoding=encoding, errors=errors) - self.index_axes = [] - self.non_index_axes = [] - self.values_axes = [] - self.data_columns = [] - self.info = dict() - self.nan_rep = None + self.index_axes = index_axes or [] + self.non_index_axes = non_index_axes or [] + self.values_axes = values_axes or [] + self.data_columns = data_columns or [] + self.info = info or dict() + self.nan_rep = nan_rep @property def table_type_short(self) -> str: @@ -3287,7 +3293,7 @@ def data_orientation(self): def queryables(self) -> Dict[str, Any]: """ return a dict of the kinds allowable columns for this object """ - # mypy doesnt recognize DataFrame._AXIS_NAMES, so we re-write it here + # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here axis_names = {0: "index", 1: "columns"} # compute the values_axes queryables @@ -3538,9 +3544,8 @@ def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None): if not v.is_indexed: if v.type.startswith("complex"): raise TypeError( - "Columns containing complex values can be stored " - "but cannot" - " be indexed when using table format. Either use " + "Columns containing complex values can be stored but " + "cannot be indexed when using table format. Either use " "fixed format, set index=False, or do not include " "the columns containing complex values to " "data_columns when initializing the table." @@ -3635,23 +3640,28 @@ def _create_axes( data_columns=None, min_itemsize=None, ): - """ create and return the axes - legacy tables create an indexable column, indexable index, - non-indexable fields - - Parameters - ---------- - axes: a list of the axes in order to create (names or numbers of - the axes) - obj : the object to create axes on - validate: validate the obj against an existing object already - written - min_itemsize: a dict of the min size for a column in bytes - nan_rep : a values to use for string column nan_rep - encoding : the encoding for string values - data_columns : a list of columns that we want to create separate to - allow indexing (or True will force all columns) + """ + Create and return the axes. + + Parameters + ---------- + axes: list or None + The names or numbers of the axes to create. + obj : DataFrame + The object to create axes on. + validate: bool, default True + Whether to validate the obj against an existing object already written. + nan_rep : + A value to use for string column nan_rep. + data_columns : List[str], True, or None, default None + Specify the columns that we want to create to allow indexing on. + * True : Use all available columns. + * None : Use no columns. + * List[str] : Use the specified columns. + + min_itemsize: Dict[str, int] or None, default None + The min itemsize for a column in bytes. 
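# Sketch of the data_columns behaviour documented above for table-format stores;
# "store.h5" is a hypothetical path and PyTables must be installed.
import pandas as pd

df = pd.DataFrame({"A": range(5), "B": list("abcde")})
with pd.HDFStore("store.h5", mode="w") as store:
    store.append("df", df, data_columns=["B"])      # column B becomes queryable
    subset = store.select("df", where='B == "c"')   # on-disk query on a data column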
""" if not isinstance(obj, DataFrame): @@ -3670,15 +3680,15 @@ def _create_axes( # do we have an existing table (if so, use its axes & data_columns) if self.infer_axes(): - existing_table = self.copy() + table_exists = True axes = [a.axis for a in self.index_axes] - data_columns = self.data_columns + data_columns = list(self.data_columns) nan_rep = self.nan_rep - new_info = self.info # TODO: do we always have validate=True here? else: - existing_table = None - new_info = self.info + table_exists = False + + new_info = self.info assert self.ndim == 2 # with next check, we must have len(axes) == 1 # currently support on ndim-1 axes @@ -3700,9 +3710,9 @@ def _create_axes( a = obj.axes[idx] # we might be able to change the axes on the appending data if necessary append_axis = list(a) - if existing_table is not None: + if table_exists: indexer = len(new_non_index_axes) # i.e. 0 - exist_axis = existing_table.non_index_axes[indexer][1] + exist_axis = self.non_index_axes[indexer][1] if not array_equivalent(np.array(append_axis), np.array(exist_axis)): # ahah! -> reindex @@ -3721,8 +3731,8 @@ def _create_axes( # Now we can construct our new index axis idx = axes[0] a = obj.axes[idx] - index_name = obj._AXIS_NAMES[idx] - new_index = _convert_index(index_name, a, self.encoding, self.errors) + axis_name = obj._AXIS_NAMES[idx] + new_index = _convert_index(axis_name, a, self.encoding, self.errors) new_index.axis = idx # Because we are always 2D, there is only one new_index, so @@ -3749,9 +3759,11 @@ def get_blk_items(mgr, blocks): data_columns = self.validate_data_columns( data_columns, min_itemsize, new_non_index_axes ) + block_obj = self.get_object(obj, transposed)._consolidate() + blocks, blk_items = self._get_blocks_and_items( - block_obj, existing_table, new_non_index_axes, data_columns + block_obj, table_exists, new_non_index_axes, self.values_axes, data_columns ) # add my values @@ -3772,13 +3784,15 @@ def get_blk_items(mgr, blocks): # make sure that we match up the existing columns # if we have an existing table - if existing_table is not None and validate: + existing_col: Optional[DataCol] + + if table_exists and validate: try: - existing_col = existing_table.values_axes[i] + existing_col = self.values_axes[i] except (IndexError, KeyError): raise ValueError( f"Incompatible appended table [{blocks}]" - f"with existing table [{existing_table.values_axes}]" + f"with existing table [{self.values_axes}]" ) else: existing_col = None @@ -3827,22 +3841,34 @@ def get_blk_items(mgr, blocks): j += 1 - self.nan_rep = nan_rep - self.data_columns = [col.name for col in vaxes if col.is_data_indexable] - self.values_axes = vaxes - self.index_axes = new_index_axes - self.non_index_axes = new_non_index_axes + dcs = [col.name for col in vaxes if col.is_data_indexable] - # validate our min_itemsize - self.validate_min_itemsize(min_itemsize) + new_table = type(self)( + parent=self.parent, + group=self.group, + encoding=self.encoding, + errors=self.errors, + index_axes=new_index_axes, + non_index_axes=new_non_index_axes, + values_axes=vaxes, + data_columns=dcs, + info=new_info, + nan_rep=nan_rep, + ) + if hasattr(self, "levels"): + # TODO: get this into constructor, only for appropriate subclass + new_table.levels = self.levels + + new_table.validate_min_itemsize(min_itemsize) + + if validate and table_exists: + new_table.validate(self) - # validate the axes if we have an existing table - if validate: - self.validate(existing_table) + return new_table @staticmethod def _get_blocks_and_items( - block_obj, 
existing_table, new_non_index_axes, data_columns + block_obj, table_exists, new_non_index_axes, values_axes, data_columns ): # Helper to clarify non-state-altering parts of _create_axes @@ -3864,15 +3890,15 @@ def get_blk_items(mgr, blocks): blocks.extend(mgr.blocks) blk_items.extend(get_blk_items(mgr, mgr.blocks)) - # reorder the blocks in the same order as the existing_table if we can - if existing_table is not None: + # reorder the blocks in the same order as the existing table if we can + if table_exists: by_items = { tuple(b_items.tolist()): (b, b_items) for b, b_items in zip(blocks, blk_items) } new_blocks = [] new_blk_items = [] - for ea in existing_table.values_axes: + for ea in values_axes: items = tuple(ea.values) try: b, b_items = by_items.pop(items) @@ -4103,7 +4129,7 @@ def write( self._handle.remove_node(self.group, "table") # create the axes - self._create_axes( + table = self._create_axes( axes=axes, obj=obj, validate=append, @@ -4112,13 +4138,13 @@ def write( data_columns=data_columns, ) - for a in self.axes: + for a in table.axes: a.validate_names() - if not self.is_exists: + if not table.is_exists: # create the table - options = self.create_description( + options = table.create_description( complib=complib, complevel=complevel, fletcher32=fletcher32, @@ -4126,20 +4152,20 @@ def write( ) # set the table attributes - self.set_attrs() + table.set_attrs() # create the table - self._handle.create_table(self.group, **options) + table._handle.create_table(table.group, **options) # update my info - self.attrs.info = self.info + table.attrs.info = table.info # validate the axes and set the kinds - for a in self.axes: - a.validate_and_set(self, append) + for a in table.axes: + a.validate_and_set(table, append) # add the rows - self.write_data(chunksize, dropna=dropna) + table.write_data(chunksize, dropna=dropna) def write_data(self, chunksize: Optional[int], dropna: bool = False): """ we form the data into a 2-d including indexes,values,mask @@ -4962,7 +4988,7 @@ def _get_data_and_dtype_name(data: Union[np.ndarray, ABCExtensionArray]): if data.dtype.kind in ["m", "M"]: data = np.asarray(data.view("i8")) # TODO: we used to reshape for the dt64tz case, but no longer - # doing that doesnt seem to break anything. why? + # doing that doesn't seem to break anything. why? elif isinstance(data, PeriodIndex): data = data.asi8 diff --git a/pandas/io/sas/__init__.py b/pandas/io/sas/__init__.py index fa6b29a1a3fcc..8f81352e6aecb 100644 --- a/pandas/io/sas/__init__.py +++ b/pandas/io/sas/__init__.py @@ -1 +1 @@ -from .sasreader import read_sas # noqa +from pandas.io.sas.sasreader import read_sas # noqa diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index bb5bce96bc64b..40fea0aaf0d07 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -13,8 +13,7 @@ ctypedef unsigned short uint16_t # algorithm. 
It is partially documented here: # # https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf -cdef const uint8_t[:] rle_decompress(int result_length, - const uint8_t[:] inbuff): +cdef const uint8_t[:] rle_decompress(int result_length, const uint8_t[:] inbuff): cdef: uint8_t control_byte, x @@ -33,7 +32,7 @@ cdef const uint8_t[:] rle_decompress(int result_length, raise ValueError("Unexpected non-zero end_of_first_byte") nbytes = (inbuff[ipos]) + 64 ipos += 1 - for i in range(nbytes): + for _ in range(nbytes): result[rpos] = inbuff[ipos] rpos += 1 ipos += 1 @@ -42,20 +41,20 @@ cdef const uint8_t[:] rle_decompress(int result_length, nbytes = end_of_first_byte * 16 nbytes += (inbuff[ipos]) ipos += 1 - for i in range(nbytes): + for _ in range(nbytes): result[rpos] = inbuff[ipos] rpos += 1 ipos += 1 elif control_byte == 0x60: nbytes = end_of_first_byte * 256 + (inbuff[ipos]) + 17 ipos += 1 - for i in range(nbytes): + for _ in range(nbytes): result[rpos] = 0x20 rpos += 1 elif control_byte == 0x70: nbytes = end_of_first_byte * 256 + (inbuff[ipos]) + 17 ipos += 1 - for i in range(nbytes): + for _ in range(nbytes): result[rpos] = 0x00 rpos += 1 elif control_byte == 0x80: @@ -86,22 +85,22 @@ cdef const uint8_t[:] rle_decompress(int result_length, nbytes = end_of_first_byte + 3 x = inbuff[ipos] ipos += 1 - for i in range(nbytes): + for _ in range(nbytes): result[rpos] = x rpos += 1 elif control_byte == 0xD0: nbytes = end_of_first_byte + 2 - for i in range(nbytes): + for _ in range(nbytes): result[rpos] = 0x40 rpos += 1 elif control_byte == 0xE0: nbytes = end_of_first_byte + 2 - for i in range(nbytes): + for _ in range(nbytes): result[rpos] = 0x20 rpos += 1 elif control_byte == 0xF0: nbytes = end_of_first_byte + 2 - for i in range(nbytes): + for _ in range(nbytes): result[rpos] = 0x00 rpos += 1 else: @@ -117,8 +116,7 @@ cdef const uint8_t[:] rle_decompress(int result_length, # rdc_decompress decompresses data using the Ross Data Compression algorithm: # # http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm -cdef const uint8_t[:] rdc_decompress(int result_length, - const uint8_t[:] inbuff): +cdef const uint8_t[:] rdc_decompress(int result_length, const uint8_t[:] inbuff): cdef: uint8_t cmd @@ -233,8 +231,7 @@ cdef class Parser: int subheader_pointer_length int current_page_type bint is_little_endian - const uint8_t[:] (*decompress)(int result_length, - const uint8_t[:] inbuff) + const uint8_t[:] (*decompress)(int result_length, const uint8_t[:] inbuff) object parser def __init__(self, object parser): @@ -267,8 +264,7 @@ cdef class Parser: elif column_types[j] == b's': self.column_types[j] = column_type_string else: - raise ValueError("unknown column type: " - f"{self.parser.columns[j].ctype}") + raise ValueError(f"unknown column type: {self.parser.columns[j].ctype}") # compression if parser.compression == const.rle_compression: @@ -288,15 +284,14 @@ cdef class Parser: bint done int i - for i in range(nrows): + for _ in range(nrows): done = self.readline() if done: break # update the parser self.parser._current_row_on_page_index = self.current_row_on_page_index - self.parser._current_row_in_chunk_index =\ - self.current_row_in_chunk_index + self.parser._current_row_in_chunk_index = self.current_row_in_chunk_index self.parser._current_row_in_file_index = self.current_row_in_file_index cdef bint read_next_page(self): @@ -317,9 +312,9 @@ cdef class Parser: self.current_page_type = self.parser._current_page_type self.current_page_block_count = 
self.parser._current_page_block_count self.current_page_data_subheader_pointers_len = len( - self.parser._current_page_data_subheader_pointers) - self.current_page_subheaders_count =\ - self.parser._current_page_subheaders_count + self.parser._current_page_data_subheader_pointers + ) + self.current_page_subheaders_count = self.parser._current_page_subheaders_count cdef readline(self): @@ -357,19 +352,18 @@ cdef class Parser: return False elif (self.current_page_type == page_mix_types_0 or self.current_page_type == page_mix_types_1): - align_correction = (bit_offset + subheader_pointers_offset + - self.current_page_subheaders_count * - subheader_pointer_length) + align_correction = ( + bit_offset + + subheader_pointers_offset + + self.current_page_subheaders_count * subheader_pointer_length + ) align_correction = align_correction % 8 offset = bit_offset + align_correction offset += subheader_pointers_offset - offset += (self.current_page_subheaders_count * - subheader_pointer_length) + offset += self.current_page_subheaders_count * subheader_pointer_length offset += self.current_row_on_page_index * self.row_length - self.process_byte_array_with_data(offset, - self.row_length) - mn = min(self.parser.row_count, - self.parser._mix_page_row_count) + self.process_byte_array_with_data(offset, self.row_length) + mn = min(self.parser.row_count, self.parser._mix_page_row_count) if self.current_row_on_page_index == mn: done = self.read_next_page() if done: @@ -377,11 +371,12 @@ cdef class Parser: return False elif self.current_page_type & page_data_type == page_data_type: self.process_byte_array_with_data( - bit_offset + subheader_pointers_offset + - self.current_row_on_page_index * self.row_length, - self.row_length) - flag = (self.current_row_on_page_index == - self.current_page_block_count) + bit_offset + + subheader_pointers_offset + + self.current_row_on_page_index * self.row_length, + self.row_length, + ) + flag = self.current_row_on_page_index == self.current_page_block_count if flag: done = self.read_next_page() if done: diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index fe96b94e368e3..9b40778dbcfdf 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -13,7 +13,7 @@ Reference for binary data compression: http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm """ -from collections.abc import Iterator +from collections import abc from datetime import datetime import struct @@ -37,7 +37,7 @@ class _column: # SAS7BDAT represents a SAS data file in SAS7BDAT format. -class SAS7BDATReader(Iterator): +class SAS7BDATReader(abc.Iterator): """ Read SAS files in SAS7BDAT format. @@ -170,7 +170,7 @@ def _get_properties(self): if buf in const.encoding_names: self.file_encoding = const.encoding_names[buf] else: - self.file_encoding = "unknown (code={name!s})".format(name=buf) + self.file_encoding = f"unknown (code={buf})" # Get platform information buf = self._read_bytes(const.platform_offset, const.platform_length) @@ -294,8 +294,8 @@ def _read_bytes(self, offset, length): buf = self._path_or_buf.read(length) if len(buf) < length: self.close() - msg = "Unable to read {:d} bytes from file position {:d}." - raise ValueError(msg.format(length, offset)) + msg = f"Unable to read {length:d} bytes from file position {offset:d}." 
+ raise ValueError(msg) return buf else: if offset + length > len(self._cached_page): @@ -458,12 +458,8 @@ def _process_columnsize_subheader(self, offset, length): self.column_count = self._read_int(offset, int_len) if self.col_count_p1 + self.col_count_p2 != self.column_count: print( - "Warning: column count mismatch ({p1} + {p2} != " - "{column_count})\n".format( - p1=self.col_count_p1, - p2=self.col_count_p2, - column_count=self.column_count, - ) + f"Warning: column count mismatch ({self.col_count_p1} + " + f"{self.col_count_p2} != {self.column_count})\n" ) # Unknown purpose @@ -673,8 +669,11 @@ def _read_next_page(self): return True elif len(self._cached_page) != self._page_length: self.close() - msg = "failed to read complete page from file (read {:d} of {:d} bytes)" - raise ValueError(msg.format(len(self._cached_page), self._page_length)) + msg = ( + "failed to read complete page from file (read " + f"{len(self._cached_page):d} of {self._page_length:d} bytes)" + ) + raise ValueError(msg) self._read_page_header() page_type = self._current_page_type @@ -726,8 +725,6 @@ def _chunk_to_dataframe(self): js += 1 else: self.close() - raise ValueError( - "unknown column type {type}".format(type=self._column_types[j]) - ) + raise ValueError(f"unknown column type {self._column_types[j]}") return rslt diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index ccaee56383a5f..3cf7fd885e564 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -7,7 +7,7 @@ https://support.sas.com/techsup/technote/ts140.pdf """ -from collections.abc import Iterator +from collections import abc from datetime import datetime from io import BytesIO import struct @@ -251,7 +251,7 @@ def _parse_float_vec(vec): return ieee -class XportReader(Iterator): +class XportReader(abc.Iterator): __doc__ = _xport_reader_doc def __init__( @@ -367,8 +367,8 @@ def _read_header(self): fl = field["field_length"] if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)): self.close() - msg = "Floating field width {0} is not between 2 and 8." - raise TypeError(msg.format(fl)) + msg = f"Floating field width {fl} is not between 2 and 8." 
+ raise TypeError(msg) for k, v in field.items(): try: diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 56ebb583bc2f9..27d56d4ede403 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -49,8 +49,7 @@ def read_sas( if format is None: buffer_error_msg = ( "If this is a buffer object rather " - "than a string name, you must specify " - "a format string" + "than a string name, you must specify a format string" ) filepath_or_buffer = stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): diff --git a/pandas/io/spss.py b/pandas/io/spss.py index cf682ec72f284..cdbe14e9fe927 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -3,7 +3,8 @@ from pandas.compat._optional import import_optional_dependency -from pandas.api.types import is_list_like +from pandas.core.dtypes.inference import is_list_like + from pandas.core.api import DataFrame diff --git a/pandas/io/sql.py b/pandas/io/sql.py index b619ea93b981d..58fed0d18dd4a 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -241,7 +241,7 @@ def read_sql_table( try: meta.reflect(only=[table_name], views=True) except sqlalchemy.exc.InvalidRequestError: - raise ValueError("Table {name} not found".format(name=table_name)) + raise ValueError(f"Table {table_name} not found") pandas_sql = SQLDatabase(con, meta=meta) table = pandas_sql.read_table( @@ -256,7 +256,7 @@ def read_sql_table( if table is not None: return table else: - raise ValueError("Table {name} not found".format(name=table_name), con) + raise ValueError(f"Table {table_name} not found", con) def read_sql_query( @@ -498,7 +498,7 @@ def to_sql( .. versionadded:: 0.24.0 """ if if_exists not in ("fail", "replace", "append"): - raise ValueError("'{0}' is not valid for if_exists".format(if_exists)) + raise ValueError(f"'{if_exists}' is not valid for if_exists") pandas_sql = pandasSQL_builder(con, schema=schema) @@ -625,7 +625,7 @@ def __init__( self.table = self.pd_sql.get_table(self.name, self.schema) if self.table is None: - raise ValueError("Could not init table '{name}'".format(name=name)) + raise ValueError(f"Could not init table '{name}'") def exists(self): return self.pd_sql.has_table(self.name, self.schema) @@ -643,18 +643,14 @@ def _execute_create(self): def create(self): if self.exists(): if self.if_exists == "fail": - raise ValueError( - "Table '{name}' already exists.".format(name=self.name) - ) + raise ValueError(f"Table '{self.name}' already exists.") elif self.if_exists == "replace": self.pd_sql.drop_table(self.name, self.schema) self._execute_create() elif self.if_exists == "append": pass else: - raise ValueError( - "'{0}' is not valid for if_exists".format(self.if_exists) - ) + raise ValueError(f"'{self.if_exists}' is not valid for if_exists") else: self._execute_create() @@ -689,7 +685,7 @@ def insert_data(self): try: temp.reset_index(inplace=True) except ValueError as err: - raise ValueError("duplicate name in index/columns: {0}".format(err)) + raise ValueError(f"duplicate name in index/columns: {err}") else: temp = self.frame @@ -732,7 +728,7 @@ def insert(self, chunksize=None, method=None): elif callable(method): exec_insert = partial(method, self) else: - raise ValueError("Invalid parameter `method`: {}".format(method)) + raise ValueError(f"Invalid parameter `method`: {method}") keys, data_list = self.insert_data() @@ -786,7 +782,8 @@ def read(self, coerce_float=True, parse_dates=None, columns=None, chunksize=None cols = [self.table.c[n] for n in columns] if self.index is not None: - 
[cols.insert(0, self.table.c[idx]) for idx in self.index[::-1]] + for idx in self.index[::-1]: + cols.insert(0, self.table.c[idx]) sql_select = select(cols) else: sql_select = self.table.select() @@ -826,7 +823,7 @@ def _index_name(self, index, index_label): if len(index_label) != nlevels: raise ValueError( "Length of 'index_label' should match number of " - "levels, which is {0}".format(nlevels) + f"levels, which is {nlevels}" ) else: return index_label @@ -839,7 +836,7 @@ def _index_name(self, index, index_label): return ["index"] else: return [ - l if l is not None else "level_{0}".format(i) + l if l is not None else f"level_{i}" for i, l in enumerate(self.frame.index.names) ] @@ -980,8 +977,7 @@ def _sqlalchemy_type(self, col): if col_type == "timedelta64": warnings.warn( "the 'timedelta' type is not supported, and will be " - "written as integer values (ns frequency) to the " - "database.", + "written as integer values (ns frequency) to the database.", UserWarning, stacklevel=8, ) @@ -1304,10 +1300,7 @@ def to_sql( for col, my_type in dtype.items(): if not isinstance(to_instance(my_type), TypeEngine): - raise ValueError( - "The type of {column} is not a " - "SQLAlchemy type ".format(column=col) - ) + raise ValueError(f"The type of {col} is not a SQLAlchemy type") table = SQLTable( name, @@ -1331,11 +1324,11 @@ def to_sql( ) if name not in table_names: msg = ( - "The provided table name '{0}' is not found exactly as " + f"The provided table name '{name}' is not found exactly as " "such in the database after writing the table, possibly " "due to case sensitivity issues. Consider using lower " "case table names." - ).format(name) + ) warnings.warn(msg, UserWarning) @property @@ -1395,14 +1388,12 @@ def _get_unicode_name(name): try: uname = str(name).encode("utf-8", "strict").decode("utf-8") except UnicodeError: - raise ValueError( - "Cannot convert identifier to UTF-8: '{name}'".format(name=name) - ) + raise ValueError(f"Cannot convert identifier to UTF-8: '{name}'") return uname def _get_valid_sqlite_name(name): - # See http://stackoverflow.com/questions/6514274/how-do-you-escape-strings\ + # See https://stackoverflow.com/questions/6514274/how-do-you-escape-strings\ # -for-sqlite-table-column-names-in-python # Ensure the string can be encoded as UTF-8. # Ensure the string does not include any NUL characters. @@ -1421,8 +1412,7 @@ def _get_valid_sqlite_name(name): _SAFE_NAMES_WARNING = ( "The spaces in these column names will not be changed. " - "In pandas versions < 0.14, spaces were converted to " - "underscores." + "In pandas versions < 0.14, spaces were converted to underscores." 
) @@ -1456,13 +1446,14 @@ def insert_statement(self): escape = _get_valid_sqlite_name if self.index is not None: - [names.insert(0, idx) for idx in self.index[::-1]] + for idx in self.index[::-1]: + names.insert(0, idx) bracketed_names = [escape(column) for column in names] col_names = ",".join(bracketed_names) wildcards = ",".join([wld] * len(names)) - insert_statement = "INSERT INTO {table} ({columns}) VALUES ({wld})".format( - table=escape(self.name), columns=col_names, wld=wildcards + insert_statement = ( + f"INSERT INTO {escape(self.name)} ({col_names}) VALUES ({wildcards})" ) return insert_statement @@ -1496,9 +1487,7 @@ def _create_table_setup(self): keys = self.keys cnames_br = ", ".join(escape(c) for c in keys) create_tbl_stmts.append( - "CONSTRAINT {tbl}_pk PRIMARY KEY ({cnames_br})".format( - tbl=self.name, cnames_br=cnames_br - ) + f"CONSTRAINT {self.name}_pk PRIMARY KEY ({cnames_br})" ) create_stmts = [ @@ -1537,8 +1526,7 @@ def _sql_type_name(self, col): if col_type == "timedelta64": warnings.warn( "the 'timedelta' type is not supported, and will be " - "written as integer values (ns frequency) to the " - "database.", + "written as integer values (ns frequency) to the database.", UserWarning, stacklevel=8, ) @@ -1599,14 +1587,11 @@ def execute(self, *args, **kwargs): self.con.rollback() except Exception as inner_exc: # pragma: no cover ex = DatabaseError( - "Execution failed on sql: {sql}\n{exc}\nunable " - "to rollback".format(sql=args[0], exc=exc) + f"Execution failed on sql: {args[0]}\n{exc}\nunable to rollback" ) raise ex from inner_exc - ex = DatabaseError( - "Execution failed on sql '{sql}': {exc}".format(sql=args[0], exc=exc) - ) + ex = DatabaseError(f"Execution failed on sql '{args[0]}': {exc}") raise ex from exc @staticmethod @@ -1731,11 +1716,7 @@ def to_sql( if dtype is not None: for col, my_type in dtype.items(): if not isinstance(my_type, str): - raise ValueError( - "{column} ({type!s}) not a string".format( - column=col, type=my_type - ) - ) + raise ValueError(f"{col} ({my_type}) not a string") table = SQLiteTable( name, @@ -1755,9 +1736,7 @@ def has_table(self, name, schema=None): # esc_name = escape(name) wld = "?" 
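
For context, a minimal round trip through the SQLite fallback path touched above (plain `sqlite3` connection, no SQLAlchemy engine):

```python
import sqlite3

import pandas as pd

con = sqlite3.connect(":memory:")
df = pd.DataFrame({"name": ["a", "b"], "value": [1, 2]})

# if_exists must be 'fail', 'replace' or 'append'; anything else raises ValueError.
df.to_sql("demo", con, if_exists="replace", index=False)
print(pd.read_sql_query("SELECT * FROM demo", con))
```
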
- query = ( - "SELECT name FROM sqlite_master WHERE type='table' AND name={wld};" - ).format(wld=wld) + query = f"SELECT name FROM sqlite_master WHERE type='table' AND name={wld};" return len(self.execute(query, [name]).fetchall()) > 0 @@ -1765,7 +1744,7 @@ def get_table(self, table_name, schema=None): return None # not supported in fallback mode def drop_table(self, name, schema=None): - drop_sql = "DROP TABLE {name}".format(name=_get_valid_sqlite_name(name)) + drop_sql = f"DROP TABLE {_get_valid_sqlite_name(name)}" self.execute(drop_sql) def _create_sql_schema(self, frame, table_name, keys=None, dtype=None): diff --git a/pandas/io/stata.py b/pandas/io/stata.py index fc54a1fa2370d..cee5f3d280991 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -9,13 +9,13 @@ You can find more information on http://presbrey.mit.edu/PyDTA and http://www.statsmodels.org/devel/ """ -from collections.abc import Iterator +from collections import abc import datetime from io import BytesIO import os import struct import sys -from typing import Any +from typing import Any, Dict, Hashable, Optional, Sequence import warnings from dateutil.relativedelta import relativedelta @@ -23,6 +23,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array +from pandas._typing import FilePathOrBuffer from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -47,9 +48,10 @@ from pandas.io.common import get_filepath_or_buffer, stringify_path _version_error = ( - "Version of given Stata file is not 104, 105, 108, " - "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), " - "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)" + "Version of given Stata file is {version}. pandas supports importing " + "versions 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), " + "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16)," + "and 119 (Stata 15/16, over 32,767 variables)." ) _statafile_processing_params1 = """\ @@ -85,7 +87,7 @@ iterator : bool, default False Return StataReader object.""" -_read_stata_doc = """ +_read_stata_doc = f""" Read Stata file into DataFrame. Parameters @@ -100,10 +102,10 @@ By file-like object, we refer to objects with a ``read()`` method, such as a file handler (e.g. via builtin ``open`` function) or ``StringIO``. -%s -%s -%s -%s +{_statafile_processing_params1} +{_statafile_processing_params2} +{_chunksize_params} +{_iterator_params} Returns ------- @@ -125,33 +127,24 @@ >>> itr = pd.read_stata('filename.dta', chunksize=10000) >>> for chunk in itr: ... do_something(chunk) -""" % ( - _statafile_processing_params1, - _statafile_processing_params2, - _chunksize_params, - _iterator_params, -) +""" -_read_method_doc = """\ +_read_method_doc = f"""\ Reads observations from Stata file, converting them into a dataframe Parameters ---------- nrows : int Number of lines to read from data file, if None read whole file. -%s -%s +{_statafile_processing_params1} +{_statafile_processing_params2} Returns ------- DataFrame -""" % ( - _statafile_processing_params1, - _statafile_processing_params2, -) - +""" -_stata_reader_doc = """\ +_stata_reader_doc = f"""\ Class for reading Stata dta files. Parameters @@ -161,14 +154,10 @@ implementing a binary read() functions. .. versionadded:: 0.23.0 support for pathlib, py.path. 
-%s -%s -%s -""" % ( - _statafile_processing_params1, - _statafile_processing_params2, - _chunksize_params, -) +{_statafile_processing_params1} +{_statafile_processing_params2} +{_chunksize_params} +""" @Appender(_read_stata_doc) @@ -370,7 +359,7 @@ def convert_delta_safe(base, deltas, unit): month = np.ones_like(dates) conv_dates = convert_year_month_safe(year, month) else: - raise ValueError("Date fmt {fmt} not understood".format(fmt=fmt)) + raise ValueError(f"Date fmt {fmt} not understood") if has_bad_values: # Restore NaT for bad values conv_dates[bad_locs] = NaT @@ -465,9 +454,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): d = parse_dates_safe(dates, year=True) conv_dates = d.year else: - raise ValueError( - "Format {fmt} is not a known Stata date format".format(fmt=fmt) - ) + raise ValueError(f"Format {fmt} is not a known Stata date format") conv_dates = Series(conv_dates, dtype=np.float64) missing_value = struct.unpack("= 2 ** 53: - ws = precision_loss_doc % ("uint64", "float64") + ws = precision_loss_doc.format("uint64", "float64") data[col] = data[col].astype(dtype) @@ -585,25 +572,21 @@ def _cast_to_stata_types(data): else: data[col] = data[col].astype(np.float64) if data[col].max() >= 2 ** 53 or data[col].min() <= -(2 ** 53): - ws = precision_loss_doc % ("int64", "float64") + ws = precision_loss_doc.format("int64", "float64") elif dtype in (np.float32, np.float64): value = data[col].max() if np.isinf(value): raise ValueError( - "Column {col} has a maximum value of " - "infinity which is outside the range " - "supported by Stata.".format(col=col) + f"Column {col} has a maximum value of infinity which is outside " + "the range supported by Stata." ) if dtype == np.float32 and value > float32_max: data[col] = data[col].astype(np.float64) elif dtype == np.float64: if value > float64_max: raise ValueError( - "Column {col} has a maximum value " - "({val}) outside the range supported by " - "Stata ({float64_max})".format( - col=col, val=value, float64_max=float64_max - ) + f"Column {col} has a maximum value ({value}) outside the range " + f"supported by Stata ({float64_max})" ) if ws: @@ -618,26 +601,18 @@ class StataValueLabel: Parameters ---------- - value : int8, int16, int32, float32 or float64 - The Stata missing value code - - Attributes - ---------- - string : string - String representation of the Stata missing value - value : int8, int16, int32, float32 or float64 - The original encoded missing value - - Methods - ------- - generate_value_label - + catarray : Categorical + Categorical Series to encode + encoding : {"latin-1", "utf-8"} + Encoding to use for value labels. """ - def __init__(self, catarray): + def __init__(self, catarray, encoding="latin-1"): + if encoding not in ("latin-1", "utf-8"): + raise ValueError("Only latin-1 and utf-8 are supported.") self.labname = catarray.name - + self._encoding = encoding categories = catarray.cat.categories self.value_labels = list(zip(np.arange(len(categories)), categories)) self.value_labels.sort(key=lambda x: x[0]) @@ -656,7 +631,7 @@ def __init__(self, catarray): value_label_mismatch_doc.format(catarray.name), ValueLabelTypeMismatch, ) - + category = category.encode(encoding) self.off.append(self.text_len) self.text_len += len(category) + 1 # +1 for the padding self.val.append(vl[0]) @@ -666,8 +641,7 @@ def __init__(self, catarray): if self.text_len > 32000: raise ValueError( "Stata value labels for a single variable must " - "have a combined length less than 32,000 " - "characters." 
+ "have a combined length less than 32,000 characters." ) # Ensure int32 @@ -683,31 +657,31 @@ def _encode(self, s): """ return s.encode(self._encoding) - def generate_value_label(self, byteorder, encoding): + def generate_value_label(self, byteorder): """ + Generate the binary representation of the value labals. + Parameters ---------- byteorder : str Byte order of the output - encoding : str - File encoding Returns ------- value_label : bytes Bytes containing the formatted value label """ - - self._encoding = encoding + encoding = self._encoding bio = BytesIO() - null_string = "\x00" null_byte = b"\x00" # len bio.write(struct.pack(byteorder + "i", self.len)) # labname - labname = self._encode(_pad_bytes(self.labname[:32], 33)) + labname = self.labname[:32].encode(encoding) + lab_len = 32 if encoding not in ("utf-8", "utf8") else 128 + labname = _pad_bytes(labname, lab_len + 1) bio.write(labname) # padding - 3 bytes @@ -731,7 +705,7 @@ def generate_value_label(self, byteorder, encoding): # txt - Text labels, null terminated for text in self.txt: - bio.write(self._encode(text + null_string)) + bio.write(text + null_byte) bio.seek(0) return bio.read() @@ -1007,10 +981,26 @@ def __init__(self): "typedef", "typename", "virtual", + "_all", + "_N", + "_skip", + "_b", + "_pi", + "str#", + "in", + "_pred", + "strL", + "_coef", + "_rc", + "using", + "_cons", + "_se", + "with", + "_n", ) -class StataReader(StataParser, Iterator): +class StataReader(StataParser, abc.Iterator): __doc__ = _stata_reader_doc def __init__( @@ -1102,11 +1092,11 @@ def _read_header(self): self.col_sizes = [self._calcsize(typ) for typ in self.typlist] def _read_new_header(self, first_char): - # The first part of the header is common to 117 and 118. + # The first part of the header is common to 117 - 119. self.path_or_buf.read(27) # stata_dta>
self.format_version = int(self.path_or_buf.read(3)) if self.format_version not in [117, 118, 119]: - raise ValueError(_version_error) + raise ValueError(_version_error.format(version=self.format_version)) self._set_encoding() self.path_or_buf.read(21) # self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<" @@ -1192,7 +1182,7 @@ def f(typ): try: return self.TYPE_MAP_XML[typ] except KeyError: - raise ValueError("cannot convert stata types [{0}]".format(typ)) + raise ValueError(f"cannot convert stata types [{typ}]") typlist = [f(x) for x in raw_typlist] @@ -1202,7 +1192,7 @@ def f(typ): try: return self.DTYPE_MAP_XML[typ] except KeyError: - raise ValueError("cannot convert stata dtype [{0}]".format(typ)) + raise ValueError(f"cannot convert stata dtype [{typ}]") dtyplist = [f(x) for x in raw_typlist] @@ -1299,7 +1289,7 @@ def _get_seek_variable_labels(self): def _read_old_header(self, first_char): self.format_version = struct.unpack("b", first_char)[0] if self.format_version not in [104, 105, 108, 111, 113, 114, 115]: - raise ValueError(_version_error) + raise ValueError(_version_error.format(version=self.format_version)) self._set_encoding() self.byteorder = ( struct.unpack("b", self.path_or_buf.read(1))[0] == 0x1 and ">" or "<" @@ -1330,19 +1320,13 @@ def _read_old_header(self, first_char): try: self.typlist = [self.TYPE_MAP[typ] for typ in typlist] except ValueError: - raise ValueError( - "cannot convert stata types [{0}]".format( - ",".join(str(x) for x in typlist) - ) - ) + invalid_types = ",".join(str(x) for x in typlist) + raise ValueError(f"cannot convert stata types [{invalid_types}]") try: self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] except ValueError: - raise ValueError( - "cannot convert stata dtypes [{0}]".format( - ",".join(str(x) for x in typlist) - ) - ) + invalid_dtypes = ",".join(str(x) for x in typlist) + raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") if self.format_version > 108: self.varlist = [ @@ -1415,12 +1399,13 @@ def _decode(self, s): except UnicodeDecodeError: # GH 25960, fallback to handle incorrect format produced when 117 # files are converted to 118 files in Stata - msg = """ + encoding = self._encoding + msg = f""" One or more strings in the dta file could not be decoded using {encoding}, and so the fallback encoding of latin-1 is being used. This can happen when a file has been incorrectly encoded by Stata or some other software. You should verify the string values returned are correct.""" - warnings.warn(msg.format(encoding=self._encoding), UnicodeWarning) + warnings.warn(msg, UnicodeWarning) return s.decode("latin-1") def _read_value_labels(self): @@ -1745,9 +1730,10 @@ def _do_select_columns(self, data, columns): raise ValueError("columns contains duplicate entries") unmatched = column_set.difference(data.columns) if unmatched: + joined = ", ".join(list(unmatched)) raise ValueError( - "The following columns were not found in the " - "Stata data set: " + ", ".join(list(unmatched)) + "The following columns were not " + f"found in the Stata data set: {joined}" ) # Copy information for retained columns for later processing dtyplist = [] @@ -1794,7 +1780,7 @@ def _do_convert_categoricals( repeats = list(vc.index[vc > 1]) repeats = "-" * 80 + "\n" + "\n".join(repeats) # GH 25772 - msg = """ + msg = f""" Value labels for column {col} are not unique. These cannot be converted to pandas categoricals. 
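
A sketch of the usual workaround for files whose value labels are not unique: skip the Categorical conversion and read the label mapping separately through the lower-level reader (hypothetical file name):

```python
import pandas as pd
from pandas.io.stata import StataReader

# Read the raw codes instead of Categoricals.
df = pd.read_stata("labelled.dta", convert_categoricals=False)

# Fetch the stored value-label mappings on their own.
reader = StataReader("labelled.dta")
value_labels = reader.value_labels()
reader.close()
```
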
@@ -1805,7 +1791,7 @@ def _do_convert_categoricals( The repeated labels are: {repeats} """ - raise ValueError(msg.format(col=col, repeats=repeats)) + raise ValueError(msg) # TODO: is the next line needed above in the data(...) method? cat_data = Series(cat_data, index=data.index) cat_converted_data.append((col, cat_data)) @@ -1874,13 +1860,15 @@ def _set_endianness(endianness): elif endianness.lower() in [">", "big"]: return ">" else: # pragma : no cover - raise ValueError("Endianness {endian} not understood".format(endian=endianness)) + raise ValueError(f"Endianness {endianness} not understood") def _pad_bytes(name, length): """ Take a char string and pads it with null bytes until it's length chars. """ + if isinstance(name, bytes): + return name + b"\x00" * (length - len(name)) return name + "\x00" * (length - len(name)) @@ -1906,7 +1894,7 @@ def _convert_datetime_to_stata_type(fmt): ]: return np.float64 # Stata expects doubles for SIFs else: - raise NotImplementedError("Format {fmt} not implemented".format(fmt=fmt)) + raise NotImplementedError(f"Format {fmt} not implemented") def _maybe_convert_to_int_keys(convert_dates, varlist): @@ -1956,9 +1944,7 @@ def _dtype_to_stata_type(dtype, column): elif dtype == np.int8: return 251 else: # pragma : no cover - raise NotImplementedError( - "Data type {dtype} not supported.".format(dtype=dtype) - ) + raise NotImplementedError(f"Data type {dtype} not supported.") def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False): @@ -1985,24 +1971,12 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False if force_strl: return "%9s" if dtype.type == np.object_: - inferred_dtype = infer_dtype(column, skipna=True) - if not (inferred_dtype in ("string", "unicode") or len(column) == 0): - raise ValueError( - "Column `{col}` cannot be exported.\n\nOnly " - "string-like object arrays containing all " - "strings or a mix of strings and None can be " - "exported. Object arrays containing only null " - "values are prohibited. Other object types" - "cannot be exported and must first be converted " - "to one of the supported " - "types.".format(col=column.name) - ) itemsize = max_len_string_array(ensure_object(column.values)) if itemsize > max_str_len: if dta_version >= 117: return "%9s" else: - raise ValueError(excessive_string_length_error % column.name) + raise ValueError(excessive_string_length_error.format(column.name)) return "%" + str(max(itemsize, 1)) + "s" elif dtype == np.float64: return "%10.0g" @@ -2013,9 +1987,7 @@ def _dtype_to_default_stata_fmt(dtype, column, dta_version=114, force_strl=False elif dtype == np.int8 or dtype == np.int16: return "%8.0g" else: # pragma : no cover - raise NotImplementedError( - "Data type {dtype} not supported.".format(dtype=dtype) - ) + raise NotImplementedError(f"Data type {dtype} not supported.") class StataWriter(StataParser): @@ -2043,8 +2015,6 @@ class StataWriter(StataParser): timezone information write_index : bool Write the index to Stata dataset. - encoding : str - Default is latin-1. Only latin-1 and ascii are supported. byteorder : str Can be ">", "<", "little", or "big". 
default is `sys.byteorder` time_stamp : datetime @@ -2086,6 +2056,7 @@ class StataWriter(StataParser): """ _max_string_length = 244 + _encoding = "latin-1" def __init__( self, @@ -2101,7 +2072,6 @@ def __init__( super().__init__() self._convert_dates = {} if convert_dates is None else convert_dates self._write_index = write_index - self._encoding = "latin-1" self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels @@ -2136,7 +2106,8 @@ def _prepare_categoricals(self, data): data_formatted = [] for col, col_is_cat in zip(data, is_cat): if col_is_cat: - self._value_labels.append(StataValueLabel(data[col])) + svl = StataValueLabel(data[col], encoding=self._encoding) + self._value_labels.append(svl) dtype = data[col].cat.codes.dtype if dtype == np.int64: raise ValueError( @@ -2181,6 +2152,36 @@ def _update_strl_names(self): """No-op, forward compatibility""" pass + def _validate_variable_name(self, name): + """ + Validate variable names for Stata export. + + Parameters + ---------- + name : str + Variable name + + Returns + ------- + str + The validated name with invalid characters replaced with + underscores. + + Notes + ----- + Stata 114 and 117 support ascii characters in a-z, A-Z, 0-9 + and _. + """ + for c in name: + if ( + (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ): + name = name.replace(c, "_") + return name + def _check_column_names(self, data): """ Checks column names to ensure that they are valid Stata column names. @@ -2204,14 +2205,7 @@ def _check_column_names(self, data): if not isinstance(name, str): name = str(name) - for c in name: - if ( - (c < "A" or c > "Z") - and (c < "a" or c > "z") - and (c < "0" or c > "9") - and c != "_" - ): - name = name.replace(c, "_") + name = self._validate_variable_name(name) # Variable name must not be a reserved word if name in self.RESERVED_WORDS: @@ -2251,7 +2245,7 @@ def _check_column_names(self, data): orig_name = orig_name.encode("utf-8") except (UnicodeDecodeError, AttributeError): pass - msg = "{0} -> {1}".format(orig_name, name) + msg = f"{orig_name} -> {name}" conversion_warning.append(msg) ws = invalid_name_doc.format("\n ".join(conversion_warning)) @@ -2262,12 +2256,12 @@ def _check_column_names(self, data): return data - def _set_formats_and_types(self, data, dtypes): + def _set_formats_and_types(self, dtypes): self.typlist = [] self.fmtlist = [] for col, dtype in dtypes.items(): - self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, data[col])) - self.typlist.append(_dtype_to_stata_type(dtype, data[col])) + self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col])) + self.typlist.append(_dtype_to_stata_type(dtype, self.data[col])) def _prepare_pandas(self, data): # NOTE: we might need a different API / class for pandas objects so @@ -2311,17 +2305,57 @@ def _prepare_pandas(self, data): new_type = _convert_datetime_to_stata_type(self._convert_dates[key]) dtypes[key] = np.dtype(new_type) - self._set_formats_and_types(data, dtypes) + # Verify object arrays are strings and encode to bytes + self._encode_strings() + + self._set_formats_and_types(dtypes) # set the given format for the datetime cols if self._convert_dates is not None: for key in self._convert_dates: self.fmtlist[key] = self._convert_dates[key] + def _encode_strings(self): + """ + Encode strings in dta-specific encoding + + Do not encode columns marked for date conversion or for strL + conversion. 
The strL converter independently handles conversion and + also accepts empty string arrays. + """ + convert_dates = self._convert_dates + # _convert_strl is not available in dta 114 + convert_strl = getattr(self, "_convert_strl", []) + for i, col in enumerate(self.data): + # Skip columns marked for date conversion or strl conversion + if i in convert_dates or col in convert_strl: + continue + column = self.data[col] + dtype = column.dtype + if dtype.type == np.object_: + inferred_dtype = infer_dtype(column, skipna=True) + if not ((inferred_dtype in ("string")) or len(column) == 0): + col = column.name + raise ValueError( + f"""\ +Column `{col}` cannot be exported.\n\nOnly string-like object arrays +containing all strings or a mix of strings and None can be exported. +Object arrays containing only null values are prohibited. Other object +types cannot be exported and must first be converted to one of the +supported types.""" + ) + encoded = self.data[col].str.encode(self._encoding) + # If larger than _max_string_length do nothing + if ( + max_len_string_array(ensure_object(encoded.values)) + <= self._max_string_length + ): + self.data[col] = encoded + def write_file(self): self._file, self._own_file = _open_file_binary_write(self._fname) try: - self._write_header(time_stamp=self._time_stamp, data_label=self._data_label) + self._write_header(data_label=self._data_label, time_stamp=self._time_stamp) self._write_map() self._write_variable_types() self._write_varnames() @@ -2344,9 +2378,8 @@ def write_file(self): os.unlink(self._fname) except OSError: warnings.warn( - "This save was not successful but {0} could not " - "be deleted. This file is not " - "valid.".format(self._fname), + f"This save was not successful but {self._fname} could not " + "be deleted. This file is not valid.", ResourceWarning, ) raise exc @@ -2392,7 +2425,7 @@ def _write_expansion_fields(self): def _write_value_labels(self): for vl in self._value_labels: - self._file.write(vl.generate_value_label(self._byteorder, self._encoding)) + self._file.write(vl.generate_value_label(self._byteorder)) def _write_header(self, data_label=None, time_stamp=None): byteorder = self._byteorder @@ -2494,9 +2527,8 @@ def _write_variable_labels(self): is_latin1 = all(ord(c) < 256 for c in label) if not is_latin1: raise ValueError( - "Variable labels must contain only " - "characters that can be encoded in " - "Latin-1" + "Variable labels must contain only characters that " + "can be encoded in Latin-1" ) self._write(_pad_bytes(label, 81)) else: @@ -2527,9 +2559,9 @@ def _prepare_data(self): typ = typlist[i] if typ <= self._max_string_length: data[col] = data[col].fillna("").apply(_pad_bytes, args=(typ,)) - stype = "S{type}".format(type=typ) + stype = f"S{typ}" dtypes[col] = stype - data[col] = data[col].str.encode(self._encoding).astype(stype) + data[col] = data[col].astype(stype) else: dtype = data[col].dtype if not native_byteorder: @@ -2665,7 +2697,7 @@ def _convert_key(self, key): def generate_table(self): """ - Generates the GSO lookup table for the DataFRame + Generates the GSO lookup table for the DataFrame Returns ------- @@ -2715,12 +2747,6 @@ def generate_table(self): return gso_table, gso_df - def _encode(self, s): - """ - Python 3 compatibility shim - """ - return s.encode(self._encoding) - def generate_blob(self, gso_table): """ Generates the binary blob of GSOs that is written to the dta file. 
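
A usage sketch of the strL path that `StataStrLWriter` serves above, via `DataFrame.to_stata` and a hypothetical output path:

```python
import pandas as pd

df = pd.DataFrame({"note": ["x" * 3000, "short", ""]})

# dta version 117 stores long strings as strL blobs; columns shorter than
# 2045 characters can also be forced to strL with convert_strl.
df.to_stata("notes.dta", version=117, convert_strl=["note"])
```
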
@@ -2860,6 +2886,7 @@ class StataWriter117(StataWriter): """ _max_string_length = 2045 + _dta_version = 117 def __init__( self, @@ -2906,18 +2933,21 @@ def _write_header(self, data_label=None, time_stamp=None): self._file.write(bytes("", "utf-8")) bio = BytesIO() # ds_format - 117 - bio.write(self._tag(bytes("117", "utf-8"), "release")) + bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release")) # byteorder bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder")) - # number of vars, 2 bytes - assert self.nvar < 2 ** 16 - bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), "K")) - # number of obs, 4 bytes - bio.write(self._tag(struct.pack(byteorder + "I", self.nobs), "N")) + # number of vars, 2 bytes in 117 and 118, 4 byte in 119 + nvar_type = "H" if self._dta_version <= 118 else "I" + bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K")) + # 117 uses 4 bytes, 118 uses 8 + nobs_size = "I" if self._dta_version == 117 else "Q" + bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N")) # data label 81 bytes, char, null terminated label = data_label[:80] if data_label is not None else "" - label_len = struct.pack(byteorder + "B", len(label)) - label = label_len + bytes(label, "utf-8") + label = label.encode(self._encoding) + label_size = "B" if self._dta_version == 117 else "H" + label_len = struct.pack(byteorder + label_size, len(label)) + label = label_len + label bio.write(self._tag(label, "label")) # time stamp, 18 bytes, char, null terminated # format dd Mon yyyy hh:mm @@ -2947,7 +2977,7 @@ def _write_header(self, data_label=None, time_stamp=None): + time_stamp.strftime(" %Y %H:%M") ) # '\x11' added due to inspection of Stata file - ts = b"\x11" + bytes(ts, "utf8") + ts = b"\x11" + bytes(ts, "utf-8") bio.write(self._tag(ts, "timestamp")) bio.seek(0) self._file.write(self._tag(bio.read(), "header")) @@ -2994,35 +3024,41 @@ def _write_variable_types(self): def _write_varnames(self): self._update_map("varnames") bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vn_len = 32 if self._dta_version == 117 else 128 for name in self.varlist: name = self._null_terminate(name, True) - name = _pad_bytes_new(name[:32], 33) + name = _pad_bytes_new(name[:32].encode(self._encoding), vn_len + 1) bio.write(name) bio.seek(0) self._file.write(self._tag(bio.read(), "varnames")) def _write_sortlist(self): self._update_map("sortlist") - self._file.write(self._tag(b"\x00\00" * (self.nvar + 1), "sortlist")) + sort_size = 2 if self._dta_version < 119 else 4 + self._file.write(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist")) def _write_formats(self): self._update_map("formats") bio = BytesIO() + fmt_len = 49 if self._dta_version == 117 else 57 for fmt in self.fmtlist: - bio.write(_pad_bytes_new(fmt, 49)) + bio.write(_pad_bytes_new(fmt.encode(self._encoding), fmt_len)) bio.seek(0) self._file.write(self._tag(bio.read(), "formats")) def _write_value_label_names(self): self._update_map("value_label_names") bio = BytesIO() + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vl_len = 32 if self._dta_version == 117 else 128 for i in range(self.nvar): # Use variable name when categorical name = "" # default name if self._is_col_cat[i]: name = self.varlist[i] name = self._null_terminate(name, True) - name = _pad_bytes_new(name[:32], 33) + name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1) bio.write(name) bio.seek(0) self._file.write(self._tag(bio.read(), "value_label_names")) 
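
The sizes written by these `_write_*` methods differ by dta version; a compact, illustrative summary (not pandas API) of the widths used in the hunks above:

```python
# version: (nvar struct code, nobs struct code, varname bytes, format bytes,
#           sortlist entry bytes, value-label-name bytes)
DTA_LAYOUT = {
    117: ("H", "I", 32, 49, 2, 32),
    118: ("H", "Q", 128, 57, 2, 128),
    119: ("I", "Q", 128, 57, 4, 128),
}
```
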
@@ -3031,7 +3067,9 @@ def _write_variable_labels(self): # Missing labels are 80 blank characters plus null termination self._update_map("variable_labels") bio = BytesIO() - blank = _pad_bytes_new("", 81) + # 118 scales by 4 to accommodate utf-8 data worst case encoding + vl_len = 80 if self._dta_version == 117 else 320 + blank = _pad_bytes_new("", vl_len + 1) if self._variable_labels is None: for _ in range(self.nvar): @@ -3045,14 +3083,15 @@ def _write_variable_labels(self): label = self._variable_labels[col] if len(label) > 80: raise ValueError("Variable labels must be 80 characters or fewer") - is_latin1 = all(ord(c) < 256 for c in label) - if not is_latin1: + try: + encoded = label.encode(self._encoding) + except UnicodeEncodeError: raise ValueError( - "Variable labels must contain only " - "characters that can be encoded in " - "Latin-1" + "Variable labels must contain only characters that " + f"can be encoded in {self._encoding}" ) - bio.write(_pad_bytes_new(label, 81)) + + bio.write(_pad_bytes_new(encoded, vl_len + 1)) else: bio.write(blank) bio.seek(0) @@ -3084,7 +3123,7 @@ def _write_value_labels(self): self._update_map("value_labels") bio = BytesIO() for vl in self._value_labels: - lab = vl.generate_value_label(self._byteorder, self._encoding) + lab = vl.generate_value_label(self._byteorder) lab = self._tag(lab, "lbl") bio.write(lab) bio.seek(0) @@ -3114,19 +3153,181 @@ def _convert_strls(self, data): ] if convert_cols: - ssw = StataStrLWriter(data, convert_cols) + ssw = StataStrLWriter(data, convert_cols, version=self._dta_version) tab, new_data = ssw.generate_table() data = new_data self._strl_blob = ssw.generate_blob(tab) return data - def _set_formats_and_types(self, data, dtypes): + def _set_formats_and_types(self, dtypes): self.typlist = [] self.fmtlist = [] for col, dtype in dtypes.items(): force_strl = col in self._convert_strl fmt = _dtype_to_default_stata_fmt( - dtype, data[col], dta_version=117, force_strl=force_strl + dtype, + self.data[col], + dta_version=self._dta_version, + force_strl=force_strl, ) self.fmtlist.append(fmt) - self.typlist.append(_dtype_to_stata_type_117(dtype, data[col], force_strl)) + self.typlist.append( + _dtype_to_stata_type_117(dtype, self.data[col], force_strl) + ) + + +class StataWriterUTF8(StataWriter117): + """ + Stata binary dta file writing in Stata 15 (118) and 16 (119) formats + + DTA 118 and 119 format files support unicode string data (both fixed + and strL) format. Unicode is also supported in value labels, variable + labels and the dataset label. Format 119 is automatically used if the + file contains more than 32,767 variables. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + fname : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() functions. If using a buffer + then the buffer will not be automatically closed after the file + is written. + data : DataFrame + Input to save + convert_dates : dict, default None + Dictionary mapping columns containing datetime types to stata internal + format to use when writing the dates. Options are 'tc', 'td', 'tm', + 'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name. + Datetime columns that do not have a conversion type specified will be + converted to 'tc'. Raises NotImplementedError if a datetime column has + timezone information + write_index : bool, default True + Write the index to Stata dataset. + byteorder : str, default None + Can be ">", "<", "little", or "big". 
default is `sys.byteorder` + time_stamp : datetime, default None + A datetime to use as file creation date. Default is the current time + data_label : str, default None + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict, default None + Dictionary containing columns as keys and variable labels as values. + Each label must be 80 characters or smaller. + convert_strl : list, default None + List of columns names to convert to Stata StrL format. Columns with + more than 2045 characters are automatically written as StrL. + Smaller columns can be converted by including the column name. Using + StrLs can reduce output file size when strings are longer than 8 + characters, and either frequently repeated or sparse. + version : int, default None + The dta version to use. By default, uses the size of data to determine + the version. 118 is used if data.shape[1] <= 32767, and 119 is used + for storing larger DataFrames. + + Returns + ------- + StataWriterUTF8 + The instance has a write_file method, which will write the file to the + given `fname`. + + Raises + ------ + NotImplementedError + * If datetimes contain timezone information + ValueError + * Columns listed in convert_dates are neither datetime64[ns] + or datetime.datetime + * Column dtype is not representable in Stata + * Column listed in convert_dates is not in DataFrame + * Categorical label contains more than 32,000 characters + + Examples + -------- + Using Unicode data and column names + + >>> from pandas.io.stata import StataWriterUTF8 + >>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ']) + >>> writer = StataWriterUTF8('./data_file.dta', data) + >>> writer.write_file() + + Or with long strings stored in strl format + + >>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']], + ... columns=['strls']) + >>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data, + ... convert_strl=['strls']) + >>> writer.write_file() + """ + + _encoding = "utf-8" + + def __init__( + self, + fname: FilePathOrBuffer, + data: DataFrame, + convert_dates: Optional[Dict[Hashable, str]] = None, + write_index: bool = True, + byteorder: Optional[str] = None, + time_stamp: Optional[datetime.datetime] = None, + data_label: Optional[str] = None, + variable_labels: Optional[Dict[Hashable, str]] = None, + convert_strl: Optional[Sequence[Hashable]] = None, + version: Optional[int] = None, + ): + if version is None: + version = 118 if data.shape[1] <= 32767 else 119 + elif version not in (118, 119): + raise ValueError("version must be either 118 or 119.") + elif version == 118 and data.shape[1] > 32767: + raise ValueError( + "You must use version 119 for data sets containing more than" + "32,767 variables" + ) + + super().__init__( + fname, + data, + convert_dates=convert_dates, + write_index=write_index, + byteorder=byteorder, + time_stamp=time_stamp, + data_label=data_label, + variable_labels=variable_labels, + convert_strl=convert_strl, + ) + # Override version set in StataWriter117 init + self._dta_version = version + + def _validate_variable_name(self, name: str) -> str: + """ + Validate variable names for Stata export. + + Parameters + ---------- + name : str + Variable name + + Returns + ------- + str + The validated name with invalid characters replaced with + underscores. + + Notes + ----- + Stata 118+ support most unicode characters. The only limitation is in + the ascii range where the characters supported are a-z, A-Z, 0-9 and _. 
+ """ + # High code points appear to be acceptable + for c in name: + if ( + ord(c) < 128 + and (c < "A" or c > "Z") + and (c < "a" or c > "z") + and (c < "0" or c > "9") + and c != "_" + ) or 128 <= ord(c) < 256: + name = name.replace(c, "_") + + return name diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index f9a692b0559ca..27b1d55fe1bd6 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING, Dict, Type + from pandas.plotting._matplotlib.boxplot import ( BoxPlot, boxplot, @@ -26,7 +28,10 @@ ) from pandas.plotting._matplotlib.tools import table -PLOT_CLASSES = { +if TYPE_CHECKING: + from pandas.plotting._matplotlib.core import MPLPlot # noqa: F401 + +PLOT_CLASSES: Dict[str, Type["MPLPlot"]] = { "line": LinePlot, "bar": BarPlot, "barh": BarhPlot, diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 5b37ebb42aecc..a1035fd0823bb 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -421,8 +421,7 @@ def __call__(self): if estimate > self.MAXTICKS * 2: raise RuntimeError( "MillisecondLocator estimated to generate " - f"{estimate:d} ticks from {dmin} to {dmax}: " - "exceeds Locator.MAXTICKS" + f"{estimate:d} ticks from {dmin} to {dmax}: exceeds Locator.MAXTICKS" f"* 2 ({self.MAXTICKS * 2:d}) " ) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 2a6da18096c84..2d68bb46a8ada 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -229,10 +229,9 @@ def _validate_color_args(self): for char in s: if char in matplotlib.colors.BASE_COLORS: raise ValueError( - "Cannot pass 'style' string with a color " - "symbol and 'color' keyword argument. Please" - " use one or the other or pass 'style' " - "without a color symbol" + "Cannot pass 'style' string with a color symbol and " + "'color' keyword argument. Please use one or the other or " + "pass 'style' without a color symbol" ) def _iter_data(self, data=None, keep_index=False, fillna=None): @@ -241,12 +240,6 @@ def _iter_data(self, data=None, keep_index=False, fillna=None): if fillna is not None: data = data.fillna(fillna) - # TODO: unused? 
- # if self.sort_columns: - # columns = com.try_sort(data.columns) - # else: - # columns = data.columns - for col, values in data.items(): if keep_index is True: yield col, values @@ -401,6 +394,10 @@ def _compute_plot_data(self): include_type = [np.number] exclude_type = ["timedelta"] + # GH 18755, include object and category type for scatter plot + if self._kind == "scatter": + include_type.extend(["object", "category"]) + numeric_data = data.select_dtypes(include=include_type, exclude=exclude_type) try: @@ -872,10 +869,13 @@ def __init__(self, data, x, y, **kwargs): x = self.data.columns[x] if is_integer(y) and not self.data.columns.holds_integer(): y = self.data.columns[y] - if len(self.data[x]._get_numeric_data()) == 0: - raise ValueError(self._kind + " requires x column to be numeric") - if len(self.data[y]._get_numeric_data()) == 0: - raise ValueError(self._kind + " requires y column to be numeric") + + # Scatter plot allows to plot objects data + if self._kind == "hexbin": + if len(self.data[x]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires x column to be numeric") + if len(self.data[y]._get_numeric_data()) == 0: + raise ValueError(self._kind + " requires y column to be numeric") self.x = x self.y = y diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index 8957389ac2b13..d54fc73b495ba 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -250,7 +250,8 @@ def _grouped_hist( def plot_group(group, ax): ax.hist(group.dropna().values, bins=bins, **kwargs) - xrot = xrot or rot + if xrot is None: + xrot = rot fig, axes = _grouped_plot( plot_group, @@ -317,8 +318,7 @@ def hist_series( if "figure" in kwds: raise ValueError( "Cannot pass 'figure' when using the " - "'by' argument, since a new 'Figure' instance " - "will be created" + "'by' argument, since a new 'Figure' instance will be created" ) axes = _grouped_hist( self, diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index dd4034a97f58e..d7732c86911b8 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -190,8 +190,7 @@ def _subplots( if sharex or sharey: warnings.warn( "When passing multiple axes, sharex and sharey " - "are ignored. These settings must be specified " - "when creating axes", + "are ignored. These settings must be specified when creating axes", UserWarning, stacklevel=4, ) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 7f208436ddc4a..ccd42d3940431 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -453,7 +453,7 @@ def __delitem__(self, key): raise ValueError(f"Cannot remove default parameter {key}") return super().__delitem__(key) - def __contains__(self, key): + def __contains__(self, key) -> bool: key = self._get_canonical_key(key) return super().__contains__(key) diff --git a/pandas/testing.py b/pandas/testing.py index acae47367d997..0445fa5b5efc0 100644 --- a/pandas/testing.py +++ b/pandas/testing.py @@ -1,11 +1,17 @@ -# flake8: noqa - """ Public testing utility functions. 
""" -from pandas.util.testing import ( +from pandas._testing import ( + assert_extension_array_equal, assert_frame_equal, assert_index_equal, assert_series_equal, ) + +__all__ = [ + "assert_extension_array_equal", + "assert_frame_equal", + "assert_series_equal", + "assert_index_equal", +] diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 900ba878e4c0a..406d5f055797d 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -1,8 +1,12 @@ +import subprocess +import sys from typing import List +import pytest + import pandas as pd from pandas import api, compat -import pandas.util.testing as tm +import pandas._testing as tm class Base: @@ -20,7 +24,6 @@ def check(self, namespace, expected, ignored=None): class TestPDApi(Base): - # these are optionally imported based on testing # & need to be ignored ignored = ["tests", "locale", "conftest"] @@ -43,7 +46,7 @@ class TestPDApi(Base): ] # these are already deprecated; awaiting removal - deprecated_modules: List[str] = [] + deprecated_modules: List[str] = ["np", "datetime"] # misc misc = ["IndexSlice", "NaT", "NA"] @@ -68,7 +71,6 @@ class TestPDApi(Base): "RangeIndex", "UInt64Index", "Series", - "SparseArray", "SparseDtype", "StringDtype", "Timedelta", @@ -91,17 +93,20 @@ class TestPDApi(Base): "UInt64Dtype", "NamedAgg", ] - if not compat.PY37: - classes.extend(["Panel", "SparseSeries", "SparseDataFrame"]) # these are already deprecated; awaiting removal deprecated_classes: List[str] = [] # these should be deprecated in the future - deprecated_classes_in_future: List[str] = [] + deprecated_classes_in_future: List[str] = ["SparseArray"] + + if not compat.PY37: + classes.extend(["Panel", "SparseSeries", "SparseDataFrame"]) + # deprecated_modules.extend(["np", "datetime"]) + # deprecated_classes_in_future.extend(["SparseArray"]) # external modules exposed in pandas namespace - modules = ["np", "datetime"] + modules: List[str] = [] # top-level functions funcs = [ @@ -193,6 +198,7 @@ class TestPDApi(Base): "_np_version_under1p16", "_np_version_under1p17", "_np_version_under1p18", + "_testing", "_tslib", "_typing", "_version", @@ -200,42 +206,126 @@ class TestPDApi(Base): def test_api(self): - self.check( - pd, + checkthese = ( self.lib + self.misc + self.modules - + self.deprecated_modules + self.classes - + self.deprecated_classes - + self.deprecated_classes_in_future + self.funcs + self.funcs_option + self.funcs_read + self.funcs_json + self.funcs_to - + self.deprecated_funcs_in_future + + self.private_modules + ) + if not compat.PY37: + checkthese.extend( + self.deprecated_modules + + self.deprecated_classes + + self.deprecated_classes_in_future + + self.deprecated_funcs_in_future + + self.deprecated_funcs + ) + self.check(pd, checkthese, self.ignored) + + def test_depr(self): + deprecated_list = ( + self.deprecated_modules + + self.deprecated_classes + + self.deprecated_classes_in_future + self.deprecated_funcs - + self.private_modules, - self.ignored, + + self.deprecated_funcs_in_future ) + for depr in deprecated_list: + with tm.assert_produces_warning(FutureWarning): + deprecated = getattr(pd, depr) + if not compat.PY37: + if depr == "datetime": + deprecated.__getattr__(dir(pd.datetime.datetime)[-1]) + elif depr == "SparseArray": + deprecated([]) + else: + deprecated.__getattr__(dir(deprecated)[-1]) -class TestApi(Base): +def test_datetime(): + from datetime import datetime + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + assert 
datetime(2015, 1, 2, 0, 0) == pd.datetime(2015, 1, 2, 0, 0) + assert isinstance(pd.datetime(2015, 1, 2, 0, 0), pd.datetime) + + +def test_sparsearray(): + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + assert isinstance(pd.array([1, 2, 3], dtype="Sparse"), pd.SparseArray) + + +def test_np(): + import numpy as np + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + assert (pd.np.arange(0, 10) == np.arange(0, 10)).all() + + +class TestApi(Base): allowed = ["types", "extensions", "indexers"] def test_api(self): - self.check(api, self.allowed) class TestTesting(Base): - - funcs = ["assert_frame_equal", "assert_series_equal", "assert_index_equal"] + funcs = [ + "assert_frame_equal", + "assert_series_equal", + "assert_index_equal", + "assert_extension_array_equal", + ] def test_testing(self): - from pandas import testing self.check(testing, self.funcs) + + def test_util_testing_deprecated(self): + # avoid cache state affecting the test + sys.modules.pop("pandas.util.testing", None) + + with tm.assert_produces_warning(FutureWarning) as m: + import pandas.util.testing # noqa: F401 + + assert "pandas.util.testing is deprecated" in str(m[0].message) + assert "pandas.testing instead" in str(m[0].message) + + def test_util_testing_deprecated_direct(self): + # avoid cache state affecting the test + sys.modules.pop("pandas.util.testing", None) + with tm.assert_produces_warning(FutureWarning) as m: + from pandas.util.testing import assert_series_equal # noqa: F401 + + assert "pandas.util.testing is deprecated" in str(m[0].message) + assert "pandas.testing instead" in str(m[0].message) + + def test_util_in_top_level(self): + # in a subprocess to avoid import caching issues + out = subprocess.check_output( + [ + sys.executable, + "-c", + "import pandas; pandas.util.testing.assert_series_equal", + ], + stderr=subprocess.STDOUT, + ).decode() + assert "pandas.util.testing is deprecated" in out + + with pytest.raises(AttributeError, match="foo"): + pd.util.foo diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py index 97480502f192c..31423c03dee34 100644 --- a/pandas/tests/api/test_types.py +++ b/pandas/tests/api/test_types.py @@ -1,5 +1,5 @@ +import pandas._testing as tm from pandas.api import types -import pandas.util.testing as tm from .test_api import Base diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index bc02a1e76a695..83d19b8a20ac3 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -5,7 +5,7 @@ import pytest from pandas import DataFrame, Index, Series -import pandas.util.testing as tm +import pandas._testing as tm def assert_invalid_addsub_type(left, right, msg=None): @@ -70,7 +70,7 @@ def assert_invalid_comparison(left, right, box): result = right != left tm.assert_equal(result, ~expected) - msg = "Invalid comparison between" + msg = "Invalid comparison between|Cannot compare type|not supported between" with pytest.raises(TypeError, match=msg): left < right with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 1f8fdfd671856..577093c0f2967 100644 --- a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm # ------------------------------------------------------------------ # Helper 
Functions @@ -235,25 +235,6 @@ def box_df_fail(request): return request.param -@pytest.fixture( - params=[ - (pd.Index, False), - (pd.Series, False), - (pd.DataFrame, False), - pytest.param((pd.DataFrame, True), marks=pytest.mark.xfail), - (tm.to_array, False), - ], - ids=id_func, -) -def box_transpose_fail(request): - """ - Fixture similar to `box` but testing both transpose cases for DataFrame, - with the tranpose=True case xfailed. - """ - # GH#23620 - return request.param - - @pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame, tm.to_array], ids=id_func) def box_with_array(request): """ diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index afce374aebe05..d3f9ac4f3f8b2 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -26,15 +26,13 @@ Timestamp, date_range, ) -import pandas.core.arrays.datetimelike as dtl -from pandas.core.indexes.datetimes import _to_M8 +import pandas._testing as tm from pandas.core.ops import roperator from pandas.tests.arithmetic.common import ( assert_invalid_addsub_type, assert_invalid_comparison, get_upcast_box, ) -import pandas.util.testing as tm # ------------------------------------------------------------------ # Comparisons @@ -86,6 +84,52 @@ def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_arra dtarr = tm.box_expected(rng, box_with_array) assert_invalid_comparison(dtarr, other, box_with_array) + @pytest.mark.parametrize( + "other", + [ + list(range(10)), + np.arange(10), + np.arange(10).astype(np.float32), + np.arange(10).astype(object), + pd.timedelta_range("1ns", periods=10).array, + np.array(pd.timedelta_range("1ns", periods=10)), + list(pd.timedelta_range("1ns", periods=10)), + pd.timedelta_range("1 Day", periods=10).astype(object), + pd.period_range("1971-01-01", freq="D", periods=10).array, + pd.period_range("1971-01-01", freq="D", periods=10).astype(object), + ], + ) + def test_dt64arr_cmp_arraylike_invalid(self, other, tz_naive_fixture): + # We don't parametrize this over box_with_array because listlike + # other plays poorly with assert_invalid_comparison reversed checks + tz = tz_naive_fixture + + dta = date_range("1970-01-01", freq="ns", periods=10, tz=tz)._data + assert_invalid_comparison(dta, other, tm.to_array) + + def test_dt64arr_cmp_mixed_invalid(self, tz_naive_fixture): + tz = tz_naive_fixture + + dta = date_range("1970-01-01", freq="h", periods=5, tz=tz)._data + + other = np.array([0, 1, 2, dta[3], pd.Timedelta(days=1)]) + result = dta == other + expected = np.array([False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = dta != other + tm.assert_numpy_array_equal(result, ~expected) + + msg = "Invalid comparison between|Cannot compare type|not supported between" + with pytest.raises(TypeError, match=msg): + dta < other + with pytest.raises(TypeError, match=msg): + dta > other + with pytest.raises(TypeError, match=msg): + dta <= other + with pytest.raises(TypeError, match=msg): + dta >= other + def test_dt64arr_nat_comparison(self, tz_naive_fixture, box_with_array): # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly tz = tz_naive_fixture @@ -296,7 +340,7 @@ class TestDatetimeIndexComparisons: def test_comparators(self, op): index = tm.makeDateIndex(100) element = index[len(index) // 2] - element = _to_M8(element) + element = Timestamp(element).to_datetime64() arr = np.array(index) arr_result = op(arr, element) @@ -1332,7 +1376,7 @@ def 
test_dt64arr_add_mixed_offset_array(self, box_with_array): s = tm.box_expected(s, box_with_array) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, clear=[dtl]): + with tm.assert_produces_warning(warn): other = pd.Index([pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()]) other = tm.box_expected(other, box_with_array) result = s + other @@ -1361,7 +1405,7 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): other = np.array([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)]) warn = None if box_with_array is pd.DataFrame else PerformanceWarning - with tm.assert_produces_warning(warn, clear=[dtl]): + with tm.assert_produces_warning(warn): res = dtarr + other expected = DatetimeIndex( [dti[n] + other[n] for n in range(len(dti))], name=dti.name, freq="infer" @@ -1369,11 +1413,11 @@ def test_dt64arr_add_sub_offset_ndarray(self, tz_naive_fixture, box_with_array): expected = tm.box_expected(expected, box_with_array) tm.assert_equal(res, expected) - with tm.assert_produces_warning(warn, clear=[dtl]): + with tm.assert_produces_warning(warn): res2 = other + dtarr tm.assert_equal(res2, expected) - with tm.assert_produces_warning(warn, clear=[dtl]): + with tm.assert_produces_warning(warn): res = dtarr - other expected = DatetimeIndex( [dti[n] - other[n] for n in range(len(dti))], name=dti.name, freq="infer" @@ -2298,7 +2342,7 @@ def test_dti_addsub_offset_arraylike( xbox = get_upcast_box(box, other) - with tm.assert_produces_warning(PerformanceWarning, clear=[dtl]): + with tm.assert_produces_warning(PerformanceWarning): res = op(dti, other) expected = DatetimeIndex( @@ -2307,6 +2351,32 @@ def test_dti_addsub_offset_arraylike( expected = tm.box_expected(expected, xbox) tm.assert_equal(res, expected) + @pytest.mark.parametrize("other_box", [pd.Index, np.array]) + def test_dti_addsub_object_arraylike( + self, tz_naive_fixture, box_with_array, other_box + ): + tz = tz_naive_fixture + + dti = pd.date_range("2017-01-01", periods=2, tz=tz) + dtarr = tm.box_expected(dti, box_with_array) + other = other_box([pd.offsets.MonthEnd(), pd.Timedelta(days=4)]) + xbox = get_upcast_box(box_with_array, other) + + expected = pd.DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture) + expected = tm.box_expected(expected, xbox) + + warn = None if box_with_array is pd.DataFrame else PerformanceWarning + with tm.assert_produces_warning(warn): + result = dtarr + other + tm.assert_equal(result, expected) + + expected = pd.DatetimeIndex(["2016-12-31", "2016-12-29"], tz=tz_naive_fixture) + expected = tm.box_expected(expected, xbox) + + with tm.assert_produces_warning(warn): + result = dtarr - other + tm.assert_equal(result, expected) + @pytest.mark.parametrize("years", [-1, 0, 1]) @pytest.mark.parametrize("months", [-2, 0, 2]) diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py new file mode 100644 index 0000000000000..f9e1a515277d5 --- /dev/null +++ b/pandas/tests/arithmetic/test_interval.py @@ -0,0 +1,273 @@ +import operator + +import numpy as np +import pytest + +from pandas.core.dtypes.common import is_list_like + +import pandas as pd +from pandas import ( + Categorical, + Index, + Interval, + IntervalIndex, + Period, + Series, + Timedelta, + Timestamp, + date_range, + period_range, + timedelta_range, +) +import pandas._testing as tm +from pandas.core.arrays import IntervalArray + + +@pytest.fixture( + params=[ + (Index([0, 2, 4, 4]), Index([1, 3, 5, 8])), + (Index([0.0, 1.0, 2.0, 
np.nan]), Index([1.0, 2.0, 3.0, np.nan])), + ( + timedelta_range("0 days", periods=3).insert(4, pd.NaT), + timedelta_range("1 day", periods=3).insert(4, pd.NaT), + ), + ( + date_range("20170101", periods=3).insert(4, pd.NaT), + date_range("20170102", periods=3).insert(4, pd.NaT), + ), + ( + date_range("20170101", periods=3, tz="US/Eastern").insert(4, pd.NaT), + date_range("20170102", periods=3, tz="US/Eastern").insert(4, pd.NaT), + ), + ], + ids=lambda x: str(x[0].dtype), +) +def left_right_dtypes(request): + """ + Fixture for building an IntervalArray from various dtypes + """ + return request.param + + +@pytest.fixture +def array(left_right_dtypes): + """ + Fixture to generate an IntervalArray of various dtypes containing NA if possible + """ + left, right = left_right_dtypes + return IntervalArray.from_arrays(left, right) + + +def create_categorical_intervals(left, right, closed="right"): + return Categorical(IntervalIndex.from_arrays(left, right, closed)) + + +def create_series_intervals(left, right, closed="right"): + return Series(IntervalArray.from_arrays(left, right, closed)) + + +def create_series_categorical_intervals(left, right, closed="right"): + return Series(Categorical(IntervalIndex.from_arrays(left, right, closed))) + + +class TestComparison: + @pytest.fixture(params=[operator.eq, operator.ne]) + def op(self, request): + return request.param + + @pytest.fixture( + params=[ + IntervalArray.from_arrays, + IntervalIndex.from_arrays, + create_categorical_intervals, + create_series_intervals, + create_series_categorical_intervals, + ], + ids=[ + "IntervalArray", + "IntervalIndex", + "Categorical[Interval]", + "Series[Interval]", + "Series[Categorical[Interval]]", + ], + ) + def interval_constructor(self, request): + """ + Fixture for all pandas native interval constructors. + To be used as the LHS of IntervalArray comparisons. 
+ """ + return request.param + + def elementwise_comparison(self, op, array, other): + """ + Helper that performs elementwise comparisions between `array` and `other` + """ + other = other if is_list_like(other) else [other] * len(array) + return np.array([op(x, y) for x, y in zip(array, other)]) + + def test_compare_scalar_interval(self, op, array): + # matches first interval + other = array[0] + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + # matches on a single endpoint but not both + other = Interval(array.left[0], array.right[1]) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_scalar_interval_mixed_closed(self, op, closed, other_closed): + array = IntervalArray.from_arrays(range(2), range(1, 3), closed=closed) + other = Interval(0, 1, closed=other_closed) + + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_scalar_na(self, op, array, nulls_fixture): + result = op(array, nulls_fixture) + expected = self.elementwise_comparison(op, array, nulls_fixture) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + 0, + 1.0, + True, + "foo", + Timestamp("2017-01-01"), + Timestamp("2017-01-01", tz="US/Eastern"), + Timedelta("0 days"), + Period("2017-01-01", "D"), + ], + ) + def test_compare_scalar_other(self, op, array, other): + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_list_like_interval( + self, op, array, interval_constructor, + ): + # same endpoints + other = interval_constructor(array.left, array.right) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + # different endpoints + other = interval_constructor(array.left[::-1], array.right[::-1]) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + # all nan endpoints + other = interval_constructor([np.nan] * 4, [np.nan] * 4) + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_list_like_interval_mixed_closed( + self, op, interval_constructor, closed, other_closed + ): + array = IntervalArray.from_arrays(range(2), range(1, 3), closed=closed) + other = interval_constructor(range(2), range(1, 3), closed=other_closed) + + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + ( + Interval(0, 1), + Interval(Timedelta("1 day"), Timedelta("2 days")), + Interval(4, 5, "both"), + Interval(10, 20, "neither"), + ), + (0, 1.5, Timestamp("20170103"), np.nan), + ( + Timestamp("20170102", tz="US/Eastern"), + Timedelta("2 days"), + "baz", + pd.NaT, + ), + ], + ) + def test_compare_list_like_object(self, op, array, other): + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + def test_compare_list_like_nan(self, op, array, nulls_fixture): + other = [nulls_fixture] * 4 + result = op(array, other) + expected = 
self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize( + "other", + [ + np.arange(4, dtype="int64"), + np.arange(4, dtype="float64"), + date_range("2017-01-01", periods=4), + date_range("2017-01-01", periods=4, tz="US/Eastern"), + timedelta_range("0 days", periods=4), + period_range("2017-01-01", periods=4, freq="D"), + Categorical(list("abab")), + Categorical(date_range("2017-01-01", periods=4)), + pd.array(list("abcd")), + pd.array(["foo", 3.14, None, object()]), + ], + ids=lambda x: str(x.dtype), + ) + def test_compare_list_like_other(self, op, array, other): + result = op(array, other) + expected = self.elementwise_comparison(op, array, other) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("length", [1, 3, 5]) + @pytest.mark.parametrize("other_constructor", [IntervalArray, list]) + def test_compare_length_mismatch_errors(self, op, other_constructor, length): + array = IntervalArray.from_arrays(range(4), range(1, 5)) + other = other_constructor([Interval(0, 1)] * length) + with pytest.raises(ValueError, match="Lengths must match to compare"): + op(array, other) + + @pytest.mark.parametrize( + "constructor, expected_type, assert_func", + [ + (IntervalIndex, np.array, tm.assert_numpy_array_equal), + (Series, Series, tm.assert_series_equal), + ], + ) + def test_index_series_compat(self, op, constructor, expected_type, assert_func): + # IntervalIndex/Series that rely on IntervalArray for comparisons + breaks = range(4) + index = constructor(IntervalIndex.from_breaks(breaks)) + + # scalar comparisons + other = index[0] + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) + + other = breaks[0] + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) + + # list-like comparisons + other = IntervalArray.from_breaks(breaks) + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) + + other = [index[0], breaks[0], "foo"] + result = op(index, other) + expected = expected_type(self.elementwise_comparison(op, index, other)) + assert_func(result, expected) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 9733d589ee93b..f55e2b98ee912 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -12,8 +12,8 @@ import pandas as pd from pandas import Index, Series, Timedelta, TimedeltaIndex +import pandas._testing as tm from pandas.core import ops -import pandas.util.testing as tm def adjust_negative_zero(zero, expected): @@ -65,13 +65,16 @@ def test_df_numeric_cmp_dt64_raises(self): # GH#8932, GH#22163 ts = pd.Timestamp.now() df = pd.DataFrame({"x": range(5)}) - with pytest.raises(TypeError): + + msg = "Invalid comparison between dtype=int64 and Timestamp" + + with pytest.raises(TypeError, match=msg): df > ts - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): df < ts - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): ts < df - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): ts > df assert not (df == ts).any().any() diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index f9c1de115b3a4..c0d3c9d4977bd 100644 --- a/pandas/tests/arithmetic/test_object.py +++ 
b/pandas/tests/arithmetic/test_object.py @@ -9,8 +9,8 @@ import pandas as pd from pandas import Series, Timestamp +import pandas._testing as tm from pandas.core import ops -import pandas.util.testing as tm # ------------------------------------------------------------------ # Comparisons @@ -137,7 +137,13 @@ def test_objarr_radd_str_invalid(self, dtype, data, box_with_array): ser = Series(data, dtype=dtype) ser = tm.box_expected(ser, box_with_array) - with pytest.raises(TypeError): + msg = ( + "can only concatenate str|" + "did not contain a loop with signature matching types|" + "unsupported operand type|" + "must be str" + ) + with pytest.raises(TypeError, match=msg): "foo_" + ser @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub]) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index ed693d873efb8..abb667260f094 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -11,12 +11,14 @@ import pandas as pd from pandas import Period, PeriodIndex, Series, period_range +import pandas._testing as tm from pandas.core import ops from pandas.core.arrays import TimedeltaArray -import pandas.util.testing as tm from pandas.tseries.frequencies import to_offset +from .common import assert_invalid_comparison + # ------------------------------------------------------------------ # Comparisons @@ -39,11 +41,93 @@ def test_compare_zerodim(self, box_with_array): expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) + @pytest.mark.parametrize( + "scalar", ["foo", pd.Timestamp.now(), pd.Timedelta(days=4)] + ) + def test_compare_invalid_scalar(self, box_with_array, scalar): + # comparison with scalar that cannot be interpreted as a Period + pi = pd.period_range("2000", periods=4) + parr = tm.box_expected(pi, box_with_array) + assert_invalid_comparison(parr, scalar, box_with_array) + + @pytest.mark.parametrize( + "other", + [ + pd.date_range("2000", periods=4).array, + pd.timedelta_range("1D", periods=4).array, + np.arange(4), + np.arange(4).astype(np.float64), + list(range(4)), + ], + ) + def test_compare_invalid_listlike(self, box_with_array, other): + pi = pd.period_range("2000", periods=4) + parr = tm.box_expected(pi, box_with_array) + assert_invalid_comparison(parr, other, box_with_array) + + @pytest.mark.parametrize("other_box", [list, np.array, lambda x: x.astype(object)]) + def test_compare_object_dtype(self, box_with_array, other_box): + pi = pd.period_range("2000", periods=5) + parr = tm.box_expected(pi, box_with_array) + + xbox = np.ndarray if box_with_array is pd.Index else box_with_array + + other = other_box(pi) + + expected = np.array([True, True, True, True, True]) + expected = tm.box_expected(expected, xbox) + + result = parr == other + tm.assert_equal(result, expected) + result = parr <= other + tm.assert_equal(result, expected) + result = parr >= other + tm.assert_equal(result, expected) + + result = parr != other + tm.assert_equal(result, ~expected) + result = parr < other + tm.assert_equal(result, ~expected) + result = parr > other + tm.assert_equal(result, ~expected) + + other = other_box(pi[::-1]) + + expected = np.array([False, False, True, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr == other + tm.assert_equal(result, expected) + + expected = np.array([True, True, True, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr <= other + tm.assert_equal(result, expected) + + expected = 
np.array([False, False, True, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr >= other + tm.assert_equal(result, expected) + + expected = np.array([True, True, False, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr != other + tm.assert_equal(result, expected) + + expected = np.array([True, True, False, False, False]) + expected = tm.box_expected(expected, xbox) + result = parr < other + tm.assert_equal(result, expected) + + expected = np.array([False, False, False, True, True]) + expected = tm.box_expected(expected, xbox) + result = parr > other + tm.assert_equal(result, expected) + class TestPeriodIndexComparisons: # TODO: parameterize over boxes - @pytest.mark.parametrize("other", ["2017", 2017]) + @pytest.mark.parametrize("other", ["2017", pd.Period("2017", freq="D")]) def test_eq(self, other): idx = PeriodIndex(["2017", "2017", "2018"], freq="D") expected = np.array([True, True, False]) @@ -51,6 +135,34 @@ def test_eq(self, other): tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( + "other", + [ + 2017, + [2017, 2017, 2017], + np.array([2017, 2017, 2017]), + np.array([2017, 2017, 2017], dtype=object), + pd.Index([2017, 2017, 2017]), + ], + ) + def test_eq_integer_disallowed(self, other): + # match Period semantics by not treating integers as Periods + + idx = PeriodIndex(["2017", "2017", "2018"], freq="D") + expected = np.array([False, False, False]) + result = idx == other + + tm.assert_numpy_array_equal(result, expected) + + with pytest.raises(TypeError): + idx < other + with pytest.raises(TypeError): + idx > other + with pytest.raises(TypeError): + idx <= other + with pytest.raises(TypeError): + idx >= other + def test_pi_cmp_period(self): idx = period_range("2007-01", periods=20, freq="M") @@ -168,9 +280,7 @@ def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array): # TODO: Could parametrize over boxes for idx? 
idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="A") - rev_msg = ( - r"Input has different freq=(M|2M|3M) from " r"PeriodArray\(freq=A-DEC\)" - ) + rev_msg = r"Input has different freq=(M|2M|3M) from PeriodArray\(freq=A-DEC\)" idx_msg = rev_msg if box_with_array is tm.to_array else msg with pytest.raises(IncompatibleFrequency, match=idx_msg): base <= idx @@ -184,7 +294,7 @@ def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array): Period("2011", freq="4M") >= base idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="4M") - rev_msg = r"Input has different freq=(M|2M|3M) from " r"PeriodArray\(freq=4M\)" + rev_msg = r"Input has different freq=(M|2M|3M) from PeriodArray\(freq=4M\)" idx_msg = rev_msg if box_with_array is tm.to_array else msg with pytest.raises(IncompatibleFrequency, match=idx_msg): base <= idx @@ -755,18 +865,18 @@ def test_pi_sub_isub_offset(self): rng -= pd.offsets.MonthEnd(5) tm.assert_index_equal(rng, expected) - def test_pi_add_offset_n_gt1(self, box_transpose_fail): + @pytest.mark.parametrize("transpose", [True, False]) + def test_pi_add_offset_n_gt1(self, box_with_array, transpose): # GH#23215 # add offset to PeriodIndex with freq.n > 1 - box, transpose = box_transpose_fail per = pd.Period("2016-01", freq="2M") pi = pd.PeriodIndex([per]) expected = pd.PeriodIndex(["2016-03"], freq="2M") - pi = tm.box_expected(pi, box, transpose=transpose) - expected = tm.box_expected(expected, box, transpose=transpose) + pi = tm.box_expected(pi, box_with_array, transpose=transpose) + expected = tm.box_expected(expected, box_with_array, transpose=transpose) result = pi + per.freq tm.assert_equal(result, expected) @@ -984,16 +1094,15 @@ def test_pi_add_sub_timedeltalike_freq_mismatch_monthly(self, mismatched_freq): with pytest.raises(IncompatibleFrequency, match=msg): rng -= other - def test_parr_add_sub_td64_nat(self, box_transpose_fail): + @pytest.mark.parametrize("transpose", [True, False]) + def test_parr_add_sub_td64_nat(self, box_with_array, transpose): # GH#23320 special handling for timedelta64("NaT") - box, transpose = box_transpose_fail - pi = pd.period_range("1994-04-01", periods=9, freq="19D") other = np.timedelta64("NaT") expected = pd.PeriodIndex(["NaT"] * 9, freq="19D") - obj = tm.box_expected(pi, box, transpose=transpose) - expected = tm.box_expected(expected, box, transpose=transpose) + obj = tm.box_expected(pi, box_with_array, transpose=transpose) + expected = tm.box_expected(expected, box_with_array, transpose=transpose) result = obj + other tm.assert_equal(result, expected) @@ -1011,16 +1120,12 @@ def test_parr_add_sub_td64_nat(self, box_transpose_fail): TimedeltaArray._from_sequence(["NaT"] * 9), ], ) - def test_parr_add_sub_tdt64_nat_array(self, box_df_fail, other): - # FIXME: DataFrame fails because when when operating column-wise - # timedelta64 entries become NaT and are treated like datetimes - box = box_df_fail - + def test_parr_add_sub_tdt64_nat_array(self, box_with_array, other): pi = pd.period_range("1994-04-01", periods=9, freq="19D") expected = pd.PeriodIndex(["NaT"] * 9, freq="19D") - obj = tm.box_expected(pi, box) - expected = tm.box_expected(expected, box) + obj = tm.box_expected(pi, box_with_array) + expected = tm.box_expected(expected, box_with_array) result = obj + other tm.assert_equal(result, expected) @@ -1043,6 +1148,26 @@ def test_parr_add_sub_index(self): expected = pi - pi tm.assert_index_equal(result, expected) + def test_parr_add_sub_object_array(self): + pi = pd.period_range("2000-12-31", periods=3, freq="D") + 
parr = pi.array + + other = np.array([pd.Timedelta(days=1), pd.offsets.Day(2), 3]) + + with tm.assert_produces_warning(PerformanceWarning): + result = parr + other + + expected = pd.PeriodIndex( + ["2001-01-01", "2001-01-03", "2001-01-05"], freq="D" + ).array + tm.assert_equal(result, expected) + + with tm.assert_produces_warning(PerformanceWarning): + result = parr - other + + expected = pd.PeriodIndex(["2000-12-30"] * 3, freq="D").array + tm.assert_equal(result, expected) + class TestPeriodSeriesArithmetic: def test_ops_series_timedelta(self): diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index cc337f8fdd7ce..158da37aa7239 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -18,12 +18,12 @@ Timestamp, timedelta_range, ) +import pandas._testing as tm from pandas.tests.arithmetic.common import ( assert_invalid_addsub_type, assert_invalid_comparison, get_upcast_box, ) -import pandas.util.testing as tm # ------------------------------------------------------------------ # Timedelta64[ns] dtype Comparisons @@ -76,6 +76,49 @@ def test_td64_comparisons_invalid(self, box_with_array, invalid): assert_invalid_comparison(obj, invalid, box) + @pytest.mark.parametrize( + "other", + [ + list(range(10)), + np.arange(10), + np.arange(10).astype(np.float32), + np.arange(10).astype(object), + pd.date_range("1970-01-01", periods=10, tz="UTC").array, + np.array(pd.date_range("1970-01-01", periods=10)), + list(pd.date_range("1970-01-01", periods=10)), + pd.date_range("1970-01-01", periods=10).astype(object), + pd.period_range("1971-01-01", freq="D", periods=10).array, + pd.period_range("1971-01-01", freq="D", periods=10).astype(object), + ], + ) + def test_td64arr_cmp_arraylike_invalid(self, other): + # We don't parametrize this over box_with_array because listlike + # other plays poorly with assert_invalid_comparison reversed checks + + rng = timedelta_range("1 days", periods=10)._data + assert_invalid_comparison(rng, other, tm.to_array) + + def test_td64arr_cmp_mixed_invalid(self): + rng = timedelta_range("1 days", periods=5)._data + + other = np.array([0, 1, 2, rng[3], pd.Timestamp.now()]) + result = rng == other + expected = np.array([False, False, False, True, False]) + tm.assert_numpy_array_equal(result, expected) + + result = rng != other + tm.assert_numpy_array_equal(result, ~expected) + + msg = "Invalid comparison between|Cannot compare type|not supported between" + with pytest.raises(TypeError, match=msg): + rng < other + with pytest.raises(TypeError, match=msg): + rng > other + with pytest.raises(TypeError, match=msg): + rng <= other + with pytest.raises(TypeError, match=msg): + rng >= other + class TestTimedelta64ArrayComparisons: # TODO: All of these need to be parametrized over box @@ -1469,6 +1512,40 @@ def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): with tm.assert_produces_warning(PerformanceWarning): anchored - tdi + # ------------------------------------------------------------------ + # Unsorted + + def test_td64arr_add_sub_object_array(self, box_with_array): + tdi = pd.timedelta_range("1 day", periods=3, freq="D") + tdarr = tm.box_expected(tdi, box_with_array) + + other = np.array( + [pd.Timedelta(days=1), pd.offsets.Day(2), pd.Timestamp("2000-01-04")] + ) + + warn = PerformanceWarning if box_with_array is not pd.DataFrame else None + with tm.assert_produces_warning(warn): + result = tdarr + other + + expected = pd.Index( + 
[pd.Timedelta(days=2), pd.Timedelta(days=4), pd.Timestamp("2000-01-07")] + ) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + + with pytest.raises(TypeError): + with tm.assert_produces_warning(warn): + tdarr - other + + with tm.assert_produces_warning(warn): + result = other - tdarr + + expected = pd.Index( + [pd.Timedelta(0), pd.Timedelta(0), pd.Timestamp("2000-01-01")] + ) + expected = tm.box_expected(expected, box_with_array) + tm.assert_equal(result, expected) + class TestTimedeltaArraylikeMulDivOps: # Tests for timedelta64[ns] diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 870a0a5db175e..5ff0bb8ef0d78 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("ordered", [True, False]) @@ -111,7 +111,7 @@ def test_take_bounds(self, allow_fill): if allow_fill: msg = "indices are out-of-bounds" else: - msg = "index 4 is out of bounds for size 3" + msg = "index 4 is out of bounds for( axis 0 with)? size 3" with pytest.raises(IndexError, match=msg): cat.take([4, 5], allow_fill=allow_fill) @@ -177,3 +177,7 @@ def test_take_nd_deprecated(self): cat = pd.Categorical(["a", "b", "c"]) with tm.assert_produces_warning(FutureWarning): cat.take_nd([0, 1]) + + ci = pd.Index(cat) + with tm.assert_produces_warning(FutureWarning): + ci.take_nd([0, 1]) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 4122a64a64516..90fcf12093909 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -6,8 +6,8 @@ from pandas.compat import PYPY from pandas import Categorical, Index, NaT, Series, date_range +import pandas._testing as tm from pandas.api.types import is_scalar -import pandas.util.testing as tm class TestCategoricalAnalytics: diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 82f2fe1ab8fb6..f49f70f5acf77 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -4,9 +4,9 @@ import pytest from pandas import Categorical, CategoricalIndex, DataFrame, Index, Series +import pandas._testing as tm from pandas.core.arrays.categorical import _recode_for_categories from pandas.tests.arrays.categorical.common import TestCategorical -import pandas.util.testing as tm class TestCategoricalAPI: @@ -87,8 +87,8 @@ def test_rename_categories(self): def test_rename_categories_wrong_length_raises(self, new_categories): cat = Categorical(["a", "b", "c", "a"]) msg = ( - "new categories need to have the same number of items as the" - " old categories!" + "new categories need to have the same number of items as the " + "old categories!" 
) with pytest.raises(ValueError, match=msg): cat.rename_categories(new_categories) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 6c8b654c1955c..70a23e9748dd1 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -24,7 +24,7 @@ period_range, timedelta_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalConstructors: diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 85bf385b029a3..19746d7d72162 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -4,7 +4,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas import Categorical, CategoricalIndex, Index, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalDtypes: diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index 37dea53f792cb..85d5a6a3dc3ac 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -3,9 +3,9 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, Index, PeriodIndex, Series +import pandas._testing as tm import pandas.core.common as com from pandas.tests.arrays.categorical.common import TestCategorical -import pandas.util.testing as tm class TestCategoricalIndexingWithFactor(TestCategorical): @@ -157,8 +157,8 @@ def test_categories_assigments(self): def test_categories_assigments_wrong_length_raises(self, new_categories): cat = Categorical(["a", "b", "c", "a"]) msg = ( - "new categories need to have the same number of items" - " as the old categories!" + "new categories need to have the same number of items " + "as the old categories!" 
) with pytest.raises(ValueError, match=msg): cat.categories = new_categories diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 3037ac79cd592..211bf091ee17d 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas import Categorical, Index, Series, isna -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalMissing: diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 10e33bf70dc66..0c830c65e0f8b 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import Categorical, DataFrame, Series, date_range +import pandas._testing as tm from pandas.tests.arrays.categorical.common import TestCategorical -import pandas.util.testing as tm class TestCategoricalOpsWithFactor(TestCategorical): @@ -97,8 +97,8 @@ def test_comparisons(self): # comparison (in both directions) with Series will raise s = Series(["b", "b", "b"]) msg = ( - "Cannot compare a Categorical for op __gt__ with type" - r" <class 'numpy\.ndarray'>" + "Cannot compare a Categorical for op __gt__ with type " + r"<class 'numpy\.ndarray'>" ) with pytest.raises(TypeError, match=msg): cat > s @@ -172,8 +172,8 @@ def test_comparison_with_unknown_scalars(self): cat = Categorical([1, 2, 3], ordered=True) msg = ( - "Cannot compare a Categorical for op __{}__ with a scalar," - " which is not a category" + "Cannot compare a Categorical for op __{}__ with a scalar, " + "which is not a category" ) with pytest.raises(TypeError, match=msg.format("lt")): cat < 4 @@ -265,8 +265,8 @@ def test_comparisons(self, data, reverse, base): # categorical cannot be compared to Series or numpy array, and also # not the other way around msg = ( - "Cannot compare a Categorical for op __gt__ with type" - r" <class 'numpy\.ndarray'>" + "Cannot compare a Categorical for op __gt__ with type " + r"<class 'numpy\.ndarray'>" ) with pytest.raises(TypeError, match=msg): cat > s diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index 9321813b42b33..d08c4b47dd3cb 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -147,8 +147,6 @@ def test_categorical_repr_datetime(self): idx = date_range("2011-01-01 09:00", freq="H", periods=5) c = Categorical(idx) - # TODO(wesm): exceeding 80 characters in the console is not good - # behavior exp = ( "[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, " "2011-01-01 12:00:00, 2011-01-01 13:00:00]\n" diff --git a/pandas/tests/arrays/categorical/test_sorting.py b/pandas/tests/arrays/categorical/test_sorting.py index a0b09e19ece6e..2a0ef043bf9a9 100644 --- a/pandas/tests/arrays/categorical/test_sorting.py +++ b/pandas/tests/arrays/categorical/test_sorting.py @@ -2,7 +2,7 @@ import pytest from pandas import Categorical, Index -import pandas.util.testing as tm +import pandas._testing as tm class TestCategoricalSort: diff --git a/pandas/tests/arrays/categorical/test_subclass.py b/pandas/tests/arrays/categorical/test_subclass.py index cfc7b8541302f..b80d0ff41aba6 100644 --- a/pandas/tests/arrays/categorical/test_subclass.py +++ b/pandas/tests/arrays/categorical/test_subclass.py @@ -1,5 +1,5 @@ from pandas import Categorical -import pandas.util.testing as tm +import pandas._testing as tm class 
TestCategoricalSubclassing: diff --git a/pandas/tests/arrays/categorical/test_warnings.py b/pandas/tests/arrays/categorical/test_warnings.py index 29bd5252dbe3a..f66c327e9967d 100644 --- a/pandas/tests/arrays/categorical/test_warnings.py +++ b/pandas/tests/arrays/categorical/test_warnings.py @@ -1,16 +1,19 @@ import pytest -import pandas.util.testing as tm +from pandas.util._test_decorators import async_mark + +import pandas._testing as tm class TestCategoricalWarnings: - def test_tab_complete_warning(self, ip): + @async_mark() + async def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; c = Categorical([])" - ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("c.", 1)) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 655a6e717119b..e046d87780bb4 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Index, @@ -11,8 +13,8 @@ date_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.arrays import IntervalArray -import pandas.util.testing as tm @pytest.fixture( @@ -103,3 +105,110 @@ def test_repr(): "Length: 2, closed: right, dtype: interval[int64]" ) assert result == expected + + +# ---------------------------------------------------------------------------- +# Arrow interaction + + +pyarrow_skip = td.skip_if_no("pyarrow", min_version="0.15.1.dev") + + +@pyarrow_skip +def test_arrow_extension_type(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + p1 = ArrowIntervalType(pa.int64(), "left") + p2 = ArrowIntervalType(pa.int64(), "left") + p3 = ArrowIntervalType(pa.int64(), "right") + + assert p1.closed == "left" + assert p1 == p2 + assert not p1 == p3 + assert hash(p1) == hash(p2) + assert not hash(p1) == hash(p3) + + +@pyarrow_skip +def test_arrow_array(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + intervals = pd.interval_range(1, 5, freq=1).array + + result = pa.array(intervals) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == intervals.closed + assert result.type.subtype == pa.int64() + assert result.storage.field("left").equals(pa.array([1, 2, 3, 4], type="int64")) + assert result.storage.field("right").equals(pa.array([2, 3, 4, 5], type="int64")) + + expected = pa.array([{"left": i, "right": i + 1} for i in range(1, 5)]) + assert result.storage.equals(expected) + + # convert to its storage type + result = pa.array(intervals, type=expected.type) + assert result.equals(expected) + + # unsupported conversions + with pytest.raises(TypeError): + pa.array(intervals, type="float64") + + with pytest.raises(TypeError, match="different 'subtype'"): + pa.array(intervals, type=ArrowIntervalType(pa.float64(), "left")) + + +@pyarrow_skip +def test_arrow_array_missing(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + arr = IntervalArray.from_breaks([0, 1, 2, 3]) + arr[1] = None + + result = pa.array(arr) + assert isinstance(result.type, ArrowIntervalType) + assert result.type.closed == 
arr.closed + assert result.type.subtype == pa.float64() + + # fields have missing values (not NaN) + left = pa.array([0.0, None, 2.0], type="float64") + right = pa.array([1.0, None, 3.0], type="float64") + assert result.storage.field("left").equals(left) + assert result.storage.field("right").equals(right) + + # structarray itself also has missing values on the array level + vals = [ + {"left": 0.0, "right": 1.0}, + {"left": None, "right": None}, + {"left": 2.0, "right": 3.0}, + ] + expected = pa.StructArray.from_pandas(vals, mask=np.array([False, True, False])) + assert result.storage.equals(expected) + + +@pyarrow_skip +@pytest.mark.parametrize( + "breaks", + [[0, 1, 2, 3], pd.date_range("2017", periods=4, freq="D")], + ids=["int", "datetime64[ns]"], +) +def test_arrow_table_roundtrip(breaks): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType + + arr = IntervalArray.from_breaks(breaks) + arr[1] = None + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowIntervalType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, pd.IntervalDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/interval/test_ops.py b/pandas/tests/arrays/interval/test_ops.py index a55c33c2f22e9..b4de80dc00a4e 100644 --- a/pandas/tests/arrays/interval/test_ops.py +++ b/pandas/tests/arrays/interval/test_ops.py @@ -3,8 +3,8 @@ import pytest from pandas import Interval, IntervalIndex, Timedelta, Timestamp +import pandas._testing as tm from pandas.core.arrays import IntervalArray -import pandas.util.testing as tm @pytest.fixture(params=[IntervalArray, IntervalIndex]) diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index eab174862818c..d8a1831cd61ec 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -6,7 +6,8 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray, SparseDtype class TestSeriesAccessor: @@ -31,7 +32,7 @@ def test_accessor_raises(self): def test_from_spmatrix(self, format, labels, dtype): import scipy.sparse - sp_dtype = pd.SparseDtype(dtype, np.array(0, dtype=dtype).item()) + sp_dtype = SparseDtype(dtype, np.array(0, dtype=dtype).item()) mat = scipy.sparse.eye(10, format=format, dtype=dtype) result = pd.DataFrame.sparse.from_spmatrix(mat, index=labels, columns=labels) @@ -48,7 +49,7 @@ def test_from_spmatrix(self, format, labels, dtype): def test_from_spmatrix_columns(self, columns): import scipy.sparse - dtype = pd.SparseDtype("float64", 0.0) + dtype = SparseDtype("float64", 0.0) mat = scipy.sparse.random(10, 2, density=0.5) result = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns) @@ -67,9 +68,9 @@ def test_to_coo(self): def test_to_dense(self): df = pd.DataFrame( { - "A": pd.SparseArray([1, 0], dtype=pd.SparseDtype("int64", 0)), - "B": pd.SparseArray([1, 0], dtype=pd.SparseDtype("int64", 1)), - "C": pd.SparseArray([1.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)), + "A": SparseArray([1, 0], dtype=SparseDtype("int64", 0)), + "B": SparseArray([1, 0], dtype=SparseDtype("int64", 1)), + "C": SparseArray([1.0, 0.0], dtype=SparseDtype("float64", 0.0)), }, index=["b", "a"], ) @@ 
-82,8 +83,8 @@ def test_to_dense(self): def test_density(self): df = pd.DataFrame( { - "A": pd.SparseArray([1, 0, 2, 1], fill_value=0), - "B": pd.SparseArray([0, 1, 1, 1], fill_value=0), + "A": SparseArray([1, 0, 2, 1], fill_value=0), + "B": SparseArray([0, 1, 1, 1], fill_value=0), } ) res = df.sparse.density @@ -99,9 +100,7 @@ def test_series_from_coo(self, dtype, dense_index): A = scipy.sparse.eye(3, format="coo", dtype=dtype) result = pd.Series.sparse.from_coo(A, dense_index=dense_index) index = pd.MultiIndex.from_tuples([(0, 0), (1, 1), (2, 2)]) - expected = pd.Series( - pd.SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index - ) + expected = pd.Series(SparseArray(np.array([1, 1, 1], dtype=dtype)), index=index) if dense_index: expected = expected.reindex(pd.MultiIndex.from_product(index.levels)) @@ -117,3 +116,8 @@ def test_series_from_coo_incorrect_format_raises(self): TypeError, match="Expected coo_matrix. Got csr_matrix instead." ): pd.Series.sparse.from_coo(m) + + def test_with_column_named_sparse(self): + # https://github.com/pandas-dev/pandas/issues/30758 + df = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])}) + assert isinstance(df.sparse, pd.core.arrays.sparse.accessor.SparseFrameAccessor) diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index f1d2803ce5505..76442a63ccb0f 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -4,9 +4,9 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.core import ops -from pandas.core.arrays.sparse import SparseDtype -import pandas.util.testing as tm +from pandas.core.arrays.sparse import SparseArray, SparseDtype @pytest.fixture(params=["integer", "block"]) @@ -24,7 +24,7 @@ def mix(request): class TestSparseArrayArithmetics: _base = np.array - _klass = pd.SparseArray + _klass = SparseArray def _assert(self, a, b): tm.assert_numpy_array_equal(a, b) @@ -391,15 +391,15 @@ def test_mixed_array_comparison(self, kind): @pytest.mark.parametrize("op", [operator.eq, operator.add]) def test_with_list(op): - arr = pd.SparseArray([0, 1], fill_value=0) + arr = SparseArray([0, 1], fill_value=0) result = op(arr, [0, 1]) - expected = op(arr, pd.SparseArray([0, 1])) + expected = op(arr, SparseArray([0, 1])) tm.assert_sp_array_equal(result, expected) def test_with_dataframe(): # GH#27910 - arr = pd.SparseArray([0, 1], fill_value=0) + arr = SparseArray([0, 1], fill_value=0) df = pd.DataFrame([[1, 2], [3, 4]]) result = arr.__add__(df) assert result is NotImplemented @@ -407,7 +407,7 @@ def test_with_dataframe(): def test_with_zerodim_ndarray(): # GH#27910 - arr = pd.SparseArray([0, 1], fill_value=0) + arr = SparseArray([0, 1], fill_value=0) result = arr * np.array(2) expected = arr * 2 @@ -416,23 +416,23 @@ def test_with_zerodim_ndarray(): @pytest.mark.parametrize("ufunc", [np.abs, np.exp]) @pytest.mark.parametrize( - "arr", [pd.SparseArray([0, 0, -1, 1]), pd.SparseArray([None, None, -1, 1])] + "arr", [SparseArray([0, 0, -1, 1]), SparseArray([None, None, -1, 1])] ) def test_ufuncs(ufunc, arr): result = ufunc(arr) fill_value = ufunc(arr.fill_value) - expected = pd.SparseArray(ufunc(np.asarray(arr)), fill_value=fill_value) + expected = SparseArray(ufunc(np.asarray(arr)), fill_value=fill_value) tm.assert_sp_array_equal(result, expected) @pytest.mark.parametrize( "a, b", [ - (pd.SparseArray([0, 0, 0]), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - 
(pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), - (pd.SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0]), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), + (SparseArray([0, 0, 0], fill_value=1), np.array([0, 1, 2])), ], ) @pytest.mark.parametrize("ufunc", [np.add, np.greater]) @@ -440,12 +440,12 @@ def test_binary_ufuncs(ufunc, a, b): # can't say anything about fill value here. result = ufunc(a, b) expected = ufunc(np.asarray(a), np.asarray(b)) - assert isinstance(result, pd.SparseArray) + assert isinstance(result, SparseArray) tm.assert_numpy_array_equal(np.asarray(result), expected) def test_ndarray_inplace(): - sparray = pd.SparseArray([0, 2, 0, 0]) + sparray = SparseArray([0, 2, 0, 0]) ndarray = np.array([0, 1, 2, 3]) ndarray += sparray expected = np.array([0, 3, 2, 3]) @@ -453,19 +453,19 @@ def test_ndarray_inplace(): def test_sparray_inplace(): - sparray = pd.SparseArray([0, 2, 0, 0]) + sparray = SparseArray([0, 2, 0, 0]) ndarray = np.array([0, 1, 2, 3]) sparray += ndarray - expected = pd.SparseArray([0, 3, 2, 3], fill_value=0) + expected = SparseArray([0, 3, 2, 3], fill_value=0) tm.assert_sp_array_equal(sparray, expected) @pytest.mark.parametrize("fill_value", [True, False]) def test_invert(fill_value): arr = np.array([True, False, False, True]) - sparray = pd.SparseArray(arr, fill_value=fill_value) + sparray = SparseArray(arr, fill_value=fill_value) result = ~sparray - expected = pd.SparseArray(~arr, fill_value=not fill_value) + expected = SparseArray(~arr, fill_value=not fill_value) tm.assert_sp_array_equal(result, expected) @@ -473,7 +473,7 @@ def test_invert(fill_value): @pytest.mark.parametrize("op", [operator.pos, operator.neg]) def test_unary_op(op, fill_value): arr = np.array([0, 1, np.nan, 2]) - sparray = pd.SparseArray(arr, fill_value=fill_value) + sparray = SparseArray(arr, fill_value=fill_value) result = op(sparray) - expected = pd.SparseArray(op(arr), fill_value=op(fill_value)) + expected = SparseArray(op(arr), fill_value=op(fill_value)) tm.assert_sp_array_equal(result, expected) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 0aaf294378bf7..baca18239b929 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -10,8 +10,8 @@ import pandas as pd from pandas import isna +import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray, SparseDtype -import pandas.util.testing as tm @pytest.fixture(params=["integer", "block"]) @@ -470,7 +470,7 @@ def test_astype(self): arr.astype("Sparse[i8]") def test_astype_bool(self): - a = pd.SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) + a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) result = a.astype(bool) expected = SparseArray([True, 0, 0, True], dtype=SparseDtype(bool, 0)) tm.assert_sp_array_equal(result, expected) @@ -682,7 +682,7 @@ def test_getslice_tuple(self): dense[4:, :] def test_boolean_slice_empty(self): - arr = pd.SparseArray([0, 1, 2]) + arr = SparseArray([0, 1, 2]) res = arr[[False, False, False]] assert res.dtype == arr.dtype @@ -828,12 +828,12 @@ def test_fillna_overlap(self): def test_nonzero(self): # Tests regression #21172. 
- sa = pd.SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) + sa = SparseArray([float("nan"), float("nan"), 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) expected = np.array([2, 5, 9], dtype=np.int32) (result,) = sa.nonzero() tm.assert_numpy_array_equal(expected, result) - sa = pd.SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) + sa = SparseArray([0, 0, 1, 0, 0, 2, 0, 0, 0, 3, 0, 0]) (result,) = sa.nonzero() tm.assert_numpy_array_equal(expected, result) @@ -1086,11 +1086,11 @@ def test_ufunc_args(self): @pytest.mark.parametrize("fill_value", [0.0, np.nan]) def test_modf(self, fill_value): # https://github.com/pandas-dev/pandas/issues/26946 - sparse = pd.SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value) + sparse = SparseArray([fill_value] * 10 + [1.1, 2.2], fill_value=fill_value) r1, r2 = np.modf(sparse) e1, e2 = np.modf(np.asarray(sparse)) - tm.assert_sp_array_equal(r1, pd.SparseArray(e1, fill_value=fill_value)) - tm.assert_sp_array_equal(r2, pd.SparseArray(e2, fill_value=fill_value)) + tm.assert_sp_array_equal(r1, SparseArray(e1, fill_value=fill_value)) + tm.assert_sp_array_equal(r2, SparseArray(e2, fill_value=fill_value)) def test_nbytes_integer(self): arr = SparseArray([1, 0, 0, 0, 2], kind="integer") @@ -1106,7 +1106,7 @@ def test_nbytes_block(self): assert result == 24 def test_asarray_datetime64(self): - s = pd.SparseArray(pd.to_datetime(["2012", None, None, "2013"])) + s = SparseArray(pd.to_datetime(["2012", None, None, "2013"])) np.asarray(s) def test_density(self): @@ -1208,7 +1208,7 @@ def test_first_fill_value_loc(arr, loc): ) @pytest.mark.parametrize("fill_value", [np.nan, 0, 1]) def test_unique_na_fill(arr, fill_value): - a = pd.SparseArray(arr, fill_value=fill_value).unique() + a = SparseArray(arr, fill_value=fill_value).unique() b = pd.Series(arr).unique() assert isinstance(a, SparseArray) a = np.asarray(a) diff --git a/pandas/tests/arrays/sparse/test_combine_concat.py b/pandas/tests/arrays/sparse/test_combine_concat.py index 4ad1aa60e7b4f..f1697dc9ff7ce 100644 --- a/pandas/tests/arrays/sparse/test_combine_concat.py +++ b/pandas/tests/arrays/sparse/test_combine_concat.py @@ -1,17 +1,17 @@ import numpy as np import pytest -import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray class TestSparseArrayConcat: @pytest.mark.parametrize("kind", ["integer", "block"]) def test_basic(self, kind): - a = pd.SparseArray([1, 0, 0, 2], kind=kind) - b = pd.SparseArray([1, 0, 2, 2], kind=kind) + a = SparseArray([1, 0, 0, 2], kind=kind) + b = SparseArray([1, 0, 2, 2], kind=kind) - result = pd.SparseArray._concat_same_type([a, b]) + result = SparseArray._concat_same_type([a, b]) # Can't make any assertions about the sparse index itself # since we aren't don't merge sparse blocs across arrays # in to_concat @@ -22,10 +22,10 @@ def test_basic(self, kind): @pytest.mark.parametrize("kind", ["integer", "block"]) def test_uses_first_kind(self, kind): other = "integer" if kind == "block" else "block" - a = pd.SparseArray([1, 0, 0, 2], kind=kind) - b = pd.SparseArray([1, 0, 2, 2], kind=other) + a = SparseArray([1, 0, 0, 2], kind=kind) + b = SparseArray([1, 0, 2, 2], kind=other) - result = pd.SparseArray._concat_same_type([a, b]) + result = SparseArray._concat_same_type([a, b]) expected = np.array([1, 2, 1, 2, 2], dtype="int64") tm.assert_numpy_array_equal(result.sp_values, expected) assert result.kind == kind diff --git a/pandas/tests/arrays/sparse/test_libsparse.py 
b/pandas/tests/arrays/sparse/test_libsparse.py index 7a85ccf271e76..a2f861d378e67 100644 --- a/pandas/tests/arrays/sparse/test_libsparse.py +++ b/pandas/tests/arrays/sparse/test_libsparse.py @@ -7,8 +7,8 @@ import pandas.util._test_decorators as td from pandas import Series +import pandas._testing as tm from pandas.core.arrays.sparse import BlockIndex, IntIndex, _make_index -import pandas.util.testing as tm TEST_LENGTH = 20 diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index c3f342f16a0bf..5e2f14af341ab 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -6,17 +6,19 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm -def test_repr_with_NA(): - a = pd.array(["a", pd.NA, "b"], dtype="string") - for obj in [a, pd.Series(a), pd.DataFrame({"a": a})]: - assert "NA" in repr(obj) and "NaN" not in repr(obj) - assert "NA" in str(obj) and "NaN" not in str(obj) - if hasattr(obj, "_repr_html_"): - html_repr = obj._repr_html_() - assert "NA" in html_repr and "NaN" not in html_repr +def test_repr(): + df = pd.DataFrame({"A": pd.array(["a", pd.NA, "b"], dtype="string")}) + expected = " A\n0 a\n1 <NA>\n2 b" + assert repr(df) == expected + + expected = "0 a\n1 <NA>\n2 b\nName: A, dtype: string" + assert repr(df.A) == expected + + expected = "<StringArray>\n['a', <NA>, 'b']\nLength: 3, dtype: string" + assert repr(df.A.array) == expected def test_none_to_nan(): @@ -192,6 +194,25 @@ def test_constructor_raises(): with pytest.raises(ValueError, match="sequence of strings"): pd.arrays.StringArray(np.array([])) + with pytest.raises(ValueError, match="strings or pandas.NA"): + pd.arrays.StringArray(np.array(["a", np.nan], dtype=object)) + + with pytest.raises(ValueError, match="strings or pandas.NA"): + pd.arrays.StringArray(np.array(["a", None], dtype=object)) + + with pytest.raises(ValueError, match="strings or pandas.NA"): + pd.arrays.StringArray(np.array(["a", pd.NaT], dtype=object)) + + +@pytest.mark.parametrize("copy", [True, False]) +def test_from_sequence_no_mutate(copy): + a = np.array(["a", np.nan], dtype=object) + original = a.copy() + result = pd.arrays.StringArray._from_sequence(a, copy=copy) + expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object)) + tm.assert_extension_array_equal(result, expected) + tm.assert_numpy_array_equal(a, original) + @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.xfail(reason="Not implemented StringArray.sum") @@ -237,3 +258,14 @@ def test_arrow_roundtrip(): tm.assert_frame_equal(result, df) # ensure the missing value is represented by NA and not np.nan or None assert result.loc[2, "a"] is pd.NA + + +def test_value_counts_na(): + arr = pd.array(["a", "b", "a", pd.NA], dtype="string") + result = arr.value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=["a", "b", pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=["a", "b"], dtype="Int64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index f2a4e73e7b6ad..b1b5a9482e34f 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -8,29 +8,34 @@ from pandas.core.dtypes.dtypes import registry import pandas as pd +import pandas._testing as tm from pandas.api.extensions import register_extension_dtype from pandas.api.types 
import is_scalar +from pandas.arrays import ( + BooleanArray, + DatetimeArray, + IntegerArray, + IntervalArray, + SparseArray, + StringArray, + TimedeltaArray, +) from pandas.core.arrays import PandasArray, integer_array, period_array from pandas.tests.extension.decimal import DecimalArray, DecimalDtype, to_decimal -import pandas.util.testing as tm @pytest.mark.parametrize( "data, dtype, expected", [ # Basic NumPy defaults. - ([1, 2], None, pd.arrays.IntegerArray._from_sequence([1, 2])), + ([1, 2], None, IntegerArray._from_sequence([1, 2])), ([1, 2], object, PandasArray(np.array([1, 2], dtype=object))), ( [1, 2], np.dtype("float32"), PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))), ), - ( - np.array([1, 2], dtype="int64"), - None, - pd.arrays.IntegerArray._from_sequence([1, 2]), - ), + (np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2]),), # String alias passes through to NumPy ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))), # Period alias @@ -49,37 +54,33 @@ ( [1, 2], np.dtype("datetime64[ns]"), - pd.arrays.DatetimeArray._from_sequence( - np.array([1, 2], dtype="datetime64[ns]") - ), + DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")), ), ( np.array([1, 2], dtype="datetime64[ns]"), None, - pd.arrays.DatetimeArray._from_sequence( - np.array([1, 2], dtype="datetime64[ns]") - ), + DatetimeArray._from_sequence(np.array([1, 2], dtype="datetime64[ns]")), ), ( pd.DatetimeIndex(["2000", "2001"]), np.dtype("datetime64[ns]"), - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), ( pd.DatetimeIndex(["2000", "2001"]), None, - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), ( ["2000", "2001"], np.dtype("datetime64[ns]"), - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), # Datetime (tz-aware) ( ["2000", "2001"], pd.DatetimeTZDtype(tz="CET"), - pd.arrays.DatetimeArray._from_sequence( + DatetimeArray._from_sequence( ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET") ), ), @@ -87,17 +88,17 @@ ( ["1H", "2H"], np.dtype("timedelta64[ns]"), - pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1H", "2H"]), ), ( pd.TimedeltaIndex(["1H", "2H"]), np.dtype("timedelta64[ns]"), - pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1H", "2H"]), ), ( pd.TimedeltaIndex(["1H", "2H"]), None, - pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1H", "2H"]), ), # Category (["a", "b"], "category", pd.Categorical(["a", "b"])), @@ -110,27 +111,19 @@ ( [pd.Interval(1, 2), pd.Interval(3, 4)], "interval", - pd.arrays.IntervalArray.from_tuples([(1, 2), (3, 4)]), + IntervalArray.from_tuples([(1, 2), (3, 4)]), ), # Sparse - ([0, 1], "Sparse[int64]", pd.SparseArray([0, 1], dtype="int64")), + ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")), # IntegerNA ([1, None], "Int16", integer_array([1, None], dtype="Int16")), (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # String - (["a", None], "string", pd.arrays.StringArray._from_sequence(["a", None])), - ( - ["a", None], - pd.StringDtype(), - pd.arrays.StringArray._from_sequence(["a", None]), - ), + (["a", None], "string", StringArray._from_sequence(["a", None])), + (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None]),), # Boolean - ([True, None], "boolean", 
pd.arrays.BooleanArray._from_sequence([True, None])), - ( - [True, None], - pd.BooleanDtype(), - pd.arrays.BooleanArray._from_sequence([True, None]), - ), + ([True, None], "boolean", BooleanArray._from_sequence([True, None])), + ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None]),), # Index (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # Series[EA] returns the EA @@ -181,31 +174,28 @@ def test_array_copy(): period_array(["2000", "2001"], freq="D"), ), # interval - ( - [pd.Interval(0, 1), pd.Interval(1, 2)], - pd.arrays.IntervalArray.from_breaks([0, 1, 2]), - ), + ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2]),), # datetime ( [pd.Timestamp("2000"), pd.Timestamp("2001")], - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), ( [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"]), + DatetimeArray._from_sequence(["2000", "2001"]), ), ( np.array([1, 2], dtype="M8[ns]"), - pd.arrays.DatetimeArray(np.array([1, 2], dtype="M8[ns]")), + DatetimeArray(np.array([1, 2], dtype="M8[ns]")), ), ( np.array([1, 2], dtype="M8[us]"), - pd.arrays.DatetimeArray(np.array([1000, 2000], dtype="M8[ns]")), + DatetimeArray(np.array([1000, 2000], dtype="M8[ns]")), ), # datetimetz ( [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")], - pd.arrays.DatetimeArray._from_sequence( + DatetimeArray._from_sequence( ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET") ), ), @@ -214,30 +204,30 @@ def test_array_copy(): datetime.datetime(2000, 1, 1, tzinfo=cet), datetime.datetime(2001, 1, 1, tzinfo=cet), ], - pd.arrays.DatetimeArray._from_sequence(["2000", "2001"], tz=cet), + DatetimeArray._from_sequence(["2000", "2001"], tz=cet), ), # timedelta ( [pd.Timedelta("1H"), pd.Timedelta("2H")], - pd.arrays.TimedeltaArray._from_sequence(["1H", "2H"]), + TimedeltaArray._from_sequence(["1H", "2H"]), ), ( np.array([1, 2], dtype="m8[ns]"), - pd.arrays.TimedeltaArray(np.array([1, 2], dtype="m8[ns]")), + TimedeltaArray(np.array([1, 2], dtype="m8[ns]")), ), ( np.array([1, 2], dtype="m8[us]"), - pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")), + TimedeltaArray(np.array([1000, 2000], dtype="m8[ns]")), ), # integer - ([1, 2], pd.arrays.IntegerArray._from_sequence([1, 2])), - ([1, None], pd.arrays.IntegerArray._from_sequence([1, None])), + ([1, 2], IntegerArray._from_sequence([1, 2])), + ([1, None], IntegerArray._from_sequence([1, None])), # string - (["a", "b"], pd.arrays.StringArray._from_sequence(["a", "b"])), - (["a", None], pd.arrays.StringArray._from_sequence(["a", None])), + (["a", "b"], StringArray._from_sequence(["a", "b"])), + (["a", None], StringArray._from_sequence(["a", None])), # Boolean - ([True, False], pd.arrays.BooleanArray._from_sequence([True, False])), - ([True, None], pd.arrays.BooleanArray._from_sequence([True, None])), + ([True, False], BooleanArray._from_sequence([True, False])), + ([True, None], BooleanArray._from_sequence([True, None])), ], ) def test_array_inference(data, expected): diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py index abec4b42c0ffb..cc8d0cdcb518d 100644 --- a/pandas/tests/arrays/test_boolean.py +++ b/pandas/tests/arrays/test_boolean.py @@ -6,10 +6,10 @@ import pandas.util._test_decorators as td import pandas as pd +import pandas._testing as tm from pandas.arrays import BooleanArray from pandas.core.arrays.boolean import 
coerce_to_array from pandas.tests.extension.base import BaseOpsUtil -import pandas.util.testing as tm def make_data(): @@ -251,6 +251,87 @@ def test_coerce_to_numpy_array(): np.array(arr, dtype="bool") +def test_repr(): + df = pd.DataFrame({"A": pd.array([True, False, None], dtype="boolean")}) + expected = " A\n0 True\n1 False\n2 " + assert repr(df) == expected + + expected = "0 True\n1 False\n2 \nName: A, dtype: boolean" + assert repr(df.A) == expected + + expected = "\n[True, False, ]\nLength: 3, dtype: boolean" + assert repr(df.A.array) == expected + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy(box): + con = pd.Series if box else pd.array + # default (with or without missing values) -> object dtype + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, True], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy() + expected = np.array([True, False, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype="str") + expected = np.array([True, False, pd.NA], dtype=" can convert to bool, otherwise raises + arr = con([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype="bool") + expected = np.array([True, False, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + arr = con([True, False, None], dtype="boolean") + with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype"): + result = arr.to_numpy(dtype="bool") + + # specify dtype and na_value + arr = con([True, False, None], dtype="boolean") + result = arr.to_numpy(dtype=object, na_value=None) + expected = np.array([True, False, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype=bool, na_value=False) + expected = np.array([True, False, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="int64", na_value=-99) + expected = np.array([1, 0, -99], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([1, 0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # converting to int or float without specifying na_value raises + with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): + arr.to_numpy(dtype="int64") + with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): + arr.to_numpy(dtype="float64") + + +def test_to_numpy_copy(): + # to_numpy can be zero-copy if no missing values + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool) + result[0] = False + tm.assert_extension_array_equal( + arr, pd.array([False, False, True], dtype="boolean") + ) + + arr = pd.array([True, False, True], dtype="boolean") + result = arr.to_numpy(dtype=bool, copy=True) + result[0] = False + tm.assert_extension_array_equal(arr, pd.array([True, False, True], dtype="boolean")) + + def test_astype(): # with missing values arr = pd.array([True, False, None], dtype="boolean") @@ -265,6 +346,10 @@ def test_astype(): expected = np.array([1, 0, np.nan], dtype="float64") tm.assert_numpy_array_equal(result, expected) + result = arr.astype("str") + expected = np.array(["True", "False", ""], dtype="object") + tm.assert_numpy_array_equal(result, expected) + # no missing 
values arr = pd.array([True, False, True], dtype="boolean") result = arr.astype("int64") @@ -783,3 +868,14 @@ def test_arrow_roundtrip(): result = table.to_pandas() assert isinstance(result["a"].dtype, pd.BooleanDtype) tm.assert_frame_equal(result, df) + + +def test_value_counts_na(): + arr = pd.array([True, False, pd.NA], dtype="boolean") + result = arr.value_counts(dropna=False) + expected = pd.Series([1, 1, 1], index=[True, False, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([1, 1], index=[True, False], dtype="Int64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 84b6d45b78fe8..87b825c8c27bd 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -4,13 +4,14 @@ import pytest from pandas._libs import OutOfBoundsDatetime +from pandas.compat.numpy import _np_version_under1p18 import pandas as pd +import pandas._testing as tm from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex from pandas.core.indexes.timedeltas import TimedeltaIndex -import pandas.util.testing as tm # TODO: more freq variants @@ -64,8 +65,8 @@ def test_compare_len1_raises(self): # to the case where one has length-1, which numpy would broadcast data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq="D") - arr = self.array_cls(idx) + idx = self.array_cls._simple_new(data, freq="D") + arr = self.index_cls(idx) with pytest.raises(ValueError, match="Lengths must match"): arr == arr[:1] @@ -78,8 +79,8 @@ def test_take(self): data = np.arange(100, dtype="i8") * 24 * 3600 * 10 ** 9 np.random.shuffle(data) - idx = self.index_cls._simple_new(data, freq="D") - arr = self.array_cls(idx) + arr = self.array_cls._simple_new(data, freq="D") + idx = self.index_cls._simple_new(arr) takers = [1, 4, 94] result = arr.take(takers) @@ -96,8 +97,7 @@ def test_take(self): def test_take_fill(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq="D") - arr = self.array_cls(idx) + arr = self.array_cls._simple_new(data, freq="D") result = arr.take([-1, 1], allow_fill=True, fill_value=None) assert result[0] is pd.NaT @@ -120,7 +120,9 @@ def test_take_fill(self): def test_concat_same_type(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 - idx = self.index_cls._simple_new(data, freq="D").insert(0, pd.NaT) + arr = self.array_cls._simple_new(data, freq="D") + idx = self.index_cls(arr) + idx = idx.insert(0, pd.NaT) arr = self.array_cls(idx) result = arr._concat_same_type([arr[:-1], arr[1:], arr]) @@ -225,6 +227,19 @@ def test_setitem_raises(self): with pytest.raises(TypeError, match="'value' should be a.* 'object'"): arr[0] = object() + def test_inplace_arithmetic(self): + # GH#24115 check that iadd and isub are actually in-place + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = self.array_cls(data, freq="D") + + expected = arr + pd.Timedelta(days=1) + arr += pd.Timedelta(days=1) + tm.assert_equal(arr, expected) + + expected = arr - pd.Timedelta(days=1) + arr -= pd.Timedelta(days=1) + tm.assert_equal(arr, expected) + class TestDatetimeArray(SharedTests): index_cls = pd.DatetimeIndex @@ -745,3 +760,38 @@ def test_invalid_nat_setitem_array(array, non_casting_nats): for nat in 
non_casting_nats: with pytest.raises(TypeError): array[0] = nat + + +@pytest.mark.parametrize( + "array", + [ + pd.date_range("2000", periods=4).array, + pd.timedelta_range("2000", periods=4).array, + ], +) +def test_to_numpy_extra(array): + if _np_version_under1p18: + # np.isnan(NaT) raises, so use pandas' + isnan = pd.isna + else: + isnan = np.isnan + + array[0] = pd.NaT + original = array.copy() + + result = array.to_numpy() + assert isnan(result[0]) + + result = array.to_numpy(dtype="int64") + assert result[0] == -9223372036854775808 + + result = array.to_numpy(dtype="int64", na_value=0) + assert result[0] == 0 + + result = array.to_numpy(na_value=array[1].to_numpy()) + assert result[0] == result[1] + + result = array.to_numpy(na_value=array[1].to_numpy(copy=False)) + assert result[0] == result[1] + + tm.assert_equal(array, original) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index c3cda22497ecb..a59ed429cc404 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -9,9 +9,9 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd +import pandas._testing as tm from pandas.core.arrays import DatetimeArray from pandas.core.arrays.datetimes import sequence_to_dt64ns -import pandas.util.testing as tm class TestDatetimeArrayConstructor: @@ -24,8 +24,8 @@ def test_only_1dim_accepted(self): arr = np.array([0, 1, 2, 3], dtype="M8[h]").astype("M8[ns]") with pytest.raises(ValueError, match="Only 1-dimensional"): - # 2-dim - DatetimeArray(arr.reshape(2, 2)) + # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 + DatetimeArray(arr.reshape(2, 2, 1)) with pytest.raises(ValueError, match="Only 1-dimensional"): # 0-dim @@ -173,7 +173,7 @@ def test_tz_setter_raises(self): def test_setitem_different_tz_raises(self): data = np.array([1, 2, 3], dtype="M8[ns]") arr = DatetimeArray(data, copy=False, dtype=DatetimeTZDtype(tz="US/Central")) - with pytest.raises(ValueError, match="None"): + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): arr[0] = pd.Timestamp("2000") with pytest.raises(ValueError, match="US/Central"): @@ -282,6 +282,71 @@ def test_array_interface(self): ) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("index", [True, False]) + def test_searchsorted_different_tz(self, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = DatetimeArray(data, freq="D").tz_localize("Asia/Tokyo") + if index: + arr = pd.Index(arr) + + expected = arr.searchsorted(arr[2]) + result = arr.searchsorted(arr[2].tz_convert("UTC")) + assert result == expected + + expected = arr.searchsorted(arr[2:6]) + result = arr.searchsorted(arr[2:6].tz_convert("UTC")) + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("index", [True, False]) + def test_searchsorted_tzawareness_compat(self, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = DatetimeArray(data, freq="D") + if index: + arr = pd.Index(arr) + + mismatch = arr.tz_localize("Asia/Tokyo") + + msg = "Cannot compare tz-naive and tz-aware datetime-like objects" + with pytest.raises(TypeError, match=msg): + arr.searchsorted(mismatch[0]) + with pytest.raises(TypeError, match=msg): + arr.searchsorted(mismatch) + + with pytest.raises(TypeError, match=msg): + mismatch.searchsorted(arr[0]) + with pytest.raises(TypeError, match=msg): + mismatch.searchsorted(arr) + + @pytest.mark.parametrize( + "other", + [ + 1, + np.int64(1), + 1.0, + np.timedelta64("NaT"), + 
pd.Timedelta(days=2), + "invalid", + np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9, + np.arange(10).view("timedelta64[ns]") * 24 * 3600 * 10 ** 9, + pd.Timestamp.now().to_period("D"), + ], + ) + @pytest.mark.parametrize("index", [True, False]) + def test_searchsorted_invalid_types(self, other, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = DatetimeArray(data, freq="D") + if index: + arr = pd.Index(arr) + + msg = "|".join( + [ + "searchsorted requires compatible dtype or scalar", + "Unexpected type for 'value'", + ] + ) + with pytest.raises(TypeError, match=msg): + arr.searchsorted(other) + class TestSequenceToDT64NS: def test_tz_dtype_mismatch_raises(self): diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 7bb0b065df1da..f1a7cc741603d 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -6,6 +6,7 @@ from pandas.core.dtypes.generic import ABCIndexClass import pandas as pd +import pandas._testing as tm from pandas.api.types import is_float, is_float_dtype, is_integer, is_scalar from pandas.core.arrays import IntegerArray, integer_array from pandas.core.arrays.integer import ( @@ -19,7 +20,6 @@ UInt64Dtype, ) from pandas.tests.extension.base import BaseOpsUtil -import pandas.util.testing as tm def make_data(): @@ -90,7 +90,7 @@ def test_repr_dtype(dtype, expected): def test_repr_array(): result = repr(integer_array([1, None, 3])) - expected = "\n[1, NaN, 3]\nLength: 3, dtype: Int64" + expected = "\n[1, , 3]\nLength: 3, dtype: Int64" assert result == expected @@ -98,9 +98,9 @@ def test_repr_array_long(): data = integer_array([1, 2, None] * 1000) expected = ( "\n" - "[ 1, 2, NaN, 1, 2, NaN, 1, 2, NaN, 1,\n" + "[ 1, 2, , 1, 2, , 1, 2, , 1,\n" " ...\n" - " NaN, 1, 2, NaN, 1, 2, NaN, 1, 2, NaN]\n" + " , 1, 2, , 1, 2, , 1, 2, ]\n" "Length: 3000, dtype: Int64" ) result = repr(data) @@ -108,13 +108,19 @@ def test_repr_array_long(): class TestConstructors: + def test_uses_pandas_na(self): + a = pd.array([1, None], dtype=pd.Int64Dtype()) + assert a[1] is pd.NA + def test_from_dtype_from_float(self, data): # construct from our dtype & string dtype dtype = data.dtype # from float expected = pd.Series(data) - result = pd.Series(np.array(data).astype("float"), dtype=str(dtype)) + result = pd.Series( + data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype) + ) tm.assert_series_equal(result, expected) # from int / list @@ -156,10 +162,13 @@ def _check_op(self, s, op_name, other, exc=None): # 1 ** na is na, so need to unmask those if op_name == "__pow__": - mask = np.where(s == 1, False, mask) + mask = np.where(~s.isna() & (s == 1), False, mask) elif op_name == "__rpow__": - mask = np.where(other == 1, False, mask) + other_is_one = other == 1 + if isinstance(other_is_one, pd.Series): + other_is_one = other_is_one.fillna(False) + mask = np.where(other_is_one, False, mask) # float result type or float op if ( @@ -193,7 +202,7 @@ def _check_op_integer(self, result, expected, mask, s, op_name, other): # to compare properly, we convert the expected # to float, mask to nans and convert infs # if we have uints then we process as uints - # then conert to float + # then convert to float # and we ultimately want to create a IntArray # for comparisons @@ -208,20 +217,27 @@ def _check_op_integer(self, result, expected, mask, s, op_name, other): else: expected = expected.fillna(0) else: - expected[(s.values == 0) & ((expected == 0) | expected.isna())] = 0 + expected[ + (s.values == 0).fillna(False) + & 
((expected == 0).fillna(False) | expected.isna()) + ] = 0 try: - expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value original = expected expected = expected.astype(s.dtype) except ValueError: expected = expected.astype(float) - expected[(expected == np.inf) | (expected == -np.inf)] = fill_value + expected[ + ((expected == np.inf) | (expected == -np.inf)).fillna(False) + ] = fill_value original = expected expected = expected.astype(s.dtype) - expected[mask] = np.nan + expected[mask] = pd.NA # assert that the expected astype is ok # (skip for unsigned as they have wrap around) @@ -255,21 +271,18 @@ def test_arith_integer_array(self, data, all_arithmetic_operators): def test_arith_series_with_scalar(self, data, all_arithmetic_operators): # scalar op = all_arithmetic_operators - s = pd.Series(data) self._check_op(s, op, 1, exc=TypeError) def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): # frame & scalar op = all_arithmetic_operators - df = pd.DataFrame({"A": data}) self._check_op(df, op, 1, exc=TypeError) def test_arith_series_with_array(self, data, all_arithmetic_operators): # ndarray & other series op = all_arithmetic_operators - s = pd.Series(data) other = np.ones(len(s), dtype=s.dtype.type) self._check_op(s, op, other, exc=TypeError) @@ -350,24 +363,26 @@ def test_divide_by_zero(self, zero, negative): tm.assert_numpy_array_equal(result, expected) def test_pow_scalar(self): - a = pd.array([0, 1, None, 2], dtype="Int64") + a = pd.array([-1, 0, 1, None, 2], dtype="Int64") result = a ** 0 - expected = pd.array([1, 1, 1, 1], dtype="Int64") + expected = pd.array([1, 1, 1, 1, 1], dtype="Int64") tm.assert_extension_array_equal(result, expected) result = a ** 1 - expected = pd.array([0, 1, None, 2], dtype="Int64") + expected = pd.array([-1, 0, 1, None, 2], dtype="Int64") tm.assert_extension_array_equal(result, expected) - # result = a ** pd.NA - # expected = pd.array([None, 1, None, None], dtype="Int64") - # tm.assert_extension_array_equal(result, expected) + result = a ** pd.NA + expected = pd.array([None, None, 1, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) result = a ** np.nan - expected = np.array([np.nan, 1, np.nan, np.nan], dtype="float64") + expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64") tm.assert_numpy_array_equal(result, expected) # reversed + a = a[1:] # Can't raise integers to negative powers. 
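# Aside (editor's illustrative sketch, not part of this patch): the power
# assertions in test_pow_scalar rely on the masked-dtype convention that
# 1 ** pd.NA is 1 and pd.NA ** 0 is 1, while any other power involving pd.NA
# propagates NA. A minimal standalone check, assuming pandas >= 1.0 nullable Int64:
import pandas as pd

a = pd.array([-1, 0, 1, None, 2], dtype="Int64")
print(a ** pd.NA)      # [<NA>, <NA>, 1, <NA>, <NA>]  -- only the base 1 survives
print(pd.NA ** a[1:])  # [1, <NA>, <NA>, <NA>]        -- only the exponent 0 survives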
+ result = 0 ** a expected = pd.array([1, 0, None, 0], dtype="Int64") tm.assert_extension_array_equal(result, expected) @@ -376,9 +391,9 @@ def test_pow_scalar(self): expected = pd.array([1, 1, 1, 1], dtype="Int64") tm.assert_extension_array_equal(result, expected) - # result = pd.NA ** a - # expected = pd.array([1, None, None, None], dtype="Int64") - # tm.assert_extension_array_equal(result, expected) + result = pd.NA ** a + expected = pd.array([1, None, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) result = np.nan ** a expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64") @@ -406,10 +421,10 @@ def _compare_other(self, data, op_name, other): # array result = pd.Series(op(data, other)) - expected = pd.Series(op(data._data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") # fill the nan locations - expected[data._mask] = op_name == "__ne__" + expected[data._mask] = pd.NA tm.assert_series_equal(result, expected) @@ -417,22 +432,61 @@ def _compare_other(self, data, op_name, other): s = pd.Series(data) result = op(s, other) - expected = pd.Series(data._data) - expected = op(expected, other) + expected = op(pd.Series(data._data), other) # fill the nan locations - expected[data._mask] = op_name == "__ne__" + expected[data._mask] = pd.NA + expected = expected.astype("boolean") tm.assert_series_equal(result, expected) - def test_compare_scalar(self, data, all_compare_operators): - op_name = all_compare_operators - self._compare_other(data, op_name, 0) + @pytest.mark.parametrize("other", [True, False, pd.NA, -1, 0, 1]) + def test_scalar(self, other, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([1, 0, None], dtype="Int64") - def test_compare_array(self, data, all_compare_operators): - op_name = all_compare_operators - other = pd.Series([0] * len(data)) - self._compare_other(data, op_name, other) + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = pd.arrays.BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal(a, pd.array([1, 0, None], dtype="Int64")) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([0, 1, 2, None, None, None], dtype="Int64") + b = pd.array([0, 1, None, 0, 1, None], dtype="Int64") + + result = op(a, b) + values = op(a._data, b._data) + mask = a._mask | b._mask + + expected = pd.arrays.BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal( + a, pd.array([0, 1, 2, None, None, None], dtype="Int64") + ) + tm.assert_extension_array_equal( + b, pd.array([0, 1, None, 0, 1, None], dtype="Int64") + ) + + def test_compare_with_booleanarray(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None] * 3, dtype="boolean") + b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Int64") + other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean") + expected = op(a, other) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) def test_no_shared_mask(self, data): result = data + 1 @@ -442,20 +496,21 @@ def test_compare_to_string(self, any_nullable_int_dtype): # GH 
28930 s = pd.Series([1, None], dtype=any_nullable_int_dtype) result = s == "a" - expected = pd.Series([False, False]) + expected = pd.Series([False, pd.NA], dtype="boolean") self.assert_series_equal(result, expected) def test_compare_to_int(self, any_nullable_int_dtype, all_compare_operators): # GH 28930 - s1 = pd.Series([1, 2, 3], dtype=any_nullable_int_dtype) - s2 = pd.Series([1, 2, 3], dtype="int") + s1 = pd.Series([1, None, 3], dtype=any_nullable_int_dtype) + s2 = pd.Series([1, None, 3], dtype="float") method = getattr(s1, all_compare_operators) result = method(2) method = getattr(s2, all_compare_operators) - expected = method(2) + expected = method(2).astype("boolean") + expected[s2.isna()] = pd.NA self.assert_series_equal(result, expected) @@ -543,6 +598,17 @@ def test_astype(self, all_data): expected = pd.Series(np.asarray(mixed)) tm.assert_series_equal(result, expected) + def test_astype_to_larger_numpy(self): + a = pd.array([1, 2], dtype="Int32") + result = a.astype("int64") + expected = np.array([1, 2], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + a = pd.array([1, 2], dtype="UInt32") + result = a.astype("uint64") + expected = np.array([1, 2], dtype="uint64") + tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize("dtype", [Int8Dtype(), "Int8", UInt32Dtype(), "UInt32"]) def test_astype_specific_casting(self, dtype): s = pd.Series([1, 2, 3], dtype="Int64") @@ -572,12 +638,54 @@ def test_construct_cast_invalid(self, dtype): with pytest.raises(TypeError, match=msg): pd.Series(arr).astype(dtype) + @pytest.mark.parametrize("in_series", [True, False]) + def test_to_numpy_na_nan(self, in_series): + a = pd.array([0, 1, None], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([0.0, 1.0, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="int64", na_value=-1) + expected = np.array([0, 1, -1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = a.to_numpy(dtype="bool", na_value=False) + expected = np.array([False, True, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("in_series", [True, False]) + @pytest.mark.parametrize("dtype", ["int32", "int64", "bool"]) + def test_to_numpy_dtype(self, dtype, in_series): + a = pd.array([0, 1], dtype="Int64") + if in_series: + a = pd.Series(a) + + result = a.to_numpy(dtype=dtype) + expected = np.array([0, 1], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["float64", "int64", "bool"]) + def test_to_numpy_na_raises(self, dtype): + a = pd.array([0, 1, None], dtype="Int64") + with pytest.raises(ValueError, match=dtype): + a.to_numpy(dtype=dtype) + + def test_astype_str(self): + a = pd.array([1, 2, None], dtype="Int64") + expected = np.array(["1", "2", ""], dtype=object) + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) + def test_frame_repr(data_missing): df = pd.DataFrame({"A": data_missing}) result = repr(df) - expected = " A\n0 NaN\n1 1" + expected = " A\n0 \n1 1" assert result == expected @@ -593,7 +701,7 @@ def test_conversions(data_missing): # we assert that we are exactly equal # including type conversions of scalars result = df["A"].astype("object").values - expected = np.array([np.nan, 1], dtype=object) + expected = np.array([pd.NA, 1], dtype=object) tm.assert_numpy_array_equal(result, 
expected) for r, e in zip(result, expected): @@ -756,7 +864,7 @@ def test_cross_type_arithmetic(): tm.assert_series_equal(result, expected) result = (df.A + df.C) * 3 == 12 - expected = pd.Series([False, True, False]) + expected = pd.Series([False, True, None], dtype="boolean") tm.assert_series_equal(result, expected) result = df.A + df.B @@ -820,7 +928,7 @@ def test_reduce_to_float(op): def test_astype_nansafe(): # see gh-22343 arr = integer_array([np.nan, 1, 2], dtype="Int8") - msg = "cannot convert float NaN to integer" + msg = "cannot convert to 'uint32'-dtype NumPy array with missing values." with pytest.raises(ValueError, match=msg): arr.astype("uint32") @@ -895,7 +1003,9 @@ def test_arrow_array(data): import pyarrow as pa arr = pa.array(data) - expected = pa.array(list(data), type=data.dtype.name.lower(), from_pandas=True) + expected = np.array(data, dtype=object) + expected[data.isna()] = None + expected = pa.array(expected, type=data.dtype.name.lower(), from_pandas=True) assert arr.equals(expected) @@ -931,6 +1041,17 @@ def test_stat_method(pandasmethname, kwargs): assert expected == result +def test_value_counts_na(): + arr = pd.array([1, 2, 1, pd.NA], dtype="Int64") + result = arr.value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=[1, 2, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=[1, 2], dtype="Int64") + tm.assert_series_equal(result, expected) + + # TODO(jreback) - these need testing / are broken # shift diff --git a/pandas/tests/arrays/test_numpy.py b/pandas/tests/arrays/test_numpy.py index 7a150c35fea09..86793c4ec50dd 100644 --- a/pandas/tests/arrays/test_numpy.py +++ b/pandas/tests/arrays/test_numpy.py @@ -6,9 +6,9 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.arrays import PandasArray from pandas.core.arrays.numpy_ import PandasDtype -import pandas.util.testing as tm @pytest.fixture( @@ -226,3 +226,25 @@ def test_setitem_no_coercion(): arr = PandasArray(np.array([1, 2, 3])) with pytest.raises(ValueError, match="int"): arr[0] = "a" + + # With a value that we do coerce, check that we coerce the value + # and not the underlying array. 
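# Aside (editor's illustrative sketch, not part of this patch): the PandasArray
# setitem checks in this file boil down to two points -- the assigned value is
# coerced to the array's existing dtype rather than upcasting the array, and
# writes go into the shared NumPy buffer, so views observe them (mirroring the
# assertions that follow).
import numpy as np
from pandas.arrays import PandasArray

arr = PandasArray(np.array([1, 2, 3]))
view = np.asarray(arr)   # shares the underlying int64 buffer
arr[0] = 2.5             # coerced to the existing integer dtype
assert arr[0] == 2 and isinstance(arr[0], np.integer)
assert view[0] == 2      # the view sees the in-place write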
+ arr[0] = 2.5 + assert isinstance(arr[0], (int, np.integer)), type(arr[0]) + + +def test_setitem_preserves_views(): + # GH#28150, see also extension test of the same name + arr = PandasArray(np.array([1, 2, 3])) + view1 = arr.view() + view2 = arr[:] + view3 = np.asarray(arr) + + arr[0] = 9 + assert view1[0] == 9 + assert view2[0] == 9 + assert view3[0] == 9 + + arr[-1] = 2.5 + view1[-1] = 5 + assert arr[-1] == 5 diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 252f278242fcc..1f4351c7e20ee 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -3,12 +3,13 @@ from pandas._libs.tslibs import iNaT from pandas._libs.tslibs.period import IncompatibleFrequency +import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import PeriodDtype, registry import pandas as pd +import pandas._testing as tm from pandas.core.arrays import PeriodArray, period_array -import pandas.util.testing as tm # ---------------------------------------------------------------------------- # Dtype @@ -323,3 +324,91 @@ def test_min_max_empty(self, skipna): result = arr.max(skipna=skipna) assert result is pd.NaT + + +# ---------------------------------------------------------------------------- +# Arrow interaction + +pyarrow_skip = pyarrow_skip = td.skip_if_no("pyarrow", min_version="0.15.1.dev") + + +@pyarrow_skip +def test_arrow_extension_type(): + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + p1 = ArrowPeriodType("D") + p2 = ArrowPeriodType("D") + p3 = ArrowPeriodType("M") + + assert p1.freq == "D" + assert p1 == p2 + assert not p1 == p3 + assert hash(p1) == hash(p2) + assert not hash(p1) == hash(p3) + + +@pyarrow_skip +@pytest.mark.parametrize( + "data, freq", + [ + (pd.date_range("2017", periods=3), "D"), + (pd.date_range("2017", periods=3, freq="A"), "A-DEC"), + ], +) +def test_arrow_array(data, freq): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + periods = period_array(data, freq=freq) + result = pa.array(periods) + assert isinstance(result.type, ArrowPeriodType) + assert result.type.freq == freq + expected = pa.array(periods.asi8, type="int64") + assert result.storage.equals(expected) + + # convert to its storage type + result = pa.array(periods, type=pa.int64()) + assert result.equals(expected) + + # unsupported conversions + with pytest.raises(TypeError): + pa.array(periods, type="float64") + + with pytest.raises(TypeError, match="different 'freq'"): + pa.array(periods, type=ArrowPeriodType("T")) + + +@pyarrow_skip +def test_arrow_array_missing(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + arr = PeriodArray([1, 2, 3], freq="D") + arr[1] = pd.NaT + + result = pa.array(arr) + assert isinstance(result.type, ArrowPeriodType) + assert result.type.freq == "D" + expected = pa.array([1, None, 3], type="int64") + assert result.storage.equals(expected) + + +@pyarrow_skip +def test_arrow_table_roundtrip(): + import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType + + arr = PeriodArray([1, 2, 3], freq="D") + arr[1] = pd.NaT + df = pd.DataFrame({"a": arr}) + + table = pa.table(df) + assert isinstance(table.field("a").type, ArrowPeriodType) + result = table.to_pandas() + assert isinstance(result["a"].dtype, PeriodDtype) + tm.assert_frame_equal(result, df) + + table2 = pa.concat_tables([table, table]) + result = table2.to_pandas() + expected = pd.concat([df, df], ignore_index=True) + 
tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index 42e7bee97e671..c86b4f71ee592 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -2,8 +2,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.core.arrays import TimedeltaArray -import pandas.util.testing as tm class TestTimedeltaArrayConstructor: @@ -12,8 +12,8 @@ def test_only_1dim_accepted(self): arr = np.array([0, 1, 2, 3], dtype="m8[h]").astype("m8[ns]") with pytest.raises(ValueError, match="Only 1-dimensional"): - # 2-dim - TimedeltaArray(arr.reshape(2, 2)) + # 3-dim, we allow 2D to sneak in for ops purposes GH#29853 + TimedeltaArray(arr.reshape(2, 2, 1)) with pytest.raises(ValueError, match="Only 1-dimensional"): # 0-dim @@ -41,13 +41,12 @@ def test_other_type_raises(self): def test_incorrect_dtype_raises(self): # TODO: why TypeError for 'category' but ValueError for i8? with pytest.raises( - ValueError, match=r"category cannot be converted " r"to timedelta64\[ns\]" + ValueError, match=r"category cannot be converted to timedelta64\[ns\]" ): TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype="category") with pytest.raises( - ValueError, - match=r"dtype int64 cannot be converted " r"to timedelta64\[ns\]", + ValueError, match=r"dtype int64 cannot be converted to timedelta64\[ns\]", ): TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64")) @@ -141,6 +140,36 @@ def test_setitem_objects(self, obj): arr[0] = obj assert arr[0] == pd.Timedelta(seconds=1) + @pytest.mark.parametrize( + "other", + [ + 1, + np.int64(1), + 1.0, + np.datetime64("NaT"), + pd.Timestamp.now(), + "invalid", + np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9, + (np.arange(10) * 24 * 3600 * 10 ** 9).view("datetime64[ns]"), + pd.Timestamp.now().to_period("D"), + ], + ) + @pytest.mark.parametrize("index", [True, False]) + def test_searchsorted_invalid_types(self, other, index): + data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 + arr = TimedeltaArray(data, freq="D") + if index: + arr = pd.Index(arr) + + msg = "|".join( + [ + "searchsorted requires compatible dtype or scalar", + "Unexpected type for 'value'", + ] + ) + with pytest.raises(TypeError, match=msg): + arr.searchsorted(other) + class TestReductions: @pytest.mark.parametrize("name", ["sum", "std", "min", "max", "median"]) diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index a9e0473ac067a..0b7274399aafc 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -8,9 +8,9 @@ import pandas as pd from pandas import DataFrame, Index, Series +import pandas._testing as tm from pandas.core.accessor import PandasDelegate from pandas.core.base import NoNewAttributesMixin, PandasObject -import pandas.util.testing as tm class TestPandasDelegate: diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 8fa52af832907..07a15d0619bb6 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -6,8 +6,15 @@ import pandas as pd from pandas import CategoricalIndex, Series, Timedelta, Timestamp -from pandas.core.arrays import DatetimeArray, PandasArray, TimedeltaArray -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays import ( + DatetimeArray, + IntervalArray, + PandasArray, + PeriodArray, + SparseArray, + TimedeltaArray, +) class TestToIterable: @@ 
-177,14 +184,10 @@ def test_iter_box(self): ), ( pd.PeriodIndex([2018, 2019], freq="A"), - pd.core.arrays.PeriodArray, + PeriodArray, pd.core.dtypes.dtypes.PeriodDtype("A-DEC"), ), - ( - pd.IntervalIndex.from_breaks([0, 1, 2]), - pd.core.arrays.IntervalArray, - "interval", - ), + (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval",), # This test is currently failing for datetime64[ns] and timedelta64[ns]. # The NumPy type system is sufficient for representing these types, so # we just use NumPy for Series / DataFrame columns of these types (so @@ -270,8 +273,8 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): (pd.Categorical(["a", "b"]), "_codes"), (pd.core.arrays.period_array(["2000", "2001"], freq="D"), "_data"), (pd.core.arrays.integer_array([0, np.nan]), "_data"), - (pd.core.arrays.IntervalArray.from_breaks([0, 1]), "_left"), - (pd.SparseArray([0, 1]), "_sparse_values"), + (IntervalArray.from_breaks([0, 1]), "_left"), + (SparseArray([0, 1]), "_sparse_values"), (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"), # tz-aware Datetime ( @@ -288,7 +291,7 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): def test_array(array, attr, index_or_series): box = index_or_series if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: - pytest.skip("No index type for {}".format(array.dtype)) + pytest.skip(f"No index type for {array.dtype}") result = box(array, copy=False).array if attr: @@ -315,13 +318,13 @@ def test_array_multiindex_raises(): ), ( pd.core.arrays.integer_array([0, np.nan]), - np.array([0, np.nan], dtype=object), + np.array([0, pd.NA], dtype=object), ), ( - pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]), + IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), ), - (pd.SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), + (SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), # tz-naive datetime ( DatetimeArray(np.array(["2000", "2001"], dtype="M8[ns]")), @@ -354,7 +357,7 @@ def test_to_numpy(array, expected, index_or_series): thing = box(array) if array.dtype.name in ("Int64", "Sparse[int64, 0]") and box is pd.Index: - pytest.skip("No index type for {}".format(array.dtype)) + pytest.skip(f"No index type for {array.dtype}") result = thing.to_numpy() tm.assert_numpy_array_equal(result, expected) @@ -401,3 +404,36 @@ def test_to_numpy_dtype(as_series): result = obj.to_numpy(dtype="M8[ns]") expected = np.array(["2000-01-01T05", "2001-01-01T05"], dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values, dtype, na_value, expected", + [ + ([1, 2, None], "float64", 0, [1.0, 2.0, 0.0]), + ( + [pd.Timestamp("2000"), pd.Timestamp("2000"), pd.NaT], + None, + pd.Timestamp("2000"), + [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + ), + ], +) +@pytest.mark.parametrize("container", [pd.Series, pd.Index]) # type: ignore +def test_to_numpy_na_value_numpy_dtype(container, values, dtype, na_value, expected): + s = container(values) + result = s.to_numpy(dtype=dtype, na_value=na_value) + expected = np.array(expected) + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_kwargs_raises(): + # numpy + s = pd.Series([1, 2, 3]) + match = r"to_numpy\(\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, match=match): + s.to_numpy(foo=True) + + # extension + s = pd.Series([1, 2, 3], dtype="Int64") + with pytest.raises(TypeError, match=match): + s.to_numpy(foo=True) diff --git a/pandas/tests/base/test_ops.py 
b/pandas/tests/base/test_ops.py index 04277ce929bca..2693eb12dda71 100644 --- a/pandas/tests/base/test_ops.py +++ b/pandas/tests/base/test_ops.py @@ -29,8 +29,8 @@ TimedeltaIndex, Timestamp, ) +import pandas._testing as tm from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin -import pandas.util.testing as tm class Ops: @@ -62,8 +62,8 @@ def setup_method(self, method): self.unicode_series = Series(arr, index=self.unicode_index, name="a") types = ["bool", "int", "float", "dt", "dt_tz", "period", "string", "unicode"] - self.indexes = [getattr(self, "{}_index".format(t)) for t in types] - self.series = [getattr(self, "{}_series".format(t)) for t in types] + self.indexes = [getattr(self, f"{t}_index") for t in types] + self.series = [getattr(self, f"{t}_series") for t in types] # To test narrow dtypes, we use narrower *data* elements, not *index* elements index = self.int_index @@ -79,7 +79,7 @@ def setup_method(self, method): self.uint32_series = Series(arr_int.astype(np.uint32), index=index, name="a") nrw_types = ["float32", "int8", "int16", "int32", "uint8", "uint16", "uint32"] - self.narrow_series = [getattr(self, "{}_series".format(t)) for t in nrw_types] + self.narrow_series = [getattr(self, f"{t}_series") for t in nrw_types] self.objs = self.indexes + self.series + self.narrow_series @@ -698,9 +698,7 @@ def test_duplicated_drop_duplicates_index(self): with pytest.raises( TypeError, - match=( - r"drop_duplicates\(\) got an " r"unexpected keyword argument" - ), + match=r"drop_duplicates\(\) got an unexpected keyword argument", ): idx.drop_duplicates(inplace=True) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index e8b6491c5026c..656b274aa1a9e 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -16,6 +16,7 @@ import pandas as pd from pandas import DataFrame, Series, compat, date_range +import pandas._testing as tm from pandas.core.computation import pytables from pandas.core.computation.check import _NUMEXPR_VERSION from pandas.core.computation.engines import NumExprClobberingError, _engines @@ -33,7 +34,6 @@ _special_case_arith_ops_syms, _unary_math_ops, ) -import pandas.util.testing as tm @pytest.fixture( @@ -274,9 +274,9 @@ def check_operands(left, right, cmp_op): def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = f"lhs {cmp1} rhs" msg = ( - r"only list-like( or dict-like)? objects are allowed to be" - r" passed to (DataFrame\.)?isin\(\), you passed a" - r" (\[|')bool(\]|')|" + r"only list-like( or dict-like)? objects are allowed to be " + r"passed to (DataFrame\.)?isin\(\), you passed a " + r"(\[|')bool(\]|')|" "argument of type 'bool' is not iterable" ) if cmp1 in ("in", "not in") and not is_list_like(rhs): @@ -339,8 +339,8 @@ def check_floor_division(self, lhs, arith1, rhs): self.check_equal(res, expected) else: msg = ( - r"unsupported operand type\(s\) for //: 'VariableNode' and" - " 'VariableNode'" + r"unsupported operand type\(s\) for //: 'VariableNode' and " + "'VariableNode'" ) with pytest.raises(TypeError, match=msg): pd.eval( @@ -408,9 +408,9 @@ def check_compound_invert_op(self, lhs, cmp1, rhs): ex = f"~(lhs {cmp1} rhs)" msg = ( - r"only list-like( or dict-like)? objects are allowed to be" - r" passed to (DataFrame\.)?isin\(\), you passed a" - r" (\[|')float(\]|')|" + r"only list-like( or dict-like)? 
objects are allowed to be " + r"passed to (DataFrame\.)?isin\(\), you passed a " + r"(\[|')float(\]|')|" "argument of type 'float' is not iterable" ) if is_scalar(rhs) and cmp1 in skip_these: @@ -1206,25 +1206,33 @@ def test_truediv(self): ex = "s / 1" d = {"s": s} # noqa - res = self.eval(ex, truediv=False) + # FutureWarning: The `truediv` parameter in pd.eval is deprecated and will be + # removed in a future version. + with tm.assert_produces_warning(FutureWarning): + res = self.eval(ex, truediv=False) tm.assert_numpy_array_equal(res, np.array([1.0])) - res = self.eval(ex, truediv=True) + with tm.assert_produces_warning(FutureWarning): + res = self.eval(ex, truediv=True) tm.assert_numpy_array_equal(res, np.array([1.0])) - res = self.eval("1 / 2", truediv=True) + with tm.assert_produces_warning(FutureWarning): + res = self.eval("1 / 2", truediv=True) expec = 0.5 assert res == expec - res = self.eval("1 / 2", truediv=False) + with tm.assert_produces_warning(FutureWarning): + res = self.eval("1 / 2", truediv=False) expec = 0.5 assert res == expec - res = self.eval("s / 2", truediv=False) + with tm.assert_produces_warning(FutureWarning): + res = self.eval("s / 2", truediv=False) expec = 0.5 assert res == expec - res = self.eval("s / 2", truediv=True) + with tm.assert_produces_warning(FutureWarning): + res = self.eval("s / 2", truediv=True) expec = 0.5 assert res == expec diff --git a/pandas/tests/config/test_localization.py b/pandas/tests/config/test_localization.py index 20a5be0c8a289..e815a90207a08 100644 --- a/pandas/tests/config/test_localization.py +++ b/pandas/tests/config/test_localization.py @@ -8,6 +8,8 @@ from pandas.compat import is_platform_windows +import pandas as pd + _all_locales = get_locales() or [] _current_locale = locale.getlocale() @@ -56,21 +58,21 @@ def test_get_locales_prefix(): @_skip_if_only_one_locale -def test_set_locale(): +@pytest.mark.parametrize( + "lang,enc", + [ + ("it_CH", "UTF-8"), + ("en_US", "ascii"), + ("zh_CN", "GB2312"), + ("it_IT", "ISO-8859-1"), + ], +) +def test_set_locale(lang, enc): if all(x is None for x in _current_locale): # Not sure why, but on some Travis runs with pytest, # getlocale() returned (None, None). pytest.skip("Current locale is not set.") - locale_override = os.environ.get("LOCALE_OVERRIDE", None) - - if locale_override is None: - lang, enc = "it_CH", "UTF-8" - elif locale_override == "C": - lang, enc = "en_US", "ascii" - else: - lang, enc = locale_override.split(".") - enc = codecs.lookup(enc).name new_locale = lang, enc @@ -91,3 +93,13 @@ def test_set_locale(): # Once we exit the "with" statement, locale should be back to what it was. 
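# Aside (editor's illustrative sketch, not part of this patch): the encoding
# comparison in the new test_encoding_detected below works because
# codecs.lookup() canonicalizes encoding aliases, so differently spelled but
# equivalent names from LC_ALL and pd.options.display.encoding compare equal.
import codecs

assert codecs.lookup("UTF-8").name == codecs.lookup("utf8").name == "utf-8"
assert codecs.lookup("ISO-8859-1").name == "iso8859-1"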
current_locale = locale.getlocale() assert current_locale == _current_locale + + +def test_encoding_detected(): + system_locale = os.environ.get("LC_ALL") + system_encoding = system_locale.split(".")[-1] if system_locale else "utf-8" + + assert ( + codecs.lookup(pd.options.display.encoding).name + == codecs.lookup(system_encoding).name + ) diff --git a/pandas/tests/dtypes/cast/test_construct_from_scalar.py b/pandas/tests/dtypes/cast/test_construct_from_scalar.py index 71f41fcf5b447..cc823a3d6e02c 100644 --- a/pandas/tests/dtypes/cast/test_construct_from_scalar.py +++ b/pandas/tests/dtypes/cast/test_construct_from_scalar.py @@ -2,7 +2,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype from pandas import Categorical -import pandas.util.testing as tm +import pandas._testing as tm def test_cast_1d_array_like_from_scalar_categorical(): diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py index 620e74f80d5fb..fe271392122a2 100644 --- a/pandas/tests/dtypes/cast/test_construct_ndarray.py +++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -3,7 +3,7 @@ from pandas.core.dtypes.cast import construct_1d_ndarray_preserving_na -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/dtypes/cast/test_downcast.py b/pandas/tests/dtypes/cast/test_downcast.py index 99afabfa42a04..d6e6ed3022b75 100644 --- a/pandas/tests/dtypes/cast/test_downcast.py +++ b/pandas/tests/dtypes/cast/test_downcast.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas import DatetimeIndex, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index 37fa003668435..2744cfa8ddc62 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -19,7 +19,7 @@ Timestamp, date_range, ) -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[True, False]) diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 0939e35bd64fa..69f8f46356a4d 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -8,7 +8,6 @@ import pytest from pandas._libs.tslibs import NaT -from pandas.compat import is_platform_windows from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( @@ -406,7 +405,6 @@ def test_maybe_promote_any_with_datetime64( _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) -@pytest.mark.xfail(reason="Fails to upcast to object") def test_maybe_promote_datetimetz_with_any_numpy_dtype( tz_aware_fixture, any_numpy_dtype_reduced ): @@ -427,11 +425,6 @@ def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, tz_aware_fix dtype = DatetimeTZDtype(tz=tz_aware_fixture) fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture2) - from dateutil.tz import tzlocal - - if is_platform_windows() and tz_aware_fixture2 == tzlocal(): - pytest.xfail("Cannot process fill_value with this dtype, see GH 24310") - # create array of given dtype; casts "1" to correct dtype fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] @@ -441,7 +434,6 @@ def test_maybe_promote_datetimetz_with_datetimetz(tz_aware_fixture, tz_aware_fix expected_dtype = dtype else: expected_dtype = np.dtype(object) - pytest.xfail("fails to 
cast to object") _check_promote(dtype, fill_value, expected_dtype, exp_val_for_scalar) diff --git a/pandas/tests/dtypes/cast/test_upcast.py b/pandas/tests/dtypes/cast/test_upcast.py index 49e850f3e87b5..bb7a7d059c7ee 100644 --- a/pandas/tests/dtypes/cast/test_upcast.py +++ b/pandas/tests/dtypes/cast/test_upcast.py @@ -4,7 +4,7 @@ from pandas.core.dtypes.cast import maybe_upcast_putmask from pandas import Series -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("result", [Series([10, 11, 12]), [10, 11, 12], (10, 11, 12)]) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 667ee467f2f29..097e83d93ee71 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -1,3 +1,4 @@ +from datetime import datetime from typing import List import numpy as np @@ -5,6 +6,7 @@ import pandas.util._test_decorators as td +from pandas.core.dtypes.cast import astype_nansafe import pandas.core.dtypes.common as com from pandas.core.dtypes.dtypes import ( CategoricalDtype, @@ -13,8 +15,11 @@ IntervalDtype, PeriodDtype, ) +from pandas.core.dtypes.missing import isna import pandas as pd +import pandas._testing as tm +from pandas.arrays import SparseArray from pandas.conftest import ( ALL_EA_INT_DTYPES, ALL_INT_DTYPES, @@ -23,7 +28,6 @@ UNSIGNED_EA_INT_DTYPES, UNSIGNED_INT_DTYPES, ) -import pandas.util.testing as tm # EA & Actual Dtypes @@ -179,7 +183,7 @@ def test_is_object(): "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)] ) def test_is_sparse(check_scipy): - assert com.is_sparse(pd.SparseArray([1, 2, 3])) + assert com.is_sparse(SparseArray([1, 2, 3])) assert not com.is_sparse(np.array([1, 2, 3])) @@ -195,7 +199,7 @@ def test_is_scipy_sparse(): assert com.is_scipy_sparse(bsr_matrix([1, 2, 3])) - assert not com.is_scipy_sparse(pd.SparseArray([1, 2, 3])) + assert not com.is_scipy_sparse(SparseArray([1, 2, 3])) def test_is_categorical(): @@ -488,7 +492,7 @@ def test_is_numeric_v_string_like(): def test_is_datetimelike_v_numeric(): - dt = np.datetime64(pd.datetime(2017, 1, 1)) + dt = np.datetime64(datetime(2017, 1, 1)) assert not com.is_datetimelike_v_numeric(1, 1) assert not com.is_datetimelike_v_numeric(dt, dt) @@ -573,7 +577,7 @@ def test_is_extension_type(check_scipy): cat = pd.Categorical([1, 2, 3]) assert com.is_extension_type(cat) assert com.is_extension_type(pd.Series(cat)) - assert com.is_extension_type(pd.SparseArray([1, 2, 3])) + assert com.is_extension_type(SparseArray([1, 2, 3])) assert com.is_extension_type(pd.DatetimeIndex(["2000"], tz="US/Eastern")) dtype = DatetimeTZDtype("ns", tz="US/Eastern") @@ -602,7 +606,7 @@ def test_is_extension_array_dtype(check_scipy): cat = pd.Categorical([1, 2, 3]) assert com.is_extension_array_dtype(cat) assert com.is_extension_array_dtype(pd.Series(cat)) - assert com.is_extension_array_dtype(pd.SparseArray([1, 2, 3])) + assert com.is_extension_array_dtype(SparseArray([1, 2, 3])) assert com.is_extension_array_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern")) dtype = DatetimeTZDtype("ns", tz="US/Eastern") @@ -625,18 +629,6 @@ def test_is_complex_dtype(): assert com.is_complex_dtype(np.array([1 + 1j, 5])) -def test_is_offsetlike(): - assert com.is_offsetlike(np.array([pd.DateOffset(month=3), pd.offsets.Nano()])) - assert com.is_offsetlike(pd.offsets.MonthEnd()) - assert com.is_offsetlike(pd.Index([pd.DateOffset(second=1)])) - - assert not com.is_offsetlike(pd.Timedelta(1)) - assert not com.is_offsetlike(np.array([1 + 1j, 5])) - - # mixed case - 
assert not com.is_offsetlike(np.array([pd.DateOffset(), pd.Timestamp(0)])) - - @pytest.mark.parametrize( "input_param,result", [ @@ -676,7 +668,8 @@ def test__get_dtype(input_param, result): (None, "Cannot deduce dtype from null object"), (1, "data type not understood"), (1.2, "data type not understood"), - ("random string", 'data type "random string" not understood'), + # numpy dev changed from double-quotes to single quotes + ("random string", "data type [\"']random string[\"'] not understood"), (pd.DataFrame([1, 2]), "data type not understood"), ], ) @@ -721,3 +714,42 @@ def test__get_dtype_fails(input_param, expected_error_message): ) def test__is_dtype_type(input_param, result): assert com._is_dtype_type(input_param, lambda tipo: tipo == result) + + +@pytest.mark.parametrize("val", [np.datetime64("NaT"), np.timedelta64("NaT")]) +@pytest.mark.parametrize("typ", [np.int64]) +def test_astype_nansafe(val, typ): + arr = np.array([val]) + + msg = "Cannot convert NaT values to integer" + with pytest.raises(ValueError, match=msg): + astype_nansafe(arr, dtype=typ) + + +@pytest.mark.parametrize("from_type", [np.datetime64, np.timedelta64]) +@pytest.mark.parametrize( + "to_type", + [ + np.uint8, + np.uint16, + np.uint32, + np.int8, + np.int16, + np.int32, + np.float16, + np.float32, + ], +) +def test_astype_datetime64_bad_dtype_raises(from_type, to_type): + arr = np.array([from_type("2018")]) + + with pytest.raises(TypeError, match="cannot astype"): + astype_nansafe(arr, dtype=to_type) + + +@pytest.mark.parametrize("from_type", [np.datetime64, np.timedelta64]) +def test_astype_object_preserves_datetime_na(from_type): + arr = np.array([from_type("NaT")]) + result = astype_nansafe(arr, dtype="object") + + assert isna(result)[0] diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 4dee6e3e92a7f..fddd6239df309 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -27,8 +27,8 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, IntervalIndex, Series, date_range -from pandas.core.arrays.sparse import SparseDtype -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays.sparse import SparseArray, SparseDtype class Base: @@ -408,6 +408,9 @@ def test_construction_from_string(self): with pytest.raises(TypeError): PeriodDtype.construct_from_string("datetime64[ns, US/Eastern]") + with pytest.raises(TypeError, match="list"): + PeriodDtype.construct_from_string([1, 2, 3]) + def test_is_dtype(self): assert PeriodDtype.is_dtype(self.dtype) assert PeriodDtype.is_dtype("period[D]") @@ -685,6 +688,10 @@ def test_caching(self): tm.round_trip_pickle(dtype) assert len(IntervalDtype._cache) == 0 + def test_not_string(self): + # GH30568: though IntervalDtype has object kind, it cannot be string + assert not is_string_dtype(IntervalDtype()) + class TestCategoricalDtypeParametrized: @pytest.mark.parametrize( @@ -907,7 +914,7 @@ def test_registry_find(dtype, expected): (pd.Series([1, 2]), False), (np.array([True, False]), True), (pd.Series([True, False]), True), - (pd.SparseArray([True, False]), True), + (SparseArray([True, False]), True), (SparseDtype(bool), True), ], ) @@ -917,7 +924,7 @@ def test_is_bool_dtype(dtype, expected): def test_is_bool_dtype_sparse(): - result = is_bool_dtype(pd.Series(pd.SparseArray([True, False]))) + result = is_bool_dtype(pd.Series(SparseArray([True, False]))) assert result is True diff --git a/pandas/tests/dtypes/test_generic.py 
b/pandas/tests/dtypes/test_generic.py index c17a8997a9b8f..2c8631ac2d71d 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -5,7 +5,7 @@ from pandas.core.dtypes import generic as gt import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestABCClasses: @@ -17,7 +17,7 @@ class TestABCClasses: categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1]) categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical) df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index) - sparse_array = pd.SparseArray(np.random.randn(10)) + sparse_array = pd.arrays.SparseArray(np.random.randn(10)) datetime_array = pd.core.arrays.DatetimeArray(datetime_index) timedelta_array = pd.core.arrays.TimedeltaArray(timedelta_index) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index f34a6effcc4f5..5eb85de2b90f5 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -4,6 +4,7 @@ """ import collections +from collections import namedtuple from datetime import date, datetime, time, timedelta from decimal import Decimal from fractions import Fraction @@ -51,8 +52,8 @@ Timestamp, isna, ) +import pandas._testing as tm from pandas.core.arrays import IntegerArray -import pandas.util.testing as tm @pytest.fixture(params=[True, False], ids=str) @@ -239,7 +240,7 @@ def __getitem__(self, key): if has_contains: - def __contains__(self, key): + def __contains__(self, key) -> bool: return self.d.__contains__(key) d = DictLike({1: 2}) @@ -1113,28 +1114,28 @@ def test_is_string_array(self): assert lib.is_string_array(np.array(["foo", "bar"])) assert not lib.is_string_array( - np.array(["foo", "bar", np.nan], dtype=object), skipna=False + np.array(["foo", "bar", pd.NA], dtype=object), skipna=False ) assert lib.is_string_array( + np.array(["foo", "bar", pd.NA], dtype=object), skipna=True + ) + # NaN is not valid for string array, just NA + assert not lib.is_string_array( np.array(["foo", "bar", np.nan], dtype=object), skipna=True ) + assert not lib.is_string_array(np.array([1, 2])) def test_to_object_array_tuples(self): r = (5, 6) values = [r] - result = lib.to_object_array_tuples(values) + lib.to_object_array_tuples(values) - try: - # make sure record array works - from collections import namedtuple - - record = namedtuple("record", "x y") - r = record(5, 6) - values = [r] - result = lib.to_object_array_tuples(values) # noqa - except ImportError: - pass + # make sure record array works + record = namedtuple("record", "x y") + r = record(5, 6) + values = [r] + lib.to_object_array_tuples(values) def test_object(self): @@ -1174,8 +1175,6 @@ def test_is_period(self): def test_categorical(self): # GH 8974 - from pandas import Categorical, Series - arr = Categorical(list("abc")) result = lib.infer_dtype(arr, skipna=True) assert result == "categorical" diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 5e7c6e4b48682..7ba59786bb0fa 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -1,6 +1,5 @@ from datetime import datetime from decimal import Decimal -from warnings import catch_warnings, filterwarnings import numpy as np import pytest @@ -23,7 +22,7 @@ import pandas as pd from pandas import DatetimeIndex, Float64Index, NaT, Series, TimedeltaIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm now = pd.Timestamp.now() utcnow = pd.Timestamp.now("UTC") 
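A recurring change in these test hunks is that pd.SparseArray gives way to the public pandas.arrays namespace. The following sketch is added for illustration only (it is not part of the patch and assumes pandas >= 1.0); it shows the spelling the updated tests rely on:

import pandas.api.types as ptypes
from pandas.arrays import SparseArray  # public location used throughout this patch

sa = SparseArray([0, 0, 1, 2])              # integer data, fill_value defaults to 0
print(sa.dtype)                             # Sparse[int64, 0]
print(ptypes.is_extension_array_dtype(sa))  # True, the property test_common.py asserts
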
@@ -295,6 +294,11 @@ def test_array_equivalent(): np.array([np.nan, None], dtype="object"), np.array([np.nan, None], dtype="object"), ) + # Check the handling of nested arrays in array_equivalent_object + assert array_equivalent( + np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"), + np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"), + ) assert array_equivalent( np.array([np.nan, 1 + 1j], dtype="complex"), np.array([np.nan, 1 + 1j], dtype="complex"), @@ -315,23 +319,21 @@ def test_array_equivalent(): assert not array_equivalent( TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan]) ) - with catch_warnings(): - filterwarnings("ignore", "Converting timezone", FutureWarning) - assert array_equivalent( - DatetimeIndex([0, np.nan], tz="US/Eastern"), - DatetimeIndex([0, np.nan], tz="US/Eastern"), - ) - assert not array_equivalent( - DatetimeIndex([0, np.nan], tz="US/Eastern"), - DatetimeIndex([1, np.nan], tz="US/Eastern"), - ) - assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern") - ) - assert not array_equivalent( - DatetimeIndex([0, np.nan], tz="CET"), - DatetimeIndex([0, np.nan], tz="US/Eastern"), - ) + assert array_equivalent( + DatetimeIndex([0, np.nan], tz="US/Eastern"), + DatetimeIndex([0, np.nan], tz="US/Eastern"), + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz="US/Eastern"), + DatetimeIndex([1, np.nan], tz="US/Eastern"), + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern") + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan], tz="CET"), + DatetimeIndex([0, np.nan], tz="US/Eastern"), + ) assert not array_equivalent(DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index e88c63b19003f..94dd09d3eb053 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -2,10 +2,10 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.extension import base -import pandas.util.testing as tm -pytest.importorskip("pyarrow", minversion="0.12.0") +pytest.importorskip("pyarrow", minversion="0.13.0") from .arrays import ArrowBoolArray, ArrowBoolDtype # isort:skip diff --git a/pandas/tests/extension/arrow/test_string.py b/pandas/tests/extension/arrow/test_string.py index baedcf0dd9088..abd5c1f386dc5 100644 --- a/pandas/tests/extension/arrow/test_string.py +++ b/pandas/tests/extension/arrow/test_string.py @@ -2,7 +2,7 @@ import pandas as pd -pytest.importorskip("pyarrow", minversion="0.12.0") +pytest.importorskip("pyarrow", minversion="0.13.0") from .arrays import ArrowStringDtype # isort:skip diff --git a/pandas/tests/extension/base/base.py b/pandas/tests/extension/base/base.py index 2f808d20acd31..144b0825b39a2 100644 --- a/pandas/tests/extension/base/base.py +++ b/pandas/tests/extension/base/base.py @@ -1,4 +1,4 @@ -import pandas.util.testing as tm +import pandas._testing as tm class BaseExtensionTests: diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 7146443bf8de5..58859fc6ac54c 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas as pd from pandas.core.internals import ObjectBlock @@ -21,3 +23,12 @@ def test_astype_str(self, data): result = pd.Series(data[:5]).astype(str) expected = 
pd.Series(data[:5].astype(str)) self.assert_series_equal(result, expected) + + def test_to_numpy(self, data): + expected = np.asarray(data) + + result = data.to_numpy() + self.assert_equal(result, expected) + + result = pd.Series(data).to_numpy() + self.assert_equal(result, expected) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 9a442f346c19f..b6c12b5844086 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -16,8 +16,7 @@ def test_name(self, dtype): def test_kind(self, dtype): valid = set("biufcmMOSUV") - if dtype.kind is not None: - assert dtype.kind in valid + assert dtype.kind in valid def test_construct_from_string_own_name(self, dtype): result = dtype.construct_from_string(dtype.name) @@ -38,6 +37,9 @@ def test_is_dtype_from_self(self, dtype): result = type(dtype).is_dtype(dtype) assert result is True + def test_is_dtype_other_input(self, dtype): + assert dtype.is_dtype([1, 2, 3]) is False + def test_is_not_string_type(self, dtype): return not pd.api.types.is_string_dtype(dtype) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 71c7fbb986267..dc1f62c4c97c5 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -121,6 +121,45 @@ def test_getitem_mask(self, data): assert len(result) == 1 assert result.dtype == data.dtype + def test_getitem_mask_raises(self, data): + mask = np.array([True, False]) + with pytest.raises(IndexError): + data[mask] + + mask = pd.array(mask, dtype="boolean") + with pytest.raises(IndexError): + data[mask] + + def test_getitem_boolean_array_mask(self, data): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + result = data[mask] + assert len(result) == 0 + assert isinstance(result, type(data)) + + result = pd.Series(data)[mask] + assert len(result) == 0 + assert result.dtype == data.dtype + + mask[:5] = True + expected = data.take([0, 1, 2, 3, 4]) + result = data[mask] + self.assert_extension_array_equal(result, expected) + + expected = pd.Series(expected) + result = pd.Series(data)[mask] + self.assert_series_equal(result, expected) + + def test_getitem_boolean_array_mask_raises(self, data): + mask = pd.array(np.zeros(data.shape, dtype="bool"), dtype="boolean") + mask[:2] = pd.NA + with pytest.raises(ValueError): + data[mask] + + s = pd.Series(data) + + with pytest.raises(ValueError): + s[mask] + def test_getitem_slice(self, data): # getitem[slice] should return an array result = data[slice(0)] # empty diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index dc926d2ff6ab4..94d0ef7bbea84 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .base import BaseExtensionTests diff --git a/pandas/tests/extension/base/interface.py b/pandas/tests/extension/base/interface.py index a29f6deeffae6..cdea96334be2a 100644 --- a/pandas/tests/extension/base/interface.py +++ b/pandas/tests/extension/base/interface.py @@ -4,7 +4,7 @@ from pandas.core.dtypes.dtypes import ExtensionDtype import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .base import BaseExtensionTests diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 973088cb72e7a..6b75176ebd35b 100644 --- 
a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -2,8 +2,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.core.sorting import nargsort -import pandas.util.testing as tm from .base import BaseExtensionTests @@ -261,6 +261,11 @@ def test_shift_fill_value(self, data): expected = data.take([2, 3, 0, 0]) self.assert_extension_array_equal(result, expected) + def test_not_hashable(self, data): + # We are in general mutable, so not hashable + with pytest.raises(TypeError, match="unhashable type"): + hash(data) + def test_hash_pandas_object_works(self, data, as_frame): # https://github.com/pandas-dev/pandas/issues/23066 data = pd.Series(data) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index 21bbb365ab0f3..2393d2edcd2c6 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .base import BaseExtensionTests diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 8766bb771f8a2..6f433d659575a 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -3,7 +3,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .base import BaseExtensionTests diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 90e607343297d..ec21898852888 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -94,6 +94,19 @@ def test_concat_columns(self, data, na_value): result = pd.concat([df1["A"], df2["B"]], axis=1) self.assert_frame_equal(result, expected) + def test_concat_extension_arrays_copy_false(self, data, na_value): + # GH 20756 + df1 = pd.DataFrame({"A": data[:3]}) + df2 = pd.DataFrame({"B": data[3:7]}) + expected = pd.DataFrame( + { + "A": data._from_sequence(list(data[:3]) + [na_value], dtype=data.dtype), + "B": data[3:7], + } + ) + result = pd.concat([df1, df2], axis=1, copy=False) + self.assert_frame_equal(result, expected) + def test_align(self, data, na_value): a = data[:3] b = data[2:5] @@ -295,3 +308,19 @@ def test_ravel(self, data): # Check that we have a view, not a copy result[0] = result[1] assert data[0] == data[1] + + def test_transpose(self, data): + df = pd.DataFrame({"A": data[:4], "B": data[:4]}, index=["a", "b", "c", "d"]) + result = df.T + expected = pd.DataFrame( + { + "a": type(data)._from_sequence([data[0]] * 2, dtype=data.dtype), + "b": type(data)._from_sequence([data[1]] * 2, dtype=data.dtype), + "c": type(data)._from_sequence([data[2]] * 2, dtype=data.dtype), + "d": type(data)._from_sequence([data[3]] * 2, dtype=data.dtype), + }, + index=["A", "B"], + ) + self.assert_frame_equal(result, expected) + self.assert_frame_equal(np.transpose(np.transpose(df)), df) + self.assert_frame_equal(np.transpose(np.transpose(df[["A"]])), df[["A"]]) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index bb6bb02b462e2..0bb8aede6298c 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -186,3 +186,12 @@ def test_setitem_scalar_key_sequence_raise(self, data): arr = data[:5].copy() with pytest.raises(ValueError): arr[0] = arr[[0, 1]] + + def test_setitem_preserves_views(self, data): + # GH#28150 setitem 
shouldn't swap the underlying data + view1 = data.view() + view2 = data[:] + + data[0] = data[1] + assert view1[0] == data[1] + assert view2[0] == data[1] diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 74f1e3cfbaf20..85bd5f7a33fe1 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -8,7 +8,7 @@ from pandas.core.dtypes.base import ExtensionDtype import pandas as pd -from pandas.api.extensions import register_extension_dtype +from pandas.api.extensions import no_default, register_extension_dtype from pandas.core.arrays import ExtensionArray, ExtensionScalarOpsMixin @@ -84,6 +84,12 @@ def _from_factorized(cls, values, original): _HANDLED_TYPES = (decimal.Decimal, numbers.Number, np.ndarray) + def to_numpy(self, dtype=None, copy=False, na_value=no_default, decimals=None): + result = np.asarray(self, dtype=dtype) + if decimals is not None: + result = np.asarray([round(x, decimals) for x in result]) + return result + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): # if not all( @@ -109,6 +115,15 @@ def __getitem__(self, item): if isinstance(item, numbers.Integral): return self._data[item] else: + # array, slice. + if pd.api.types.is_list_like(item): + if not pd.api.types.is_array_like(item): + item = pd.array(item) + dtype = item.dtype + if pd.api.types.is_bool_dtype(dtype): + item = pd.api.indexers.check_bool_array_indexer(self, item) + elif pd.api.types.is_integer_dtype(dtype): + item = np.asarray(item, dtype="int") return type(self)(self._data[item]) def take(self, indexer, allow_fill=False, fill_value=None): diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index b5c3abd8ce8f6..de7c98ab96571 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -6,8 +6,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.extension import base -import pandas.util.testing as tm from .array import DecimalArray, DecimalDtype, make_data, to_decimal @@ -499,3 +499,17 @@ def DecimalArray__array__(self, dtype=None): df[s > 0.5] s.at[0] df.at[0, "a"] + + +def test_to_numpy_keyword(): + # test the extra keyword + values = [decimal.Decimal("1.1111"), decimal.Decimal("2.2222")] + expected = np.array( + [decimal.Decimal("1.11"), decimal.Decimal("2.22")], dtype="object" + ) + a = pd.array(values, dtype="decimal") + result = a.to_numpy(decimals=2) + tm.assert_numpy_array_equal(result, expected) + + result = pd.Series(a).to_numpy(decimals=2) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 014581682ac59..17bc2773aad19 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -19,9 +19,8 @@ import numpy as np -from pandas.core.dtypes.base import ExtensionDtype - -from pandas.core.arrays import ExtensionArray +import pandas as pd +from pandas.api.extensions import ExtensionArray, ExtensionDtype class JSONDtype(ExtensionDtype): @@ -76,17 +75,21 @@ def _from_factorized(cls, values, original): def __getitem__(self, item): if isinstance(item, numbers.Integral): return self.data[item] - elif isinstance(item, np.ndarray) and item.dtype == "bool": - return self._from_sequence([x for x, m in zip(self, item) if m]) - elif isinstance(item, abc.Iterable): - # fancy indexing - return type(self)([self.data[i] for i in item]) elif 
isinstance(item, slice) and item == slice(None): # Make sure we get a view return type(self)(self.data) - else: + elif isinstance(item, slice): # slice return type(self)(self.data[item]) + else: + if not pd.api.types.is_array_like(item): + item = pd.array(item) + dtype = item.dtype + if pd.api.types.is_bool_dtype(dtype): + item = pd.api.indexers.check_bool_array_indexer(self, item) + return self._from_sequence([x for x, m in zip(self, item) if m]) + # integer + return type(self)([self.data[i] for i in item]) def __setitem__(self, key, value): if isinstance(key, numbers.Integral): diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 16a4caa7d7ebe..4d3145109e3c2 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -4,8 +4,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.extension import base -import pandas.util.testing as tm from .array import JSONArray, JSONDtype, make_data @@ -163,6 +163,10 @@ def test_unstack(self, data, index): # this matches otherwise return super().test_unstack(data, index) + @pytest.mark.xfail(reason="Inconsistent sizes.") + def test_transpose(self, data): + super().test_transpose(data) + class TestGetitem(BaseJSON, base.BaseGetitemTests): pass diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index a02433da2da12..c489445d8512a 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -19,9 +19,9 @@ from pandas.compat.numpy import _np_version_under1p14 import pandas as pd +import pandas._testing as tm from pandas.core.arrays.boolean import BooleanDtype from pandas.tests.extension import base -import pandas.util.testing as tm def make_data(): @@ -226,6 +226,10 @@ def test_searchsorted(self, data_for_sorting, as_series): sorter = np.array([1, 0]) assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + @pytest.mark.skip(reason="uses nullable integer") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) + class TestCasting(base.BaseCastingTests): pass @@ -323,7 +327,9 @@ def check_reduce(self, s, op_name, skipna): result = getattr(s, op_name)(skipna=skipna) expected = getattr(s.astype("float64"), op_name)(skipna=skipna) # override parent function to cast to bool for min/max - if op_name in ("min", "max") and not pd.isna(expected): + if np.isnan(expected): + expected = pd.NA + elif op_name in ("min", "max"): expected = bool(expected) tm.assert_almost_equal(result, expected) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index dff1e58641ade..336b23e54d74c 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -20,9 +20,9 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, Timestamp +import pandas._testing as tm from pandas.api.types import CategoricalDtype from pandas.tests.extension import base -import pandas.util.testing as tm def make_data(): diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 9b5f9d64f6b67..e43650c291200 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -5,8 +5,8 @@ from pandas.core.dtypes.common import is_extension_array_dtype import pandas as pd +import pandas._testing as tm from pandas.core.arrays import ExtensionArray -import pandas.util.testing as tm class 
DummyDtype(dtypes.ExtensionDtype): diff --git a/pandas/tests/extension/test_integer.py b/pandas/tests/extension/test_integer.py index d051345fdd12d..f55ec75b47dfa 100644 --- a/pandas/tests/extension/test_integer.py +++ b/pandas/tests/extension/test_integer.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.common import is_extension_array_dtype import pandas as pd +import pandas._testing as tm from pandas.core.arrays import integer_array from pandas.core.arrays.integer import ( Int8Dtype, @@ -34,7 +35,7 @@ def make_data(): - return list(range(1, 9)) + [np.nan] + list(range(10, 98)) + [np.nan] + [99, 100] + return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100] @pytest.fixture( @@ -65,7 +66,7 @@ def data_for_twos(dtype): @pytest.fixture def data_missing(dtype): - return integer_array([np.nan, 1], dtype=dtype) + return integer_array([pd.NA, 1], dtype=dtype) @pytest.fixture @@ -75,18 +76,18 @@ def data_for_sorting(dtype): @pytest.fixture def data_missing_for_sorting(dtype): - return integer_array([1, np.nan, 0], dtype=dtype) + return integer_array([1, pd.NA, 0], dtype=dtype) @pytest.fixture def na_cmp(): - # we are np.nan - return lambda x, y: np.isnan(x) and np.isnan(y) + # we are pd.NA + return lambda x, y: x is pd.NA and y is pd.NA @pytest.fixture def na_value(): - return np.nan + return pd.NA @pytest.fixture @@ -94,7 +95,7 @@ def data_for_grouping(dtype): b = 1 a = 0 c = 2 - na = np.nan + na = pd.NA return integer_array([b, b, na, na, a, a, b, c], dtype=dtype) @@ -129,7 +130,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): expected = s.combine(other, op) if op_name in ("__rtruediv__", "__truediv__", "__div__"): - expected = expected.astype(float) + expected = expected.fillna(np.nan).astype(float) if op_name == "__rtruediv__": # TODO reverse operators result in object dtype result = result.astype(float) @@ -142,6 +143,7 @@ def _check_op(self, s, op, other, op_name, exc=NotImplementedError): # combine method result in 'biggest' (int64) dtype expected = expected.astype(s.dtype) pass + if (op_name == "__rpow__") and isinstance(other, pd.Series): # TODO pow on Int arrays gives different result with NA # see https://github.com/pandas-dev/pandas/issues/22022 @@ -162,6 +164,16 @@ def test_error(self, data, all_arithmetic_operators): class TestComparisonOps(base.BaseComparisonOpsTests): + def _check_op(self, s, op, other, op_name, exc=NotImplementedError): + if exc is None: + result = op(s, other) + # Override to do the astype to boolean + expected = s.combine(other, op).astype("boolean") + self.assert_series_equal(result, expected) + else: + with pytest.raises(exc): + op(s, other) + def check_opname(self, s, op_name, other, exc=None): super().check_opname(s, op_name, other, exc=None) @@ -198,7 +210,7 @@ class TestMissing(base.BaseMissingTests): class TestMethods(base.BaseMethodsTests): - @pytest.mark.parametrize("dropna", [True, False]) + @pytest.mark.skip(reason="uses nullable integer") def test_value_counts(self, all_data, dropna): all_data = all_data[:10] if dropna: @@ -222,7 +234,14 @@ class TestGroupby(base.BaseGroupbyTests): class TestNumericReduce(base.BaseNumericReduceTests): - pass + def check_reduce(self, s, op_name, skipna): + # overwrite to ensure pd.NA is tested instead of np.nan + # https://github.com/pandas-dev/pandas/issues/30958 + result = getattr(s, op_name)(skipna=skipna) + expected = getattr(s.astype("float64"), op_name)(skipna=skipna) + if np.isnan(expected): + expected = pd.NA + tm.assert_almost_equal(result, 
expected) class TestBooleanReduce(base.BaseBooleanReduceTests): diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 4fdcf930d224f..2411f6cfbd936 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -147,7 +147,9 @@ class TestReshaping(BaseInterval, base.BaseReshapingTests): class TestSetitem(BaseInterval, base.BaseSetitemTests): - pass + @pytest.mark.xfail(reason="GH#27147 setitem changes underlying index") + def test_setitem_preserves_views(self, data): + super().test_setitem_preserves_views(data) class TestPrinting(BaseInterval, base.BasePrintingTests): diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 221cf0787d839..7db38f41d4573 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -4,8 +4,8 @@ from pandas.compat.numpy import _np_version_under1p16 import pandas as pd +import pandas._testing as tm from pandas.core.arrays.numpy_ import PandasArray, PandasDtype -import pandas.util.testing as tm from . import base @@ -51,7 +51,7 @@ def data_missing(allow_in_pandas, dtype): if dtype.numpy_dtype == "object": if _np_version_under1p16: raise pytest.skip("Skipping for NumPy <1.16") - return PandasArray(np.array([np.nan, (1,)])) + return PandasArray(np.array([np.nan, (1,)], dtype=object)) return PandasArray(np.array([np.nan, 1.0])) @@ -78,7 +78,7 @@ def data_for_sorting(allow_in_pandas, dtype): if dtype.numpy_dtype == "object": # Use an empty tuple for first element, then remove, # to disable np.array's shape inference. - return PandasArray(np.array([(), (2,), (3,), (1,)])[1:]) + return PandasArray(np.array([(), (2,), (3,), (1,)], dtype=object)[1:]) return PandasArray(np.array([1, 2, 0])) @@ -90,7 +90,7 @@ def data_missing_for_sorting(allow_in_pandas, dtype): A < B and NA missing. 
""" if dtype.numpy_dtype == "object": - return PandasArray(np.array([(1,), np.nan, (0,)])) + return PandasArray(np.array([(1,), np.nan, (0,)], dtype=object)) return PandasArray(np.array([1, np.nan, 0])) @@ -106,7 +106,9 @@ def data_for_grouping(allow_in_pandas, dtype): a, b, c = (1,), (2,), (3,) else: a, b, c = np.arange(3) - return PandasArray(np.array([b, b, np.nan, np.nan, a, a, b, c])) + return PandasArray( + np.array([b, b, np.nan, np.nan, a, a, b, c], dtype=dtype.numpy_dtype) + ) @pytest.fixture @@ -330,6 +332,10 @@ def test_merge_on_extension_array_duplicates(self, data): # Fails creating expected super().test_merge_on_extension_array_duplicates(data) + @skip_nested + def test_transpose(self, data): + super().test_transpose(data) + class TestSetitem(BaseNumPyTests, base.BaseSetitemTests): @skip_nested diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 6ebe71e173ec2..198a228b621b4 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -4,9 +4,10 @@ from pandas.errors import PerformanceWarning import pandas as pd -from pandas import SparseArray, SparseDtype +from pandas import SparseDtype +import pandas._testing as tm +from pandas.arrays import SparseArray from pandas.tests.extension import base -import pandas.util.testing as tm def make_data(fill_value): @@ -132,6 +133,10 @@ def test_concat_columns(self, data, na_value): self._check_unsupported(data) super().test_concat_columns(data, na_value) + def test_concat_extension_arrays_copy_false(self, data, na_value): + self._check_unsupported(data) + super().test_concat_extension_arrays_copy_false(data, na_value) + def test_align(self, data, na_value): self._check_unsupported(data) super().test_align(data, na_value) @@ -231,7 +236,7 @@ def test_combine_le(self, data_repeated): s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 <= x2) expected = pd.Series( - pd.SparseArray( + SparseArray( [a <= b for (a, b) in zip(list(orig_data1), list(orig_data2))], fill_value=False, ) @@ -241,7 +246,7 @@ def test_combine_le(self, data_repeated): val = s1.iloc[0] result = s1.combine(val, lambda x1, x2: x1 <= x2) expected = pd.Series( - pd.SparseArray([a <= val for a in list(orig_data1)], fill_value=False) + SparseArray([a <= val for a in list(orig_data1)], fill_value=False) ) self.assert_series_equal(result, expected) @@ -346,7 +351,7 @@ def _compare_other(self, s, data, op_name, other): with np.errstate(all="ignore"): expected = pd.Series( - pd.SparseArray( + SparseArray( op(np.asarray(data), np.asarray(other)), fill_value=result.values.fill_value, ) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 8519c2999ade3..86aed671f1b88 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -81,7 +81,9 @@ class TestNoReduce(base.BaseNoReduceTests): class TestMethods(base.BaseMethodsTests): - pass + @pytest.mark.skip(reason="returns nullable") + def test_value_counts(self, all_data, dropna): + return super().test_value_counts(all_data, dropna) class TestCasting(base.BaseCastingTests): diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py index 915d6edcd8367..774eb443c45fe 100644 --- a/pandas/tests/frame/conftest.py +++ b/pandas/tests/frame/conftest.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, NaT, date_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git 
a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index b595e48797d41..5de38915f04c1 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameIndexingCategorical: diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py index bde35c04acf4f..a1c12be2b0180 100644 --- a/pandas/tests/frame/indexing/test_datetime.py +++ b/pandas/tests/frame/indexing/test_datetime.py @@ -1,6 +1,6 @@ import pandas as pd from pandas import DataFrame, Index, Series, date_range, notna -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameIndexingDatetimeWithTZ: diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index cd384d6fdbfad..cbb9dd09bbede 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -20,9 +20,10 @@ isna, notna, ) +import pandas._testing as tm +from pandas.arrays import SparseArray import pandas.core.common as com from pandas.core.indexing import IndexingError -import pandas.util.testing as tm from pandas.tseries.offsets import BDay @@ -446,8 +447,8 @@ def test_setitem(self, float_frame): tm.assert_series_equal(series, float_frame["col6"], check_names=False) msg = ( - r"\"None of \[Float64Index\(\[.*dtype='float64'\)\] are in the" - r" \[columns\]\"" + r"\"None of \[Float64Index\(\[.*dtype='float64'\)\] are in the " + r"\[columns\]\"" ) with pytest.raises(KeyError, match=msg): float_frame[np.random.randn(len(float_frame) + 1)] = 1 @@ -1038,9 +1039,9 @@ def test_getitem_setitem_float_labels(self): # positional slicing only via iloc! 
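# Note (illustration only, not part of the patch): the message-regex edits in this
# file just move a space between adjacent string literals; Python's implicit string
# concatenation makes the old and new spellings the same pattern. A shortened
# example of the equivalence:
old_msg = (
    "cannot do slice indexing on"
    " with these indexers"
)
new_msg = (
    "cannot do slice indexing on "
    "with these indexers"
)
assert old_msg == new_msg  # only the wrapping of the literal changed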
msg = ( - "cannot do slice indexing on" - r" with" - r" these indexers \[1.0\] of " + "cannot do slice indexing on " + r" with " + r"these indexers \[1.0\] of " ) with pytest.raises(TypeError, match=msg): df.iloc[1.0:5] @@ -1146,18 +1147,18 @@ def test_setitem_mixed_datetime(self): { "a": [0, 0, 0, 0, 13, 14], "b": [ - pd.datetime(2012, 1, 1), + datetime(2012, 1, 1), 1, "x", "y", - pd.datetime(2013, 1, 1), - pd.datetime(2014, 1, 1), + datetime(2013, 1, 1), + datetime(2014, 1, 1), ], } ) df = pd.DataFrame(0, columns=list("ab"), index=range(6)) df["b"] = pd.NaT - df.loc[0, "b"] = pd.datetime(2012, 1, 1) + df.loc[0, "b"] = datetime(2012, 1, 1) df.loc[1, "b"] = 1 df.loc[[2, 3], "b"] = "x", "y" A = np.array( @@ -1776,7 +1777,7 @@ def test_getitem_ix_float_duplicates(self): def test_getitem_sparse_column(self): # https://github.com/pandas-dev/pandas/issues/23559 - data = pd.SparseArray([0, 1]) + data = SparseArray([0, 1]) df = pd.DataFrame({"A": data}) expected = pd.Series(data, name="A") result = df["A"] @@ -1791,7 +1792,7 @@ def test_getitem_sparse_column(self): def test_setitem_with_sparse_value(self): # GH8131 df = pd.DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) - sp_array = pd.SparseArray([0, 0, 1]) + sp_array = SparseArray([0, 0, 1]) df["new_column"] = sp_array tm.assert_series_equal( df["new_column"], pd.Series(sp_array, name="new_column"), check_names=False @@ -1799,9 +1800,9 @@ def test_setitem_with_sparse_value(self): def test_setitem_with_unaligned_sparse_value(self): df = pd.DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]}) - sp_series = pd.Series(pd.SparseArray([0, 0, 1]), index=[2, 1, 0]) + sp_series = pd.Series(SparseArray([0, 0, 1]), index=[2, 1, 0]) df["new_column"] = sp_series - exp = pd.Series(pd.SparseArray([1, 0, 0]), name="new_column") + exp = pd.Series(SparseArray([1, 0, 0]), name="new_column") tm.assert_series_equal(df["new_column"], exp) def test_setitem_with_unaligned_tz_aware_datetime_column(self): @@ -2178,7 +2179,7 @@ def test_type_error_multiindex(self): dg = df.pivot_table(index="i", columns="c", values=["x", "y"]) with pytest.raises(TypeError, match="is an invalid key"): - str(dg[:, 0]) + dg[:, 0] index = Index(range(2), name="i") columns = MultiIndex( diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 4fea190f28d7b..df1b128dcd227 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Series, Timestamp, date_range, isna -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameIndexingWhere: diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py new file mode 100644 index 0000000000000..9fc3629e794e2 --- /dev/null +++ b/pandas/tests/frame/methods/test_append.py @@ -0,0 +1,199 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm + + +class TestDataFrameAppend: + def test_append_empty_list(self): + # GH 28769 + df = DataFrame() + result = df.append([]) + expected = df + tm.assert_frame_equal(result, expected) + assert result is not df + + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + result = df.append([]) + expected = df + tm.assert_frame_equal(result, expected) + assert result is not df # .append() should return a new object + + def test_append_series_dict(self): + df 
= DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + + series = df.loc[4] + msg = "Indexes have overlapping values" + with pytest.raises(ValueError, match=msg): + df.append(series, verify_integrity=True) + + series.name = None + msg = "Can only append a Series if ignore_index=True" + with pytest.raises(TypeError, match=msg): + df.append(series, verify_integrity=True) + + result = df.append(series[::-1], ignore_index=True) + expected = df.append( + DataFrame({0: series[::-1]}, index=df.columns).T, ignore_index=True + ) + tm.assert_frame_equal(result, expected) + + # dict + result = df.append(series.to_dict(), ignore_index=True) + tm.assert_frame_equal(result, expected) + + result = df.append(series[::-1][:3], ignore_index=True) + expected = df.append( + DataFrame({0: series[::-1][:3]}).T, ignore_index=True, sort=True + ) + tm.assert_frame_equal(result, expected.loc[:, result.columns]) + + msg = "Can only append a dict if ignore_index=True" + with pytest.raises(TypeError, match=msg): + df.append(series.to_dict()) + + # can append when name set + row = df.loc[4] + row.name = 5 + result = df.append(row) + expected = df.append(df[-1:], ignore_index=True) + tm.assert_frame_equal(result, expected) + + def test_append_list_of_series_dicts(self): + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + + dicts = [x.to_dict() for idx, x in df.iterrows()] + + result = df.append(dicts, ignore_index=True) + expected = df.append(df, ignore_index=True) + tm.assert_frame_equal(result, expected) + + # different columns + dicts = [ + {"foo": 1, "bar": 2, "baz": 3, "peekaboo": 4}, + {"foo": 5, "bar": 6, "baz": 7, "peekaboo": 8}, + ] + result = df.append(dicts, ignore_index=True, sort=True) + expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) + tm.assert_frame_equal(result, expected) + + def test_append_missing_cols(self): + # GH22252 + # exercise the conditional branch in append method where the data + # to be appended is a list and does not contain all columns that are in + # the target DataFrame + df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) + + dicts = [{"foo": 9}, {"bar": 10}] + with tm.assert_produces_warning(None): + result = df.append(dicts, ignore_index=True, sort=True) + + expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) + tm.assert_frame_equal(result, expected) + + def test_append_empty_dataframe(self): + + # Empty df append empty df + df1 = DataFrame() + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, expected) + + # Non-empty df append empty df + df1 = DataFrame(np.random.randn(5, 2)) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, expected) + + # Empty df with columns append empty df + df1 = DataFrame(columns=["bar", "foo"]) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, expected) + + # Non-Empty df with columns append empty df + df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"]) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, expected) + + def test_append_dtypes(self): + + # GH 5754 + # row appends of different dtypes (so need to do by-item) + # can sometimes infer the correct type + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5)) + df2 = DataFrame() + result = df1.append(df2) + expected = df1.copy() + tm.assert_frame_equal(result, 
expected) + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": "foo"}, index=range(1, 2)) + result = df1.append(df2) + expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]}) + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": np.nan}, index=range(1, 2)) + result = df1.append(df2) + expected = DataFrame( + {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} + ) + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object) + result = df1.append(df2) + expected = DataFrame( + {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} + ) + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": np.nan}, index=range(1)) + df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2)) + result = df1.append(df2) + expected = DataFrame( + {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} + ) + tm.assert_frame_equal(result, expected) + + df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) + df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object) + result = df1.append(df2) + expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"] + ) + def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): + # GH 30238 + tz = tz_naive_fixture + df = pd.DataFrame([pd.Timestamp(timestamp, tz=tz)]) + result = df.append(df.iloc[0]).iloc[-1] + expected = pd.Series(pd.Timestamp(timestamp, tz=tz), name=0) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data, dtype", + [ + ([1], pd.Int64Dtype()), + ([1], pd.CategoricalDtype()), + ([pd.Interval(left=0, right=5)], pd.IntervalDtype()), + ([pd.Period("2000-03", freq="M")], pd.PeriodDtype("M")), + ([1], pd.SparseDtype()), + ], + ) + def test_other_dtypes(self, data, dtype): + df = pd.DataFrame(data, dtype=dtype) + result = df.append(df.iloc[0]).iloc[-1] + expected = pd.Series(data, name=0, dtype=dtype) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_asof.py b/pandas/tests/frame/methods/test_asof.py index 89be3779e5748..0291be0a4083e 100644 --- a/pandas/tests/frame/methods/test_asof.py +++ b/pandas/tests/frame/methods/test_asof.py @@ -1,8 +1,8 @@ import numpy as np import pytest -from pandas import DataFrame, Series, Timestamp, date_range, to_datetime -import pandas.util.testing as tm +from pandas import DataFrame, Period, Series, Timestamp, date_range, to_datetime +import pandas._testing as tm @pytest.fixture @@ -30,6 +30,7 @@ def test_basic(self, date_range_frame): ub = df.index[30] dates = list(dates) + result = df.asof(dates) assert result.notna().all(1).all() @@ -65,6 +66,7 @@ def test_missing(self, date_range_frame): # no match found - `where` value before earliest date in index N = 10 df = date_range_frame.iloc[:N].copy() + result = df.asof("1989-12-31") expected = Series( @@ -78,6 +80,12 @@ def test_missing(self, date_range_frame): ) tm.assert_frame_equal(result, expected) + # Check that we handle PeriodIndex correctly, dont end up with + # period.ordinal for series name + df = df.to_period("D") + result = df.asof("1989-12-31") + assert isinstance(result.name, Period) + def test_all_nans(self, date_range_frame): # GH 15713 # 
DataFrame is all nans @@ -132,5 +140,6 @@ def test_time_zone_aware_index(self, stamp, expected): Timestamp("2018-01-01 22:35:10.550+00:00"), ], ) + result = df.asof(stamp) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index 48444e909ee01..34727da3b95ae 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameClip: diff --git a/pandas/tests/frame/methods/test_count.py b/pandas/tests/frame/methods/test_count.py index b5d3d60579f54..13a93e3efc48c 100644 --- a/pandas/tests/frame/methods/test_count.py +++ b/pandas/tests/frame/methods/test_count.py @@ -1,5 +1,5 @@ from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameCount: diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 04bc87a243a9b..5c13b60aae0d0 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import DataFrame, Series, isna -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameCov: @@ -62,32 +62,15 @@ def test_cov(self, float_frame, float_string_frame): class TestDataFrameCorr: # DataFrame.corr(), as opposed to DataFrame.corrwith - @staticmethod - def _check_method(frame, method="pearson"): - correls = frame.corr(method=method) - expected = frame["A"].corr(frame["C"], method=method) - tm.assert_almost_equal(correls["A"]["C"], expected) - - @td.skip_if_no_scipy - def test_corr_pearson(self, float_frame): - float_frame["A"][:5] = np.nan - float_frame["B"][5:10] = np.nan - - self._check_method(float_frame, "pearson") - + @pytest.mark.parametrize("method", ["pearson", "kendall", "spearman"]) @td.skip_if_no_scipy - def test_corr_kendall(self, float_frame): + def test_corr_scipy_method(self, float_frame, method): float_frame["A"][:5] = np.nan float_frame["B"][5:10] = np.nan - self._check_method(float_frame, "kendall") - - @td.skip_if_no_scipy - def test_corr_spearman(self, float_frame): - float_frame["A"][:5] = np.nan - float_frame["B"][5:10] = np.nan - - self._check_method(float_frame, "spearman") + correls = float_frame.corr(method=method) + expected = float_frame["A"].corr(float_frame["C"], method=method) + tm.assert_almost_equal(correls["A"]["C"], expected) # --------------------------------------------------------------------- diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 09510fc931546..251563e51e15a 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -2,7 +2,7 @@ import pandas as pd from pandas import Categorical, DataFrame, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameDescribe: diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index 9293855e79b1c..43c25f4c05c2d 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameDiff: diff --git 
a/pandas/tests/frame/test_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py similarity index 81% rename from pandas/tests/frame/test_duplicates.py rename to pandas/tests/frame/methods/test_drop_duplicates.py index d2a1fc43d2046..fd4bae26ade57 100644 --- a/pandas/tests/frame/test_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -3,95 +3,20 @@ import numpy as np import pytest -from pandas import DataFrame, Series -import pandas.util.testing as tm +from pandas import DataFrame +import pandas._testing as tm @pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) -def test_duplicated_with_misspelled_column_name(subset): +def test_drop_duplicates_with_misspelled_column_name(subset): # GH 19730 df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) msg = re.escape("Index(['a'], dtype='object')") - with pytest.raises(KeyError, match=msg): - df.duplicated(subset) - with pytest.raises(KeyError, match=msg): df.drop_duplicates(subset) -@pytest.mark.slow -def test_duplicated_do_not_fail_on_wide_dataframes(): - # gh-21524 - # Given the wide dataframe with a lot of columns - # with different (important!) values - data = { - "col_{0:02d}".format(i): np.random.randint(0, 1000, 30000) for i in range(100) - } - df = DataFrame(data).T - result = df.duplicated() - - # Then duplicates produce the bool Series as a result and don't fail during - # calculation. Actual values doesn't matter here, though usually it's all - # False in this case - assert isinstance(result, Series) - assert result.dtype == np.bool - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, False, True])), - ("last", Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])), - ], -) -def test_duplicated_keep(keep, expected): - df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]}) - - result = df.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - -@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal") -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, False, True])), - ("last", Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])), - ], -) -def test_duplicated_nan_none(keep, expected): - df = DataFrame({"C": [np.nan, 3, 3, None, np.nan]}, dtype=object) - - result = df.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("keep", ["first", "last", False]) -@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) -def test_duplicated_subset(subset, keep): - df = DataFrame( - { - "A": [0, 1, 1, 2, 0], - "B": ["a", "b", "b", "c", "a"], - "C": [np.nan, 3, 3, None, np.nan], - } - ) - - if subset is None: - subset = list(df.columns) - elif isinstance(subset, str): - # need to have a DataFrame, not a Series - # -> select columns with singleton list, not string - subset = [subset] - - expected = df[subset].duplicated(keep=keep) - result = df.duplicated(keep=keep, subset=subset) - tm.assert_series_equal(result, expected) - - def test_drop_duplicates(): df = DataFrame( { @@ -188,17 +113,6 @@ def test_drop_duplicates(): assert df.duplicated(keep=keep).sum() == 0 -def test_duplicated_on_empty_frame(): - # GH 25184 - - df = DataFrame(columns=["a", "b"]) - dupes = df.duplicated("a") - - result = df[dupes] - expected = df.copy() - tm.assert_frame_equal(result, expected) - - def test_drop_duplicates_with_duplicate_column_names(): # GH17836 df = 
DataFrame([[1, 2, 5], [3, 4, 6], [3, 4, 7]], columns=["a", "a", "b"]) @@ -477,3 +391,30 @@ def test_drop_duplicates_inplace(): expected = orig2.drop_duplicates(["A", "B"], keep=False) result = df2 tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("inplace", [True, False]) +@pytest.mark.parametrize( + "origin_dict, output_dict, ignore_index, output_index", + [ + ({"A": [2, 2, 3]}, {"A": [2, 3]}, True, [0, 1]), + ({"A": [2, 2, 3]}, {"A": [2, 3]}, False, [0, 2]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, True, [0, 1]), + ({"A": [2, 2, 3], "B": [2, 2, 4]}, {"A": [2, 3], "B": [2, 4]}, False, [0, 2]), + ], +) +def test_drop_duplicates_ignore_index( + inplace, origin_dict, output_dict, ignore_index, output_index +): + # GH 30114 + df = DataFrame(origin_dict) + expected = DataFrame(output_dict, index=output_index) + + if inplace: + result_df = df.copy() + result_df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) + else: + result_df = df.drop_duplicates(ignore_index=ignore_index, inplace=inplace) + + tm.assert_frame_equal(result_df, expected) + tm.assert_frame_equal(df, DataFrame(origin_dict)) diff --git a/pandas/tests/frame/methods/test_duplicated.py b/pandas/tests/frame/methods/test_duplicated.py new file mode 100644 index 0000000000000..72eec8753315c --- /dev/null +++ b/pandas/tests/frame/methods/test_duplicated.py @@ -0,0 +1,100 @@ +import re + +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +@pytest.mark.parametrize("subset", ["a", ["a"], ["a", "B"]]) +def test_duplicated_with_misspelled_column_name(subset): + # GH 19730 + df = DataFrame({"A": [0, 0, 1], "B": [0, 0, 1], "C": [0, 0, 1]}) + msg = re.escape("Index(['a'], dtype='object')") + + with pytest.raises(KeyError, match=msg): + df.duplicated(subset) + + +@pytest.mark.slow +def test_duplicated_do_not_fail_on_wide_dataframes(): + # gh-21524 + # Given the wide dataframe with a lot of columns + # with different (important!) values + data = { + "col_{0:02d}".format(i): np.random.randint(0, 1000, 30000) for i in range(100) + } + df = DataFrame(data).T + result = df.duplicated() + + # Then duplicates produce the bool Series as a result and don't fail during + # calculation. 
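# Note (illustration only, not part of the patch): the drop_duplicates hunk above
# exercises the ignore_index keyword added for GH 30114. A minimal sketch of that
# behaviour, assuming pandas >= 1.0:
import pandas as pd

df = pd.DataFrame({"A": [2, 2, 3]})
print(df.drop_duplicates().index.tolist())                    # [0, 2] -> original labels kept
print(df.drop_duplicates(ignore_index=True).index.tolist())   # [0, 1] -> index reset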
Actual values doesn't matter here, though usually it's all + # False in this case + assert isinstance(result, Series) + assert result.dtype == np.bool + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_keep(keep, expected): + df = DataFrame({"A": [0, 1, 1, 2, 0], "B": ["a", "b", "b", "c", "a"]}) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.xfail(reason="GH#21720; nan/None falsely considered equal") +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_nan_none(keep, expected): + df = DataFrame({"C": [np.nan, 3, 3, None, np.nan]}, dtype=object) + + result = df.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("keep", ["first", "last", False]) +@pytest.mark.parametrize("subset", [None, ["A", "B"], "A"]) +def test_duplicated_subset(subset, keep): + df = DataFrame( + { + "A": [0, 1, 1, 2, 0], + "B": ["a", "b", "b", "c", "a"], + "C": [np.nan, 3, 3, None, np.nan], + } + ) + + if subset is None: + subset = list(df.columns) + elif isinstance(subset, str): + # need to have a DataFrame, not a Series + # -> select columns with singleton list, not string + subset = [subset] + + expected = df[subset].duplicated(keep=keep) + result = df.duplicated(keep=keep, subset=subset) + tm.assert_series_equal(result, expected) + + +def test_duplicated_on_empty_frame(): + # GH 25184 + + df = DataFrame(columns=["a", "b"]) + dupes = df.duplicated("a") + + result = df[dupes] + expected = df.copy() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index 545a4b5f9421e..76c87ed355492 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def test_error(): diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index 5d7dc5c843ec1..0eb94afc99d94 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameIsIn: diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 72299ad6b2bf6..4ce474230b686 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -8,7 +8,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/frame/methods/test_pct_change.py b/pandas/tests/frame/methods/test_pct_change.py new file mode 100644 index 0000000000000..8f3f37fb9fff7 --- /dev/null +++ b/pandas/tests/frame/methods/test_pct_change.py @@ -0,0 +1,96 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +class TestDataFramePctChange: + def test_pct_change_numeric(self): + # GH#11150 + pnl = DataFrame( + [np.arange(0, 40, 10), 
np.arange(0, 40, 10), np.arange(0, 40, 10)] + ).astype(np.float64) + pnl.iat[1, 0] = np.nan + pnl.iat[1, 1] = np.nan + pnl.iat[2, 3] = 60 + + for axis in range(2): + expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1 + result = pnl.pct_change(axis=axis, fill_method="pad") + + tm.assert_frame_equal(result, expected) + + def test_pct_change(self, datetime_frame): + rs = datetime_frame.pct_change(fill_method=None) + tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1) + + rs = datetime_frame.pct_change(2) + filled = datetime_frame.fillna(method="pad") + tm.assert_frame_equal(rs, filled / filled.shift(2) - 1) + + rs = datetime_frame.pct_change(fill_method="bfill", limit=1) + filled = datetime_frame.fillna(method="bfill", limit=1) + tm.assert_frame_equal(rs, filled / filled.shift(1) - 1) + + rs = datetime_frame.pct_change(freq="5D") + filled = datetime_frame.fillna(method="pad") + tm.assert_frame_equal( + rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + ) + + def test_pct_change_shift_over_nas(self): + s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) + + df = DataFrame({"a": s, "b": s}) + + chg = df.pct_change() + expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) + edf = DataFrame({"a": expected, "b": expected}) + tm.assert_frame_equal(chg, edf) + + @pytest.mark.parametrize( + "freq, periods, fill_method, limit", + [ + ("5B", 5, None, None), + ("3B", 3, None, None), + ("3B", 3, "bfill", None), + ("7B", 7, "pad", 1), + ("7B", 7, "bfill", 3), + ("14B", 14, None, None), + ], + ) + def test_pct_change_periods_freq( + self, datetime_frame, freq, periods, fill_method, limit + ): + # GH#7292 + rs_freq = datetime_frame.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + rs_periods = datetime_frame.pct_change( + periods, fill_method=fill_method, limit=limit + ) + tm.assert_frame_equal(rs_freq, rs_periods) + + empty_ts = DataFrame(index=datetime_frame.index, columns=datetime_frame.columns) + rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) + rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) + tm.assert_frame_equal(rs_freq, rs_periods) + + +@pytest.mark.parametrize("fill_method", ["pad", "ffill", None]) +def test_pct_change_with_duplicated_indices(fill_method): + # GH30463 + data = DataFrame( + {0: [np.nan, 1, 2, 3, 9, 18], 1: [0, 1, np.nan, 3, 9, 18]}, index=["a", "b"] * 3 + ) + result = data.pct_change(fill_method=fill_method) + if fill_method is None: + second_column = [np.nan, np.inf, np.nan, np.nan, 2.0, 1.0] + else: + second_column = [np.nan, np.inf, 0.0, 2.0, 2.0, 1.0] + expected = DataFrame( + {0: [np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], 1: second_column}, + index=["a", "b"] * 3, + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index c25b24121d481..64461c08d34f4 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -3,14 +3,14 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameQuantile: def test_quantile_sparse(self): # GH#17198 - s = pd.Series(pd.SparseArray([1, 2])) - s1 = pd.Series(pd.SparseArray([3, 4])) + s = pd.Series(pd.arrays.SparseArray([1, 2])) + s1 = pd.Series(pd.arrays.SparseArray([3, 4])) df = pd.DataFrame({0: s, 1: s1}) result = df.quantile() @@ -103,8 +103,8 @@ def 
test_quantile_axis_parameter(self): with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis=-1) msg = ( - "No axis named column for object type" - " " + "No axis named column for object type " + "" ) with pytest.raises(ValueError, match=msg): df.quantile(0.1, axis="column") diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index be1a423c22aea..bab2db3192b4a 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -3,8 +3,10 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestRank: @@ -26,8 +28,10 @@ def method(self, request): """ return request.param + @td.skip_if_no_scipy def test_rank(self, float_frame): - rankdata = pytest.importorskip("scipy.stats.rankdata") + import scipy.stats # noqa:F401 + from scipy.stats import rankdata float_frame["A"][::2] = np.nan float_frame["B"][::3] = np.nan @@ -109,6 +113,15 @@ def test_rank2(self): exp = DataFrame({"a": [3.5, 1.0, 3.5, 5.0, 6.0, 7.0, 2.0]}) tm.assert_frame_equal(df.rank(), exp) + def test_rank_does_not_mutate(self): + # GH#18521 + # Check rank does not mutate DataFrame + df = DataFrame(np.random.randn(10, 3), dtype="float64") + expected = df.copy() + df.rank() + result = df + tm.assert_frame_equal(result, expected) + def test_rank_mixed_frame(self, float_string_frame): float_string_frame["datetime"] = datetime.now() float_string_frame["timedelta"] = timedelta(days=1, seconds=1) @@ -117,8 +130,10 @@ def test_rank_mixed_frame(self, float_string_frame): expected = float_string_frame.rank(1, numeric_only=True) tm.assert_frame_equal(result, expected) + @td.skip_if_no_scipy def test_rank_na_option(self, float_frame): - rankdata = pytest.importorskip("scipy.stats.rankdata") + import scipy.stats # noqa:F401 + from scipy.stats import rankdata float_frame["A"][::2] = np.nan float_frame["B"][::3] = np.nan @@ -199,9 +214,10 @@ def test_rank_axis(self): tm.assert_frame_equal(df.rank(axis=0), df.rank(axis="index")) tm.assert_frame_equal(df.rank(axis=1), df.rank(axis="columns")) + @td.skip_if_no_scipy def test_rank_methods_frame(self): - pytest.importorskip("scipy.stats.special") - rankdata = pytest.importorskip("scipy.stats.rankdata") + import scipy.stats # noqa:F401 + from scipy.stats import rankdata xs = np.random.randint(0, 21, (100, 26)) xs = (xs - 10.0) / 10.0 diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 3b01ae0c3c2e8..aa91e7a489356 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1,23 +1,23 @@ from datetime import datetime from io import StringIO import re -from typing import Dict +from typing import Dict, List, Union import numpy as np import pytest import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture -def mix_ab() -> Dict[str, list]: +def mix_ab() -> Dict[str, List[Union[int, str]]]: return {"a": list(range(4)), "b": list("ab..")} @pytest.fixture -def mix_abc() -> Dict[str, list]: +def mix_abc() -> Dict[str, List[Union[float, str]]]: return {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} diff --git a/pandas/tests/frame/methods/test_round.py b/pandas/tests/frame/methods/test_round.py index 96ac012ce7892..0865e03cedc50 100644 --- 
a/pandas/tests/frame/methods/test_round.py +++ b/pandas/tests/frame/methods/test_round.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameRound: diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 7fb8fbbc95627..cfb17de892b1c 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, date_range, offsets -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameShift: diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 4f311bbaa8eb9..2c25e1f3740a3 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import CategoricalDtype, DataFrame, IntervalIndex, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameSortIndex: @@ -229,3 +229,92 @@ def test_sort_index_intervalindex(self): ) result = result.columns.levels[1].categories tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ascending, ignore_index, output_index", + [ + ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [2, 3, 1]}, False, False, [5, 3, 2]), + ({"A": [1, 2, 3]}, {"A": [1, 3, 2]}, True, False, [2, 3, 5]), + ], + ) + def test_sort_index_ignore_index( + self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index + ): + # GH 30114 + original_index = [2, 5, 3] + df = DataFrame(original_dict, index=original_index) + expected_df = DataFrame(sorted_dict, index=output_index) + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_df = df.copy() + result_df.sort_index(**kwargs) + else: + result_df = df.sort_index(**kwargs) + + tm.assert_frame_equal(result_df, expected_df) + tm.assert_frame_equal(df, DataFrame(original_dict, index=original_index)) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ascending, ignore_index, output_index", + [ + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [1, 2], "M2": [3, 4]}, + True, + True, + [0, 1], + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [2, 1], "M2": [4, 3]}, + False, + True, + [0, 1], + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [1, 2], "M2": [3, 4]}, + True, + False, + MultiIndex.from_tuples([[2, 1], [3, 4]], names=list("AB")), + ), + ( + {"M1": [1, 2], "M2": [3, 4]}, + {"M1": [2, 1], "M2": [4, 3]}, + False, + False, + MultiIndex.from_tuples([[3, 4], [2, 1]], names=list("AB")), + ), + ], + ) + def test_sort_index_ignore_index_multi_index( + self, inplace, original_dict, sorted_dict, ascending, ignore_index, output_index + ): + # GH 30114, this is to test ignore_index on MulitIndex of index + mi = MultiIndex.from_tuples([[2, 1], [3, 4]], names=list("AB")) + df = DataFrame(original_dict, index=mi) + expected_df = DataFrame(sorted_dict, index=output_index) + + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_df = df.copy() + 
result_df.sort_index(**kwargs) + else: + result_df = df.sort_index(**kwargs) + + tm.assert_frame_equal(result_df, expected_df) + tm.assert_frame_equal(df, DataFrame(original_dict, index=mi)) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 540bed452d9e9..96f4d6ed90d6b 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Categorical, DataFrame, NaT, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameSortValues: @@ -460,3 +460,59 @@ def test_sort_values_na_position_with_categories_raises(self): with pytest.raises(ValueError): df.sort_values(by="c", ascending=False, na_position="bad_position") + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_dict, sorted_dict, ignore_index, output_index", + [ + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, True, [0, 1, 2]), + ({"A": [1, 2, 3]}, {"A": [3, 2, 1]}, False, [2, 1, 0]), + ( + {"A": [1, 2, 3], "B": [2, 3, 4]}, + {"A": [3, 2, 1], "B": [4, 3, 2]}, + True, + [0, 1, 2], + ), + ( + {"A": [1, 2, 3], "B": [2, 3, 4]}, + {"A": [3, 2, 1], "B": [4, 3, 2]}, + False, + [2, 1, 0], + ), + ], + ) + def test_sort_values_ignore_index( + self, inplace, original_dict, sorted_dict, ignore_index, output_index + ): + # GH 30114 + df = DataFrame(original_dict) + expected = DataFrame(sorted_dict, index=output_index) + kwargs = {"ignore_index": ignore_index, "inplace": inplace} + + if inplace: + result_df = df.copy() + result_df.sort_values("A", ascending=False, **kwargs) + else: + result_df = df.sort_values("A", ascending=False, **kwargs) + + tm.assert_frame_equal(result_df, expected) + tm.assert_frame_equal(df, DataFrame(original_dict)) + + def test_sort_values_nat_na_position_default(self): + # GH 13230 + expected = pd.DataFrame( + { + "A": [1, 2, 3, 4, 4], + "date": pd.DatetimeIndex( + [ + "2010-01-01 09:00:00", + "2010-01-01 09:00:01", + "2010-01-01 09:00:02", + "2010-01-01 09:00:03", + "NaT", + ] + ), + } + ) + result = expected.sort_values(["A", "date"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_to_dict.py b/pandas/tests/frame/methods/test_to_dict.py index 556d86bed8f14..7b0adceb57668 100644 --- a/pandas/tests/frame/methods/test_to_dict.py +++ b/pandas/tests/frame/methods/test_to_dict.py @@ -6,7 +6,7 @@ import pytz from pandas import DataFrame, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameToDict: diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index eb69e8b297a6a..d0181f0309af1 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -4,7 +4,7 @@ import pytest from pandas import CategoricalDtype, DataFrame, MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameToRecords: @@ -74,7 +74,7 @@ def test_to_records_with_unicode_index(self): tm.assert_almost_equal(result, expected) def test_to_records_with_unicode_column_names(self): - # xref GH#2407 + # xref issue: https://github.com/numpy/numpy/issues/2407 # Issue GH#11879. 
to_records used to raise an exception when used # with column names containing non-ascii characters in Python 2 result = DataFrame(data={"accented_name_é": [1.0]}).to_records() @@ -235,7 +235,7 @@ def test_to_records_with_categorical(self): # Check that bad types raise ( dict(index=False, column_dtypes={"A": "int32", "B": "foo"}), - (TypeError, 'data type "foo" not understood'), + (TypeError, "data type [\"']foo[\"'] not understood"), ), ], ) @@ -326,7 +326,7 @@ def __init__(self, **kwargs): def __getitem__(self, key): return self.d.__getitem__(key) - def __contains__(self, key): + def __contains__(self, key) -> bool: return key in self.d def keys(self): diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 71843053cf3a8..428b9e5068407 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -1,5 +1,5 @@ import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestTranspose: diff --git a/pandas/tests/frame/methods/test_truncate.py b/pandas/tests/frame/methods/test_truncate.py index a021a99a45a5c..ad86ee1266874 100644 --- a/pandas/tests/frame/methods/test_truncate.py +++ b/pandas/tests/frame/methods/test_truncate.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameTruncate: diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 48b373d9c7901..602ea9ca0471a 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -25,7 +25,7 @@ date_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameAlterAxes: @@ -1312,7 +1312,7 @@ def test_rename_mapper_multi(self): def test_rename_positional_named(self): # https://github.com/pandas-dev/pandas/issues/12392 df = DataFrame({"a": [1, 2], "b": [1, 2]}, index=["X", "Y"]) - result = df.rename(str.lower, columns=str.upper) + result = df.rename(index=str.lower, columns=str.upper) expected = DataFrame({"A": [1, 2], "B": [1, 2]}, index=["x", "y"]) tm.assert_frame_equal(result, expected) @@ -1336,12 +1336,12 @@ def test_rename_axis_style_raises(self): # Multiple targets and axis with pytest.raises(TypeError, match=over_spec_msg): - df.rename(str.lower, str.lower, axis="columns") + df.rename(str.lower, index=str.lower, axis="columns") # Too many targets - over_spec_msg = "Cannot specify all of 'mapper', 'index', 'columns'." 
+ over_spec_msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'" with pytest.raises(TypeError, match=over_spec_msg): - df.rename(str.lower, str.lower, str.lower) + df.rename(str.lower, index=str.lower, columns=str.lower) # Duplicates with pytest.raises(TypeError, match="multiple values"): @@ -1375,16 +1375,42 @@ def test_reindex_api_equivalence(self): for res in [res2, res3]: tm.assert_frame_equal(res1, res) - def test_rename_positional(self): + def test_rename_positional_raises(self): + # GH 29136 df = DataFrame(columns=["A", "B"]) - with tm.assert_produces_warning(FutureWarning) as rec: - result = df.rename(None, str.lower) - expected = DataFrame(columns=["a", "b"]) - tm.assert_frame_equal(result, expected) - assert len(rec) == 1 - message = str(rec[0].message) - assert "rename" in message - assert "Use named arguments" in message + msg = r"rename\(\) takes from 1 to 2 positional arguments" + + with pytest.raises(TypeError, match=msg): + df.rename(None, str.lower) + + def test_rename_no_mappings_raises(self): + # GH 29136 + df = DataFrame([[1]]) + msg = "must pass an index to rename" + with pytest.raises(TypeError, match=msg): + df.rename() + + with pytest.raises(TypeError, match=msg): + df.rename(None, index=None) + + with pytest.raises(TypeError, match=msg): + df.rename(None, columns=None) + + with pytest.raises(TypeError, match=msg): + df.rename(None, columns=None, index=None) + + def test_rename_mapper_and_positional_arguments_raises(self): + # GH 29136 + df = DataFrame([[1]]) + msg = "Cannot specify both 'mapper' and any of 'index' or 'columns'" + with pytest.raises(TypeError, match=msg): + df.rename({}, index={}) + + with pytest.raises(TypeError, match=msg): + df.rename({}, columns={}) + + with pytest.raises(TypeError, match=msg): + df.rename({}, columns={}, index={}) def test_assign_columns(self, float_frame): float_frame["hi"] = "there" @@ -1409,14 +1435,6 @@ def test_set_index_preserve_categorical_dtype(self): result = result.reindex(columns=df.columns) tm.assert_frame_equal(result, df) - def test_ambiguous_warns(self): - df = DataFrame({"A": [1, 2]}) - with tm.assert_produces_warning(FutureWarning): - df.rename(id, id) - - with tm.assert_produces_warning(FutureWarning): - df.rename({0: 10}, {"A": "B"}) - def test_rename_signature(self): sig = inspect.signature(DataFrame.rename) parameters = set(sig.parameters) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 9ddb14470f6e4..25b2997eb088f 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -20,9 +20,9 @@ to_datetime, to_timedelta, ) +import pandas._testing as tm import pandas.core.algorithms as algorithms import pandas.core.nanops as nanops -import pandas.util.testing as tm def assert_stat_op_calc( @@ -823,6 +823,16 @@ def test_sum_bool(self, float_frame): bools.sum(1) bools.sum(0) + def test_sum_mixed_datetime(self): + # GH#30886 + df = pd.DataFrame( + {"A": pd.date_range("2000", periods=4), "B": [1, 2, 3, 4]} + ).reindex([2, 3, 4]) + result = df.sum() + + expected = pd.Series({"B": 7.0}) + tm.assert_series_equal(result, expected) + def test_mean_corner(self, float_frame, float_string_frame): # unit test when have object data the_mean = float_string_frame.mean(axis=0) @@ -893,24 +903,6 @@ def test_sum_bools(self): bools = isna(df) assert bools.sum(axis=1)[0] == 10 - # --------------------------------------------------------------------- - # Miscellanea - - def test_pct_change(self): - # GH#11150 - pnl = DataFrame( - 
[np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(0, 40, 10)] - ).astype(np.float64) - pnl.iat[1, 0] = np.nan - pnl.iat[1, 1] = np.nan - pnl.iat[2, 3] = 60 - - for axis in range(2): - expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(axis=axis) - 1 - result = pnl.pct_change(axis=axis, fill_method="pad") - - tm.assert_frame_equal(result, expected) - # ---------------------------------------------------------------------- # Index of max / min @@ -1266,15 +1258,6 @@ def test_matmul(self): # --------------------------------------------------------------------- # Unsorted - def test_series_nat_conversion(self): - # GH 18521 - # Check rank does not mutate DataFrame - df = DataFrame(np.random.randn(10, 3), dtype="float64") - expected = df.copy() - df.rank() - result = df - tm.assert_frame_equal(result, expected) - def test_series_broadcasting(self): # smoke test for numpy warnings # GH 16378, GH 16306 diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 91fb71c9de7a4..9de5d6fe16a0d 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -5,9 +5,12 @@ import numpy as np import pytest +from pandas.compat import PY37 +from pandas.util._test_decorators import async_mark + import pandas as pd from pandas import Categorical, DataFrame, Series, compat, date_range, timedelta_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameMisc: @@ -261,8 +264,27 @@ def test_itertuples(self, float_frame): df3 = DataFrame({"f" + str(i): [i] for i in range(1024)}) # will raise SyntaxError if trying to create namedtuple tup3 = next(df3.itertuples()) - assert not hasattr(tup3, "_fields") assert isinstance(tup3, tuple) + if PY37: + assert hasattr(tup3, "_fields") + else: + assert not hasattr(tup3, "_fields") + + # GH 28282 + df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}]) + result_254_columns = next(df_254_columns.itertuples(index=False)) + assert isinstance(result_254_columns, tuple) + assert hasattr(result_254_columns, "_fields") + + df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}]) + result_255_columns = next(df_255_columns.itertuples(index=False)) + assert isinstance(result_255_columns, tuple) + + # Dataframes with >=255 columns will fallback to regular tuples on python < 3.7 + if PY37: + assert hasattr(result_255_columns, "_fields") + else: + assert not hasattr(result_255_columns, "_fields") def test_sequence_like_with_categorical(self): @@ -360,8 +382,8 @@ def test_swapaxes(self): tm.assert_frame_equal(df.T, df.swapaxes(1, 0)) tm.assert_frame_equal(df, df.swapaxes(0, 0)) msg = ( - "No axis named 2 for object type" - r" " + "No axis named 2 for object type " + r"" ) with pytest.raises(ValueError, match=msg): df.swapaxes(2, 5) @@ -518,13 +540,22 @@ def _check_f(base, f): f = lambda x: x.rename({1: "foo"}, inplace=True) _check_f(d.copy(), f) - def test_tab_complete_warning(self, ip): + @async_mark() + async def test_tab_complete_warning(self, ip): # GH 16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; df = pd.DataFrame()" - ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("df.", 1)) + + def test_attrs(self): + df = pd.DataFrame({"A": [2, 3]}) + assert df.attrs == {} + df.attrs["version"] = 1 + + result = df.rename(columns=str) + assert result.attrs == {"version": 1} diff 
--git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index eb98bdc49f976..e98f74e133ea9 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -11,10 +11,10 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, notna +import pandas._testing as tm from pandas.conftest import _get_cython_table_params from pandas.core.apply import frame_apply from pandas.core.base import SpecificationError -import pandas.util.testing as tm @pytest.fixture @@ -691,6 +691,18 @@ def test_apply_dup_names_multi_agg(self): tm.assert_frame_equal(result, expected) + def test_apply_nested_result_axis_1(self): + # GH 13820 + def apply_list(row): + return [2 * row["A"], 2 * row["C"], 2 * row["B"]] + + df = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCD")) + result = df.apply(apply_list, axis=1) + expected = Series( + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] + ) + tm.assert_series_equal(result, expected) + class TestInferOutputShape: # the user has supplied an opaque UDF where @@ -1331,8 +1343,8 @@ def test_agg_cython_table(self, df, func, expected, axis): _get_cython_table_params( DataFrame([[np.nan, 1], [1, 2]]), [ - ("cumprod", DataFrame([[np.nan, 1], [1.0, 2.0]])), - ("cumsum", DataFrame([[np.nan, 1], [1.0, 3.0]])), + ("cumprod", DataFrame([[np.nan, 1], [1, 2]])), + ("cumsum", DataFrame([[np.nan, 1], [1, 3]])), ], ), ), @@ -1341,6 +1353,10 @@ def test_agg_cython_table_transform(self, df, func, expected, axis): # GH 21224 # test transforming functions in # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum) + if axis == "columns" or axis == 1: + # operating blockwise doesn't let us preserve dtypes + expected = expected.astype("float64") + result = df.agg(func, axis=axis) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 5ecbe21d113b5..659b55756c4b6 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -6,8 +6,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int -import pandas.util.testing as tm # ------------------------------------------------------------------- # Comparisons @@ -726,3 +726,14 @@ def test_zero_len_frame_with_series_corner_cases(): result = df + ser expected = df tm.assert_frame_equal(result, expected) + + +def test_frame_single_columns_object_sum_axis_1(): + # GH 13758 + data = { + "One": pd.Series(["A", 1.2, np.nan]), + } + df = pd.DataFrame(data) + result = df.sum(axis=1) + expected = pd.Series(["A", 1.2, 0]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index d6ef3a7600abb..7effa98fd8213 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -8,7 +8,7 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, MultiIndex, Series, date_range, isna -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameSelectReindex: diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index eb8febb10a646..d301ed969789e 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -15,10 +15,10 @@ date_range, option_context, ) +import pandas._testing as tm from 
pandas.core.arrays import IntervalArray, integer_array from pandas.core.internals import ObjectBlock from pandas.core.internals.blocks import IntBlock -import pandas.util.testing as tm # Segregated collection of methods that require the BlockManager internal data # structure diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index ebc4438366001..9bad54b051d6c 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameConcatCommon: @@ -128,177 +128,6 @@ def test_concat_tuple_keys(self): ) tm.assert_frame_equal(results, expected) - def test_append_empty_list(self): - # GH 28769 - df = DataFrame() - result = df.append([]) - expected = df - tm.assert_frame_equal(result, expected) - assert result is not df - - df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) - result = df.append([]) - expected = df - tm.assert_frame_equal(result, expected) - assert result is not df # .append() should return a new object - - def test_append_series_dict(self): - df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) - - series = df.loc[4] - msg = "Indexes have overlapping values" - with pytest.raises(ValueError, match=msg): - df.append(series, verify_integrity=True) - - series.name = None - msg = "Can only append a Series if ignore_index=True" - with pytest.raises(TypeError, match=msg): - df.append(series, verify_integrity=True) - - result = df.append(series[::-1], ignore_index=True) - expected = df.append( - DataFrame({0: series[::-1]}, index=df.columns).T, ignore_index=True - ) - tm.assert_frame_equal(result, expected) - - # dict - result = df.append(series.to_dict(), ignore_index=True) - tm.assert_frame_equal(result, expected) - - result = df.append(series[::-1][:3], ignore_index=True) - expected = df.append( - DataFrame({0: series[::-1][:3]}).T, ignore_index=True, sort=True - ) - tm.assert_frame_equal(result, expected.loc[:, result.columns]) - - # can append when name set - row = df.loc[4] - row.name = 5 - result = df.append(row) - expected = df.append(df[-1:], ignore_index=True) - tm.assert_frame_equal(result, expected) - - def test_append_list_of_series_dicts(self): - df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) - - dicts = [x.to_dict() for idx, x in df.iterrows()] - - result = df.append(dicts, ignore_index=True) - expected = df.append(df, ignore_index=True) - tm.assert_frame_equal(result, expected) - - # different columns - dicts = [ - {"foo": 1, "bar": 2, "baz": 3, "peekaboo": 4}, - {"foo": 5, "bar": 6, "baz": 7, "peekaboo": 8}, - ] - result = df.append(dicts, ignore_index=True, sort=True) - expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) - tm.assert_frame_equal(result, expected) - - def test_append_missing_cols(self): - # GH22252 - # exercise the conditional branch in append method where the data - # to be appended is a list and does not contain all columns that are in - # the target DataFrame - df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) - - dicts = [{"foo": 9}, {"bar": 10}] - with tm.assert_produces_warning(None): - result = df.append(dicts, ignore_index=True, sort=True) - - expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) - tm.assert_frame_equal(result, expected) - - def 
test_append_empty_dataframe(self): - - # Empty df append empty df - df1 = DataFrame() - df2 = DataFrame() - result = df1.append(df2) - expected = df1.copy() - tm.assert_frame_equal(result, expected) - - # Non-empty df append empty df - df1 = DataFrame(np.random.randn(5, 2)) - df2 = DataFrame() - result = df1.append(df2) - expected = df1.copy() - tm.assert_frame_equal(result, expected) - - # Empty df with columns append empty df - df1 = DataFrame(columns=["bar", "foo"]) - df2 = DataFrame() - result = df1.append(df2) - expected = df1.copy() - tm.assert_frame_equal(result, expected) - - # Non-Empty df with columns append empty df - df1 = DataFrame(np.random.randn(5, 2), columns=["bar", "foo"]) - df2 = DataFrame() - result = df1.append(df2) - expected = df1.copy() - tm.assert_frame_equal(result, expected) - - def test_append_dtypes(self): - - # GH 5754 - # row appends of different dtypes (so need to do by-item) - # can sometimes infer the correct type - - df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(5)) - df2 = DataFrame() - result = df1.append(df2) - expected = df1.copy() - tm.assert_frame_equal(result, expected) - - df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) - df2 = DataFrame({"bar": "foo"}, index=range(1, 2)) - result = df1.append(df2) - expected = DataFrame({"bar": [Timestamp("20130101"), "foo"]}) - tm.assert_frame_equal(result, expected) - - df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) - df2 = DataFrame({"bar": np.nan}, index=range(1, 2)) - result = df1.append(df2) - expected = DataFrame( - {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} - ) - tm.assert_frame_equal(result, expected) - - df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) - df2 = DataFrame({"bar": np.nan}, index=range(1, 2), dtype=object) - result = df1.append(df2) - expected = DataFrame( - {"bar": Series([Timestamp("20130101"), np.nan], dtype="M8[ns]")} - ) - tm.assert_frame_equal(result, expected) - - df1 = DataFrame({"bar": np.nan}, index=range(1)) - df2 = DataFrame({"bar": Timestamp("20130101")}, index=range(1, 2)) - result = df1.append(df2) - expected = DataFrame( - {"bar": Series([np.nan, Timestamp("20130101")], dtype="M8[ns]")} - ) - tm.assert_frame_equal(result, expected) - - df1 = DataFrame({"bar": Timestamp("20130101")}, index=range(1)) - df2 = DataFrame({"bar": 1}, index=range(1, 2), dtype=object) - result = df1.append(df2) - expected = DataFrame({"bar": Series([Timestamp("20130101"), 1])}) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize( - "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"] - ) - def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): - # GH 30238 - tz = tz_naive_fixture - df = pd.DataFrame([pd.Timestamp(timestamp, tz=tz)]) - result = df.append(df.iloc[0]).iloc[-1] - expected = pd.Series(pd.Timestamp(timestamp, tz=tz), name=0) - tm.assert_series_equal(result, expected) - def test_update(self): df = DataFrame( [[1.5, np.nan, 3.0], [1.5, np.nan, 3.0], [1.5, np.nan, 3], [1.5, np.nan, 3]] diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3e5027ee54cb3..a861e0eb52391 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1,5 +1,5 @@ from collections import OrderedDict, abc -from datetime import datetime, timedelta +from datetime import date, datetime, timedelta import functools import itertools @@ -25,9 +25,9 @@ date_range, isna, ) -from pandas.arrays import 
IntervalArray, PeriodArray +import pandas._testing as tm +from pandas.arrays import IntervalArray, PeriodArray, SparseArray from pandas.core.construction import create_series_with_explicit_dtype -import pandas.util.testing as tm MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"] MIXED_INT_DTYPES = [ @@ -479,11 +479,11 @@ def test_constructor_error_msgs(self): DataFrame(np.zeros((3, 3, 3)), columns=["A", "B", "C"], index=[1]) # wrong size axis labels - msg = "Shape of passed values " r"is \(2, 3\), indices " r"imply \(1, 3\)" + msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" with pytest.raises(ValueError, match=msg): DataFrame(np.random.rand(2, 3), columns=["A", "B", "C"], index=[1]) - msg = "Shape of passed values " r"is \(2, 3\), indices " r"imply \(2, 2\)" + msg = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)" with pytest.raises(ValueError, match=msg): DataFrame(np.random.rand(2, 3), columns=["A", "B"], index=[1, 2]) @@ -511,17 +511,17 @@ def test_constructor_with_embedded_frames(self): result = df2.loc[1, 0] tm.assert_frame_equal(result, df1 + 10) - def test_constructor_subclass_dict(self, float_frame): + def test_constructor_subclass_dict(self, float_frame, dict_subclass): # Test for passing dict subclass to constructor data = { - "col1": tm.TestSubDict((x, 10.0 * x) for x in range(10)), - "col2": tm.TestSubDict((x, 20.0 * x) for x in range(10)), + "col1": dict_subclass((x, 10.0 * x) for x in range(10)), + "col2": dict_subclass((x, 20.0 * x) for x in range(10)), } df = DataFrame(data) refdf = DataFrame({col: dict(val.items()) for col, val in data.items()}) tm.assert_frame_equal(refdf, df) - data = tm.TestSubDict(data.items()) + data = dict_subclass(data.items()) df = DataFrame(data) tm.assert_frame_equal(refdf, df) @@ -1854,9 +1854,9 @@ def check(df): # No NaN found -> error if len(indexer) == 0: msg = ( - "cannot do label indexing on" - r" " - r" with these indexers \[nan\] of " + "cannot do label indexing on " + r" " + r"with these indexers \[nan\] of " ) with pytest.raises(TypeError, match=msg): df.loc[:, np.nan] @@ -2414,7 +2414,7 @@ class List(list): "extension_arr", [ Categorical(list("aabbc")), - pd.SparseArray([1, np.nan, np.nan, np.nan]), + SparseArray([1, np.nan, np.nan, np.nan]), IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]), PeriodArray(pd.period_range(start="1/1/2017", end="1/1/2018", freq="M")), ], @@ -2425,6 +2425,14 @@ def test_constructor_with_extension_array(self, extension_arr): result = DataFrame(extension_arr) tm.assert_frame_equal(result, expected) + def test_datetime_date_tuple_columns_from_dict(self): + # GH 10863 + v = date.today() + tup = v, v + result = DataFrame({tup: Series(range(3), index=range(3))}, columns=[tup]) + expected = DataFrame([0, 1, 2], columns=pd.Index(pd.Series([tup]))) + tm.assert_frame_equal(result, expected) + class TestDataFrameConstructorWithDatetimeTZ: def test_from_dict(self): @@ -2551,3 +2559,11 @@ def test_from_tzaware_mixed_object_array(self): "datetime64[ns, CET]", ] assert (res.dtypes == expected_dtypes).all() + + def test_from_2d_ndarray_with_dtype(self): + # GH#12513 + array_dim2 = np.arange(10).reshape((5, 2)) + df = pd.DataFrame(array_dim2, dtype="datetime64[ns, UTC]") + + expected = pd.DataFrame(array_dim2).astype("datetime64[ns, UTC]") + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/test_cumulative.py b/pandas/tests/frame/test_cumulative.py index ad2cbff888b2e..b545d6aa8afd3 100644 --- a/pandas/tests/frame/test_cumulative.py +++ 
b/pandas/tests/frame/test_cumulative.py @@ -9,7 +9,7 @@ import numpy as np from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameCumulativeOps: @@ -118,3 +118,18 @@ def test_cummax(self, datetime_frame): # fix issue cummax_xs = datetime_frame.cummax(axis=1) assert np.shape(cummax_xs) == np.shape(datetime_frame) + + def test_cumulative_ops_preserve_dtypes(self): + # GH#19296 dont incorrectly upcast to object + df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3.0], "C": [True, False, False]}) + + result = df.cumsum() + + expected = DataFrame( + { + "A": Series([1, 3, 6], dtype=np.int64), + "B": Series([1, 3, 6], dtype=np.float64), + "C": df["C"].cumsum(), + } + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index d8d56e90a2f31..0d34f61ef1e5a 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -18,8 +18,8 @@ date_range, option_context, ) +import pandas._testing as tm from pandas.core.arrays import integer_array -import pandas.util.testing as tm def _check_cast(df, v): @@ -897,15 +897,15 @@ def test_astype_to_incorrect_datetimelike(self, unit): df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) msg = ( - r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" - r" \[timedelta64\[{}\]\]" + r"cannot astype a datetimelike from \[datetime64\[ns\]\] to " + r"\[timedelta64\[{}\]\]" ).format(unit) with pytest.raises(TypeError, match=msg): df.astype(other) msg = ( - r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" - r" \[datetime64\[{}\]\]" + r"cannot astype a timedelta from \[timedelta64\[ns\]\] to " + r"\[datetime64\[{}\]\]" ).format(unit) df = DataFrame(np.array([[1, 2, 3]], dtype=other)) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/frame/test_join.py b/pandas/tests/frame/test_join.py index a0cbc1456afa4..c6e28f3c64f12 100644 --- a/pandas/tests/frame/test_join.py +++ b/pandas/tests/frame/test_join.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, Index, period_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 0b77c0067e5f2..ae0516dd29a1f 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -8,17 +8,8 @@ import pandas as pd from pandas import Categorical, DataFrame, Series, Timestamp, date_range +import pandas._testing as tm from pandas.tests.frame.common import _check_mixed_float -import pandas.util.testing as tm - - -def _skip_if_no_pchip(): - try: - from scipy.interpolate import pchip_interpolate # noqa - except ImportError: - import pytest - - pytest.skip("scipy.interpolate.pchip missing") class TestDataFrameMissingData: @@ -671,7 +662,7 @@ def test_fillna_invalid_method(self, float_frame): def test_fillna_invalid_value(self, float_frame): # list - msg = '"value" parameter must be a scalar or dict, but you passed' ' a "{}"' + msg = '"value" parameter must be a scalar or dict, but you passed a "{}"' with pytest.raises(TypeError, match=msg.format("list")): float_frame.fillna([1, 2]) # tuple @@ -679,8 +670,8 @@ def test_fillna_invalid_value(self, float_frame): float_frame.fillna((1, 2)) # frame with series msg = ( - '"value" parameter must be a scalar, dict or Series, but you' - ' passed a "DataFrame"' + '"value" parameter must be a scalar, dict or Series, but you ' + 'passed a "DataFrame"' ) with 
pytest.raises(TypeError, match=msg): float_frame.iloc[:, 0].fillna(float_frame) @@ -837,8 +828,6 @@ def test_interp_alt_scipy(self): expectedk["A"] = expected["A"] tm.assert_frame_equal(result, expectedk) - _skip_if_no_pchip() - result = df.interpolate(method="pchip") expected.loc[2, "A"] = 3 expected.loc[5, "A"] = 6.0 @@ -981,3 +970,16 @@ def test_interp_ignore_all_good(self): # all good result = df[["B", "D"]].interpolate(downcast=None) tm.assert_frame_equal(result, df[["B", "D"]]) + + @pytest.mark.parametrize("axis", [0, 1]) + def test_interp_time_inplace_axis(self, axis): + # GH 9687 + periods = 5 + idx = pd.date_range(start="2014-01-01", periods=periods) + data = np.random.rand(periods, periods) + data[data < 0.5] = np.nan + expected = pd.DataFrame(index=idx, columns=idx, data=data) + + result = expected.interpolate(axis=0, method="time") + expected.interpolate(axis=0, method="time", inplace=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_mutate_columns.py b/pandas/tests/frame/test_mutate_columns.py index 8c0dd67af4e7d..8bc2aa214e035 100644 --- a/pandas/tests/frame/test_mutate_columns.py +++ b/pandas/tests/frame/test_mutate_columns.py @@ -4,7 +4,7 @@ import pytest from pandas import DataFrame, Index, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm # Column add, remove, delete. diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py index 8fed695a483f5..32ead406a3e86 100644 --- a/pandas/tests/frame/test_nonunique_indexes.py +++ b/pandas/tests/frame/test_nonunique_indexes.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameNonuniqueIndexes: diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index a4f1c0688b144..c727cb398d53e 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -6,9 +6,9 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm import pandas.core.common as com from pandas.tests.frame.common import _check_mixed_float -import pandas.util.testing as tm class TestDataFrameUnaryOperators: diff --git a/pandas/tests/frame/test_period.py b/pandas/tests/frame/test_period.py index a545db3365e36..a6b2b334d3ec8 100644 --- a/pandas/tests/frame/test_period.py +++ b/pandas/tests/frame/test_period.py @@ -14,7 +14,7 @@ period_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm def _permute(obj): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index d577ff7c71277..703e05998e93c 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -8,8 +8,8 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, date_range +import pandas._testing as tm from pandas.core.computation.check import _NUMEXPR_INSTALLED -import pandas.util.testing as tm PARSERS = "python", "pandas" ENGINES = "python", pytest.param("numexpr", marks=td.skip_if_no_ne) @@ -1048,13 +1048,35 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op): class TestDataFrameQueryBacktickQuoting: @pytest.fixture(scope="class") def df(self): + """ + Yields a dataframe with strings that may or may not need escaping + by backticks. The last two columns cannot be escaped by backticks + and should raise a ValueError. 
+ """ yield DataFrame( { "A": [1, 2, 3], "B B": [3, 2, 1], "C C": [4, 5, 6], + "C C": [7, 4, 3], "C_C": [8, 9, 10], "D_D D": [11, 1, 101], + "E.E": [6, 3, 5], + "F-F": [8, 1, 10], + "1e1": [2, 4, 8], + "def": [10, 11, 2], + "A (x)": [4, 1, 3], + "B(x)": [1, 1, 5], + "B (x)": [2, 7, 4], + " &^ :!€$?(} > <++*'' ": [2, 5, 6], + "": [10, 11, 1], + " A": [4, 7, 9], + " ": [1, 2, 1], + "it's": [6, 3, 1], + "that's": [9, 1, 8], + "☺": [8, 7, 6], + "foo#bar": [2, 4, 5], + 1: [5, 7, 9], } ) @@ -1093,7 +1115,64 @@ def test_mixed_underscores_and_spaces(self, df): expect = df["A"] + df["D_D D"] tm.assert_series_equal(res, expect) - def backtick_quote_name_with_no_spaces(self, df): + def test_backtick_quote_name_with_no_spaces(self, df): res = df.eval("A + `C_C`") expect = df["A"] + df["C_C"] tm.assert_series_equal(res, expect) + + def test_special_characters(self, df): + res = df.eval("`E.E` + `F-F` - A") + expect = df["E.E"] + df["F-F"] - df["A"] + tm.assert_series_equal(res, expect) + + def test_start_with_digit(self, df): + res = df.eval("A + `1e1`") + expect = df["A"] + df["1e1"] + tm.assert_series_equal(res, expect) + + def test_keyword(self, df): + res = df.eval("A + `def`") + expect = df["A"] + df["def"] + tm.assert_series_equal(res, expect) + + def test_unneeded_quoting(self, df): + res = df.query("`A` > 2") + expect = df[df["A"] > 2] + tm.assert_frame_equal(res, expect) + + def test_parenthesis(self, df): + res = df.query("`A (x)` > 2") + expect = df[df["A (x)"] > 2] + tm.assert_frame_equal(res, expect) + + def test_empty_string(self, df): + res = df.query("`` > 5") + expect = df[df[""] > 5] + tm.assert_frame_equal(res, expect) + + def test_multiple_spaces(self, df): + res = df.query("`C C` > 5") + expect = df[df["C C"] > 5] + tm.assert_frame_equal(res, expect) + + def test_start_with_spaces(self, df): + res = df.eval("` A` + ` `") + expect = df[" A"] + df[" "] + tm.assert_series_equal(res, expect) + + def test_lots_of_operators_string(self, df): + res = df.query("` &^ :!€$?(} > <++*'' ` > 4") + expect = df[df[" &^ :!€$?(} > <++*'' "] > 4] + tm.assert_frame_equal(res, expect) + + def test_failing_quote(self, df): + with pytest.raises(SyntaxError): + df.query("`it's` > `that's`") + + def test_failing_character_outside_range(self, df): + with pytest.raises(SyntaxError): + df.query("`☺` > 4") + + def test_failing_hashtag(self, df): + with pytest.raises(SyntaxError): + df.query("`foo#bar` > 4") diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 318b1c6add91e..49e6fe4940e18 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -3,6 +3,7 @@ import re import sys import textwrap +import warnings import numpy as np import pytest @@ -18,7 +19,7 @@ option_context, period_range, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.formats.format as fmt @@ -29,17 +30,17 @@ class TestDataFrameReprInfoEtc: def test_repr_empty(self): # empty - foo = repr(DataFrame()) # noqa + repr(DataFrame()) # empty with index frame = DataFrame(index=np.arange(1000)) - foo = repr(frame) # noqa + repr(frame) def test_repr_mixed(self, float_string_frame): buf = StringIO() # mixed - foo = repr(float_string_frame) # noqa + repr(float_string_frame) float_string_frame.info(verbose=False, buf=buf) @pytest.mark.slow @@ -51,13 +52,13 @@ def test_repr_mixed_big(self): biggie.loc[:20, "A"] = np.nan biggie.loc[:20, "B"] = np.nan - foo = repr(biggie) # noqa + repr(biggie) def test_repr(self, float_frame): buf = StringIO() 
# small one - foo = repr(float_frame) + repr(float_frame) float_frame.info(verbose=False, buf=buf) # even smaller @@ -68,7 +69,7 @@ def test_repr(self, float_frame): # columns but no index no_index = DataFrame(columns=[0, 1, 3]) - foo = repr(no_index) # noqa + repr(no_index) # no columns or index DataFrame().info(buf=buf) @@ -97,7 +98,6 @@ def test_repr_big(self): def test_repr_unsortable(self, float_frame): # columns are not sortable - import warnings warn_filters = warnings.filters warnings.filterwarnings("ignore", category=FutureWarning, module=".*format") @@ -164,13 +164,13 @@ def test_repr_column_name_unicode_truncation_bug(self): "Id": [7117434], "StringCol": ( "Is it possible to modify drop plot code" - " so that the output graph is displayed " + "so that the output graph is displayed " "in iphone simulator, Is it possible to " "modify drop plot code so that the " "output graph is \xe2\x80\xa8displayed " "in iphone simulator.Now we are adding " - "the CSV file externally. I want to Call" - " the File through the code.." + "the CSV file externally. I want to Call " + "the File through the code.." ), } ) @@ -205,6 +205,28 @@ def test_info(self, float_frame, datetime_frame): frame.info() frame.info(verbose=False) + def test_info_verbose(self): + buf = StringIO() + size = 1001 + start = 5 + frame = DataFrame(np.random.randn(3, size)) + frame.info(verbose=True, buf=buf) + + res = buf.getvalue() + header = " # Column Dtype \n--- ------ ----- " + assert header in res + + frame.info(verbose=True, buf=buf) + buf.seek(0) + lines = buf.readlines() + assert len(lines) > 0 + + for i, line in enumerate(lines): + if i >= start and i < start + size: + index = i - start + line_nr = " {} ".format(index) + assert line.startswith(line_nr) + def test_info_memory(self): # https://github.com/pandas-dev/pandas/issues/21056 df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")}) @@ -218,7 +240,9 @@ def test_info_memory(self): RangeIndex: 2 entries, 0 to 1 Data columns (total 1 columns): - a 2 non-null int64 + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 a 2 non-null int64 dtypes: int64(1) memory usage: {} bytes """.format( @@ -262,8 +286,8 @@ def test_info_duplicate_columns_shows_correct_dtypes(self): frame.info(buf=io) io.seek(0) lines = io.readlines() - assert "a 1 non-null int64\n" == lines[3] - assert "a 1 non-null float64\n" == lines[4] + assert " 0 a 1 non-null int64 \n" == lines[5] + assert " 1 a 1 non-null float64\n" == lines[6] def test_info_shows_column_dtypes(self): dtypes = [ @@ -283,13 +307,20 @@ def test_info_shows_column_dtypes(self): buf = StringIO() df.info(buf=buf) res = buf.getvalue() + header = ( + " # Column Non-Null Count Dtype \n" + "--- ------ -------------- ----- " + ) + assert header in res for i, dtype in enumerate(dtypes): - name = "{i:d} {n:d} non-null {dtype}".format(i=i, n=n, dtype=dtype) + name = " {i:d} {i:d} {n:d} non-null {dtype}".format( + i=i, n=n, dtype=dtype + ) assert name in res def test_info_max_cols(self): df = DataFrame(np.random.randn(10, 5)) - for len_, verbose in [(5, None), (5, False), (10, True)]: + for len_, verbose in [(5, None), (5, False), (12, True)]: # For verbose always ^ setting ^ summarize ^ full output with option_context("max_info_columns", 4): buf = StringIO() @@ -297,16 +328,16 @@ def test_info_max_cols(self): res = buf.getvalue() assert len(res.strip().split("\n")) == len_ - for len_, verbose in [(10, None), (5, False), (10, True)]: + for len_, verbose in [(12, None), (5, False), (12, True)]: - # max_cols no 
exceeded + # max_cols not exceeded with option_context("max_info_columns", 5): buf = StringIO() df.info(buf=buf, verbose=verbose) res = buf.getvalue() assert len(res.strip().split("\n")) == len_ - for len_, max_cols in [(10, 5), (5, 4)]: + for len_, max_cols in [(12, 5), (5, 4)]: # setting truncates with option_context("max_info_columns", 4): buf = StringIO() diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 5acd681933914..60b7611c8b9be 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Period, Series, Timedelta, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameReshape: @@ -424,8 +424,8 @@ def test_stack_mixed_levels(self): # When mixed types are passed and the ints are not level # names, raise msg = ( - "level should contain all level names or all level numbers, not" - " a mixture of the two" + "level should contain all level names or all level numbers, not " + "a mixture of the two" ) with pytest.raises(ValueError, match=msg): df2.stack(level=["animal", 0]) @@ -1128,3 +1128,34 @@ def test_stack_timezone_aware_values(): ), ) tm.assert_series_equal(result, expected) + + +def test_unstacking_multi_index_df(): + # see gh-30740 + df = DataFrame( + { + "name": ["Alice", "Bob"], + "score": [9.5, 8], + "employed": [False, True], + "kids": [0, 0], + "gender": ["female", "male"], + } + ) + df = df.set_index(["name", "employed", "kids", "gender"]) + df = df.unstack(["gender"], fill_value=0) + expected = df.unstack("employed", fill_value=0).unstack("kids", fill_value=0) + result = df.unstack(["employed", "kids"], fill_value=0) + expected = DataFrame( + [[9.5, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 8.0]], + index=Index(["Alice", "Bob"], name="name"), + columns=MultiIndex.from_tuples( + [ + ("score", "female", False, 0), + ("score", "female", True, 0), + ("score", "male", False, 0), + ("score", "male", True, 0), + ], + names=[None, "gender", "employed", "kids"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/test_sort_values_level_as_str.py b/pandas/tests/frame/test_sort_values_level_as_str.py index b0287d9180859..40526ab27ac9a 100644 --- a/pandas/tests/frame/test_sort_values_level_as_str.py +++ b/pandas/tests/frame/test_sort_values_level_as_str.py @@ -4,7 +4,7 @@ from pandas.errors import PerformanceWarning from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index e1e546256f7cd..4a436d70dc48f 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestDataFrameSubclassing: diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index b9df3ce305dbc..e89f4ee07ea00 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -16,7 +16,7 @@ period_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.tseries.offsets as offsets @@ -27,62 +27,6 @@ def close_open_fixture(request): class TestDataFrameTimeSeriesMethods: - def test_pct_change(self, datetime_frame): - rs = datetime_frame.pct_change(fill_method=None) - 
tm.assert_frame_equal(rs, datetime_frame / datetime_frame.shift(1) - 1) - - rs = datetime_frame.pct_change(2) - filled = datetime_frame.fillna(method="pad") - tm.assert_frame_equal(rs, filled / filled.shift(2) - 1) - - rs = datetime_frame.pct_change(fill_method="bfill", limit=1) - filled = datetime_frame.fillna(method="bfill", limit=1) - tm.assert_frame_equal(rs, filled / filled.shift(1) - 1) - - rs = datetime_frame.pct_change(freq="5D") - filled = datetime_frame.fillna(method="pad") - tm.assert_frame_equal( - rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) - ) - - def test_pct_change_shift_over_nas(self): - s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) - - df = DataFrame({"a": s, "b": s}) - - chg = df.pct_change() - expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) - edf = DataFrame({"a": expected, "b": expected}) - tm.assert_frame_equal(chg, edf) - - @pytest.mark.parametrize( - "freq, periods, fill_method, limit", - [ - ("5B", 5, None, None), - ("3B", 3, None, None), - ("3B", 3, "bfill", None), - ("7B", 7, "pad", 1), - ("7B", 7, "bfill", 3), - ("14B", 14, None, None), - ], - ) - def test_pct_change_periods_freq( - self, datetime_frame, freq, periods, fill_method, limit - ): - # GH 7292 - rs_freq = datetime_frame.pct_change( - freq=freq, fill_method=fill_method, limit=limit - ) - rs_periods = datetime_frame.pct_change( - periods, fill_method=fill_method, limit=limit - ) - tm.assert_frame_equal(rs_freq, rs_periods) - - empty_ts = DataFrame(index=datetime_frame.index, columns=datetime_frame.columns) - rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) - rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) - tm.assert_frame_equal(rs_freq, rs_periods) - def test_frame_ctor_datetime64_column(self): rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s") dates = np.asarray(rng) diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py index 26ab4ff0ded85..b60f2052a988f 100644 --- a/pandas/tests/frame/test_timezones.py +++ b/pandas/tests/frame/test_timezones.py @@ -11,8 +11,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm class TestDataFrameTimezones: diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 5c39dcc1a7659..aeff92971b42a 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -18,8 +18,8 @@ read_csv, to_datetime, ) +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm from pandas.io.common import get_handle diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py index 270a7c70a2e81..7fe22e77c5bf3 100644 --- a/pandas/tests/generic/test_frame.py +++ b/pandas/tests/generic/test_frame.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm from .test_generic import Generic @@ -196,7 +196,7 @@ def test_set_attribute(self): def test_to_xarray_index_types(self, index): from xarray import Dataset - index = getattr(tm, "make{}".format(index)) + index = getattr(tm, f"make{index}") df = DataFrame( { "a": list("abc"), @@ -222,11 +222,10 @@ def test_to_xarray_index_types(self, index): # idempotency # categoricals are not preserved - # datetimes w/tz are not preserved + # datetimes w/tz are preserved # column 
names are lost expected = df.copy() expected["f"] = expected["f"].astype(object) - expected["h"] = expected["h"].astype("datetime64[ns]") expected.columns.name = None tm.assert_frame_equal( result.to_dataframe(), @@ -271,7 +270,6 @@ def test_to_xarray(self): result = result.to_dataframe() expected = df.copy() expected["f"] = expected["f"].astype(object) - expected["h"] = expected["h"].astype("datetime64[ns]") expected.columns.name = None tm.assert_frame_equal(result, expected, check_index_type=False) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 0ff9d7fcdb209..efb04c7f63c66 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm # ---------------------------------------------------------------------- # Generic types test cases @@ -125,7 +125,7 @@ def test_nonzero(self): # GH 4633 # look at the boolean/nonzero behavior for objects obj = self._construct(shape=4) - msg = "The truth value of a {} is ambiguous".format(self._typ.__name__) + msg = f"The truth value of a {self._typ.__name__} is ambiguous" with pytest.raises(ValueError, match=msg): bool(obj == 0) with pytest.raises(ValueError, match=msg): @@ -203,9 +203,9 @@ def test_constructor_compound_dtypes(self): def f(dtype): return self._construct(shape=3, value=1, dtype=dtype) - msg = "compound dtypes are not implemented in the {} constructor".format( - self._typ.__name__ - ) + msg = "compound dtypes are not implemented" + f"in the {self._typ.__name__} constructor" + with pytest.raises(NotImplementedError, match=msg): f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")]) @@ -548,9 +548,6 @@ def test_validate_bool_args(self): with pytest.raises(ValueError): super(DataFrame, df).drop("a", axis=1, inplace=value) - with pytest.raises(ValueError): - super(DataFrame, df).sort_index(inplace=value) - with pytest.raises(ValueError): super(DataFrame, df)._consolidate(inplace=value) @@ -820,6 +817,18 @@ def test_take_invalid_kwargs(self): with pytest.raises(ValueError, match=msg): obj.take(indices, mode="clip") + def test_depr_take_kwarg_is_copy(self): + # GH 27357 + df = DataFrame({"A": [1, 2, 3]}) + msg = ( + "is_copy is deprecated and will be removed in a future version. " + "take will always return a copy in the future." 
+ ) + with tm.assert_produces_warning(FutureWarning) as w: + df.take([0, 1], is_copy=True) + + assert w[0].message.args[0] == msg + def test_equals(self): s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) s2 = s1.copy() diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py index aaf523956aaed..8ad8355f2d530 100644 --- a/pandas/tests/generic/test_series.py +++ b/pandas/tests/generic/test_series.py @@ -8,7 +8,7 @@ import pandas as pd from pandas import MultiIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm from .test_generic import Generic @@ -205,7 +205,7 @@ def finalize(self, other, method=None, **kwargs): def test_to_xarray_index_types(self, index): from xarray import DataArray - index = getattr(tm, "make{}".format(index)) + index = getattr(tm, f"make{index}") s = Series(range(6), index=index(6)) s.index.name = "foo" result = s.to_xarray() diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 0d8379407fef7..0b72a61ed84de 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -8,10 +8,10 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat +import pandas._testing as tm from pandas.core.base import SpecificationError from pandas.core.groupby.generic import _make_unique, _maybe_mangle_lambdas from pandas.core.groupby.grouper import Grouping -import pandas.util.testing as tm def test_agg_regression1(tsframe): diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 5d50c044cf9f5..5ddda264642de 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -7,8 +7,8 @@ import pandas as pd from pandas import DataFrame, Index, NaT, Series, Timedelta, Timestamp, bdate_range +import pandas._testing as tm from pandas.core.groupby.groupby import DataError -import pandas.util.testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 765bc3bab5d4a..52ee3e652501c 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -18,15 +18,15 @@ date_range, period_range, ) +import pandas._testing as tm from pandas.core.base import SpecificationError -import pandas.util.testing as tm from pandas.io.formats.printing import pprint_thing def test_agg_api(): # GH 6337 - # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error + # https://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error # different api for agg when passed custom function with mixed frame df = DataFrame( @@ -473,8 +473,7 @@ def test_agg_timezone_round_trip(): assert result3 == ts dates = [ - pd.Timestamp("2016-01-0{i:d} 12:00:00".format(i=i), tz="US/Pacific") - for i in range(1, 5) + pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5) ] df = pd.DataFrame({"A": ["a", "b"] * 2, "B": dates}) grouped = df.groupby("A") diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py index 5b8cc86513954..8901af7a90acc 100644 --- a/pandas/tests/groupby/conftest.py +++ b/pandas/tests/groupby/conftest.py @@ -2,8 +2,8 @@ import pytest from pandas import DataFrame, MultiIndex +import pandas._testing as tm from pandas.core.groupby.base import reduction_kernels, 
transformation_kernels -import pandas.util.testing as tm @pytest.fixture diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 0e62569fffeb6..e81ff37510dc0 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, bdate_range -import pandas.util.testing as tm +import pandas._testing as tm def test_apply_issues(): @@ -265,7 +265,7 @@ def desc3(group): result = group.describe() # names are different - result.index.name = "stat_{:d}".format(len(group)) + result.index.name = f"stat_{len(group):d}" result = result[: len(group)] # weirdo @@ -467,6 +467,29 @@ def filt2(x): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("test_series", [True, False]) +def test_apply_with_duplicated_non_sorted_axis(test_series): + # GH 30667 + df = pd.DataFrame( + [["x", "p"], ["x", "p"], ["x", "o"]], columns=["X", "Y"], index=[1, 2, 2] + ) + if test_series: + ser = df.set_index("Y")["X"] + result = ser.groupby(level=0).apply(lambda x: x) + + # not expecting the order to remain the same for duplicated axis + result = result.sort_index() + expected = ser.sort_index() + tm.assert_series_equal(result, expected) + else: + result = df.groupby("Y").apply(lambda x: x) + + # not expecting the order to remain the same for duplicated axis + result = result.sort_values("Y") + expected = df.sort_values("Y") + tm.assert_frame_equal(result, expected) + + def test_apply_corner_cases(): # #535, can't use sliding iterator @@ -686,6 +709,17 @@ def test_apply_with_mixed_types(): tm.assert_frame_equal(result, expected) +def test_func_returns_object(): + # GH 28652 + df = DataFrame({"a": [1, 2]}, index=pd.Int64Index([1, 2])) + result = df.groupby("a").apply(lambda g: g.index) + expected = Series( + [pd.Int64Index([1]), pd.Int64Index([2])], index=pd.Int64Index([1, 2], name="a") + ) + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "group_column_dtlike", [datetime.today(), datetime.today().date(), datetime.today().time()], @@ -703,3 +737,41 @@ def test_apply_datetime_issue(group_column_dtlike): ["spam"], Index(["foo"], dtype="object", name="a"), columns=[42] ) tm.assert_frame_equal(result, expected) + + +def test_apply_series_return_dataframe_groups(): + # GH 10078 + tdf = DataFrame( + { + "day": { + 0: pd.Timestamp("2015-02-24 00:00:00"), + 1: pd.Timestamp("2015-02-24 00:00:00"), + 2: pd.Timestamp("2015-02-24 00:00:00"), + 3: pd.Timestamp("2015-02-24 00:00:00"), + 4: pd.Timestamp("2015-02-24 00:00:00"), + }, + "userAgent": { + 0: "some UA string", + 1: "some UA string", + 2: "some UA string", + 3: "another UA string", + 4: "some UA string", + }, + "userId": { + 0: "17661101", + 1: "17661101", + 2: "17661101", + 3: "17661101", + 4: "17661101", + }, + } + ) + + def most_common_values(df): + return Series({c: s.value_counts().index[0] for c, s in df.iteritems()}) + + result = tdf.groupby("day").apply(most_common_values)["userId"] + expected = pd.Series( + ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index fcdf599e4ba33..ad71f73e80e64 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.common import ensure_int64 from pandas import Index, Series, isna -import pandas.util.testing 
as tm +import pandas._testing as tm def test_series_grouper(): @@ -87,7 +87,7 @@ def _check(dtype): counts = np.zeros(len(out), dtype=np.int64) labels = ensure_int64(np.repeat(np.arange(3), np.diff(np.r_[0, bins]))) - func = getattr(groupby, "group_ohlc_{dtype}".format(dtype=dtype)) + func = getattr(groupby, f"group_ohlc_{dtype}") func(out, counts, obj[:, None], labels) def _ohlc(group): diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 89ffcd9ee313e..9323946581a0d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -15,7 +15,7 @@ Series, qcut, ) -import pandas.util.testing as tm +import pandas._testing as tm def cartesian_product_for_groupers(result, args, names): @@ -497,10 +497,10 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort): aggr[aggr.isna()] = "missing" if not all(label == aggr): msg = ( - "Labels and aggregation results not consistently sorted\n" - + "for (ordered={}, observed={}, sort={})\n" - + "Result:\n{}" - ).format(ordered, observed, sort, result) + f"Labels and aggregation results not consistently sorted\n" + + f"for (ordered={ordered}, observed={observed}, sort={sort})\n" + + f"Result:\n{result}" + ) assert False, msg @@ -798,14 +798,14 @@ def test_groupby_empty_with_category(): def test_sort(): - # http://stackoverflow.com/questions/23814368/sorting-pandas- + # https://stackoverflow.com/questions/23814368/sorting-pandas- # categorical-labels-after-groupby # This should result in a properly sorted Series so that the plot # has a sorted x axis # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') df = DataFrame({"value": np.random.randint(0, 10000, 100)}) - labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] + labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) df = df.sort_values(by=["value"], ascending=True) @@ -1330,3 +1330,15 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o # If we expect unobserved values to be zero, we also expect the dtype to be int if zero_or_nan == 0: assert np.issubdtype(result.dtype, np.integer) + + +def test_series_groupby_categorical_aggregation_getitem(): + # GH 8870 + d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]} + df = pd.DataFrame(d) + cat = pd.cut(df["foo"], np.linspace(0, 20, 5)) + df["range"] = cat + groups = df.groupby(["range", "baz"], as_index=True, sort=True) + result = groups["foo"].agg("mean") + expected = groups.agg("mean")["foo"] + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py index 8e9554085b9ee..b4239d7d34a90 100644 --- a/pandas/tests/groupby/test_counting.py +++ b/pandas/tests/groupby/test_counting.py @@ -4,7 +4,7 @@ import pytest from pandas import DataFrame, MultiIndex, Period, Series, Timedelta, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestCounting: @@ -197,11 +197,8 @@ def test_ngroup_respects_groupby_order(self): @pytest.mark.parametrize( "datetimelike", [ - [ - Timestamp("2016-05-{i:02d} 20:09:25+00:00".format(i=i)) - for i in range(1, 4) - ], - [Timestamp("2016-05-{i:02d} 20:09:25".format(i=i)) for i in range(1, 4)], + [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)], + [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)], [Timedelta(x, unit="h") for x in range(1, 4)], 
[Period(freq="2W", year=2017, month=x) for x in range(1, 4)], ], diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index b3ee12b6691d7..c16ad812eb634 100644 --- a/pandas/tests/groupby/test_filters.py +++ b/pandas/tests/groupby/test_filters.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm def test_filter_series(): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 4ca23c61ba920..97cf1af1d2e9e 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -20,8 +20,9 @@ date_range, isna, ) +import pandas._testing as tm import pandas.core.nanops as nanops -from pandas.util import _test_decorators as td, testing as tm +from pandas.util import _test_decorators as td @pytest.mark.parametrize("agg_func", ["any", "all"]) @@ -102,9 +103,7 @@ def test_builtins_apply(keys, f): result = df.groupby(keys).apply(f) ngroups = len(df.drop_duplicates(subset=keys)) - assert_msg = "invalid frame shape: {} (expected ({}, 3))".format( - result.shape, ngroups - ) + assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))" assert result.shape == (ngroups, 3), assert_msg tm.assert_frame_equal( @@ -1398,6 +1397,35 @@ def test_quantile_array_multiple_levels(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)]) +@pytest.mark.parametrize("groupby", [[0], [0, 1]]) +@pytest.mark.parametrize("q", [[0.5, 0.6]]) +def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q): + # GH30289 + nrow, ncol = frame_size + df = pd.DataFrame( + np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol) + ) + + idx_levels = [list(range(min(nrow, 4)))] * len(groupby) + [q] + idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [ + list(range(len(q))) * min(nrow, 4) + ] + expected_index = pd.MultiIndex( + levels=idx_levels, codes=idx_codes, names=groupby + [None] + ) + expected_values = [ + [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q + ] + expected_columns = [x for x in range(ncol) if x not in groupby] + expected = pd.DataFrame( + expected_values, index=expected_index, columns=expected_columns + ) + result = df.groupby(groupby).quantile(q) + + tm.assert_frame_equal(result, expected) + + def test_quantile_raises(): df = pd.DataFrame( [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8f88f68c69f2b..7e374811d1960 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -9,9 +9,9 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv +import pandas._testing as tm from pandas.core.base import SpecificationError import pandas.core.common as com -import pandas.util.testing as tm def test_repr(): @@ -588,6 +588,20 @@ def test_groupby_multiple_columns(df, op): tm.assert_series_equal(result, expected) +def test_as_index_select_column(): + # GH 5764 + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) + result = df.groupby("A", as_index=False)["B"].get_group(1) + expected = pd.Series([2, 4], name="B") + tm.assert_series_equal(result, expected) + + result = df.groupby("A", as_index=False)["B"].apply(lambda x: x.cumsum()) + expected = pd.Series( + [2, 6, 6], 
name="B", index=pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)]) + ) + tm.assert_series_equal(result, expected) + + def test_groupby_as_index_agg(df): grouped = df.groupby("A", as_index=False) @@ -771,7 +785,7 @@ def test_omit_nuisance(df): # won't work with axis = 1 grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) - msg = r"unsupported operand type\(s\) for \+: 'Timestamp'" + msg = "reduction operation 'sum' not allowed for this dtype" with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) @@ -921,7 +935,7 @@ def test_mutate_groups(): + ["c"] * 2 + ["d"] * 2 + ["e"] * 2, - "cat3": ["g{}".format(x) for x in range(1, 15)], + "cat3": [f"g{x}" for x in range(1, 15)], "val": np.random.randint(100, size=14), } ) @@ -1715,9 +1729,7 @@ def test_pivot_table_values_key_error(): # This test is designed to replicate the error in issue #14938 df = pd.DataFrame( { - "eventDate": pd.date_range( - pd.datetime.today(), periods=20, freq="M" - ).tolist(), + "eventDate": pd.date_range(datetime.today(), periods=20, freq="M").tolist(), "thename": range(0, 20), } ) @@ -2011,3 +2023,10 @@ def test_groupby_crash_on_nunique(axis): expected = expected.T tm.assert_frame_equal(result, expected) + + +def test_groupby_list_level(): + # GH 9790 + expected = pd.DataFrame(np.arange(0, 9).reshape(3, 3)) + result = expected.groupby(level=[0]).mean() + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index f2af397357e4f..e424913804c33 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -13,8 +13,8 @@ Timestamp, date_range, ) +import pandas._testing as tm from pandas.core.groupby.grouper import Grouping -import pandas.util.testing as tm # selection # -------------------------------- @@ -71,14 +71,12 @@ def test_getitem_list_of_columns(self): ) result = df.groupby("A")[["C", "D"]].mean() - result2 = df.groupby("A")["C", "D"].mean() - result3 = df.groupby("A")[df.columns[2:4]].mean() + result2 = df.groupby("A")[df.columns[2:4]].mean() expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean() tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) def test_getitem_numeric_column_names(self): # GH #13731 @@ -91,14 +89,40 @@ def test_getitem_numeric_column_names(self): } ) result = df.groupby(0)[df.columns[1:3]].mean() - result2 = df.groupby(0)[2, 4].mean() - result3 = df.groupby(0)[[2, 4]].mean() + result2 = df.groupby(0)[[2, 4]].mean() expected = df.loc[:, [0, 2, 4]].groupby(0).mean() tm.assert_frame_equal(result, expected) tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) + + # per GH 23566 this should raise a FutureWarning + with tm.assert_produces_warning(FutureWarning): + df.groupby(0)[2, 4].mean() + + def test_getitem_single_list_of_columns(self, df): + # per GH 23566 this should raise a FutureWarning + with tm.assert_produces_warning(FutureWarning): + df.groupby("A")["C", "D"].mean() + + def test_getitem_single_column(self): + df = DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + "E": np.random.randn(8), + } + ) + + result = df.groupby("A")["C"].mean() + + as_frame = df.loc[:, ["A", "C"]].groupby("A").mean() + as_series = as_frame.iloc[:, 0] + expected = as_series + + 
tm.assert_series_equal(result, expected) # grouping @@ -701,10 +725,7 @@ def test_get_group(self): g.get_group("foo") with pytest.raises(ValueError, match=msg): g.get_group(("foo")) - msg = ( - "must supply a same-length tuple to get_group with multiple" - " grouping keys" - ) + msg = "must supply a same-length tuple to get_group with multiple grouping keys" with pytest.raises(ValueError, match=msg): g.get_group(("foo", "bar", "baz")) diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py index f5c8873ff9417..971a447b84cae 100644 --- a/pandas/tests/groupby/test_index_as_string.py +++ b/pandas/tests/groupby/test_index_as_string.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[["inner"], ["inner", "outer"]]) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index f83b284a35377..0f850f2e94581 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna -import pandas.util.testing as tm +import pandas._testing as tm def test_first_last_nth(df): @@ -89,6 +89,25 @@ def test_first_last_nth_dtypes(df_mixed_floats): assert f.dtype == "int64" +def test_first_strings_timestamps(): + # GH 11244 + test = pd.DataFrame( + { + pd.Timestamp("2012-01-01 00:00:00"): ["a", "b"], + pd.Timestamp("2012-01-02 00:00:00"): ["c", "d"], + "name": ["e", "e"], + "aaaa": ["f", "g"], + } + ) + result = test.groupby("name").first() + expected = DataFrame( + [["a", "c", "f"]], + columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]), + index=Index(["e"], name="name"), + ) + tm.assert_frame_equal(result, expected) + + def test_nth(): df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) g = df.groupby("A") diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 8f0df9051fc73..3461bf6e10662 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -3,8 +3,8 @@ import pandas as pd from pandas import DataFrame, Series, concat +import pandas._testing as tm from pandas.core.base import DataError -import pandas.util.testing as tm def test_rank_apply(): diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 109382d97440e..6b8bd9e805a0c 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -9,9 +9,9 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range +import pandas._testing as tm from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper -import pandas.util.testing as tm class TestGroupBy: diff --git a/pandas/tests/groupby/test_transform.py b/pandas/tests/groupby/test_transform.py index c46180c1d11cd..6c05c4038a829 100644 --- a/pandas/tests/groupby/test_transform.py +++ b/pandas/tests/groupby/test_transform.py @@ -18,8 +18,8 @@ concat, date_range, ) +import pandas._testing as tm from pandas.core.groupby.groupby import DataError -import pandas.util.testing as tm def assert_fp_equal(a, b): @@ -319,7 +319,7 @@ def test_dispatch_transform(tsframe): def test_transform_select_columns(df): f = lambda x: x.mean() - result = df.groupby("A")["C", "D"].transform(f) + result = df.groupby("A")[["C", "D"]].transform(f) selection = df[["C", "D"]] expected = 
selection.groupby(df["A"]).transform(f) @@ -765,9 +765,12 @@ def test_transform_with_non_scalar_group(): ], ) @pytest.mark.parametrize("agg_func", ["count", "rank", "size"]) -def test_transform_numeric_ret(cols, exp, comp_func, agg_func): +def test_transform_numeric_ret(cols, exp, comp_func, agg_func, request): if agg_func == "size" and isinstance(cols, list): - pytest.xfail("'size' transformation not supported with NDFrameGroupy") + # https://github.com/pytest-dev/pytest/issues/6300 + # workaround to xfail fixture/param permutations + reason = "'size' transformation not supported with NDFrameGroupy" + request.node.add_marker(pytest.mark.xfail(reason=reason)) # GH 19200 df = pd.DataFrame( @@ -874,27 +877,19 @@ def test_pad_stable_sorting(fill_method): ), ], ) -@pytest.mark.parametrize( - "periods,fill_method,limit", - [ - (1, "ffill", None), - (1, "ffill", 1), - (1, "bfill", None), - (1, "bfill", 1), - (-1, "ffill", None), - (-1, "ffill", 1), - (-1, "bfill", None), - (-1, "bfill", 1), - ], -) +@pytest.mark.parametrize("periods", [1, -1]) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill", None]) +@pytest.mark.parametrize("limit", [None, 1]) def test_pct_change(test_series, freq, periods, fill_method, limit): - # GH 21200, 21621 + # GH 21200, 21621, 30463 vals = [3, np.nan, np.nan, np.nan, 1, 2, 4, 10, np.nan, 4] keys = ["a", "b"] key_v = np.repeat(keys, len(vals)) df = DataFrame({"key": key_v, "vals": vals * 2}) - df_g = getattr(df.groupby("key"), fill_method)(limit=limit) + df_g = df + if fill_method is not None: + df_g = getattr(df.groupby("key"), fill_method)(limit=limit) grp = df_g.groupby(df.key) expected = grp["vals"].obj / grp["vals"].shift(periods) - 1 @@ -967,9 +962,7 @@ def demean_rename(x): if isinstance(x, pd.Series): return result - result = result.rename( - columns={c: "{}_demeaned".format(c) for c in result.columns} - ) + result = result.rename(columns={c: f"{c}_demeaned" for c in result.columns}) return result @@ -1138,3 +1131,40 @@ def func(grp): expected = pd.DataFrame([2, -2, 2, 4], columns=["B"]) tm.assert_frame_equal(result, expected) + + +def test_transform_lambda_indexing(): + # GH 7883 + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "flux", "foo", "flux"], + "B": ["one", "one", "two", "three", "two", "six", "five", "three"], + "C": range(8), + "D": range(8), + "E": range(8), + } + ) + df = df.set_index(["A", "B"]) + df = df.sort_index() + result = df.groupby(level="A").transform(lambda x: x.iloc[-1]) + expected = DataFrame( + { + "C": [3, 3, 7, 7, 4, 4, 4, 4], + "D": [3, 3, 7, 7, 4, 4, 4, 4], + "E": [3, 3, 7, 7, 4, 4, 4, 4], + }, + index=MultiIndex.from_tuples( + [ + ("bar", "one"), + ("bar", "three"), + ("flux", "six"), + ("flux", "three"), + ("foo", "five"), + ("foo", "one"), + ("foo", "two"), + ("foo", "two"), + ], + names=["A", "B"], + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index c76ee09f977b5..c86cb4532bc26 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -10,7 +10,7 @@ import pytest from pandas import DataFrame, Grouper, MultiIndex, Series, date_range, to_datetime -import pandas.util.testing as tm +import pandas._testing as tm # our starting frame @@ -47,7 +47,7 @@ def seed_df(seed_nans, n, m): keys = "1st", "2nd", ["1st", "2nd"] for k, b in product(keys, bins): binned.append((df, k, b, n, m)) - ids.append("{}-{}-{}".format(k, n, m)) + ids.append(f"{k}-{n}-{m}") 
@pytest.mark.slow diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 48ea2646c52fc..8e387e9202ef6 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -9,12 +9,12 @@ import pytest from pandas import DataFrame, Index, MultiIndex, Series, date_range +import pandas._testing as tm from pandas.core.groupby.base import ( groupby_other_methods, reduction_kernels, transformation_kernels, ) -import pandas.util.testing as tm AGG_FUNCTIONS = [ "sum", @@ -404,7 +404,7 @@ def test_all_methods_categorized(mframe): # new public method? if new_names: - msg = """ + msg = f""" There are uncatgeorized methods defined on the Grouper class: {names}. @@ -418,19 +418,19 @@ def test_all_methods_categorized(mframe): see the comments in pandas/core/groupby/base.py for guidance on how to fix this test. """ - raise AssertionError(msg.format(names=names)) + raise AssertionError(msg) # removed a public method? all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods print(names) print(all_categorized) if not (names == all_categorized): - msg = """ + msg = f""" Some methods which are supposed to be on the Grouper class are missing: -{names}. +{all_categorized - names}. They're still defined in one of the lists that live in pandas/core/groupby/base.py. If you removed a method, you should update them """ - raise AssertionError(msg.format(names=all_categorized - names)) + raise AssertionError(msg) diff --git a/pandas/tests/indexes/categorical/__init__.py b/pandas/tests/indexes/categorical/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/categorical/test_category.py similarity index 86% rename from pandas/tests/indexes/test_category.py rename to pandas/tests/indexes/categorical/test_category.py index 7286fca42848c..e027641288bb9 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -9,10 +9,10 @@ import pandas as pd from pandas import Categorical, IntervalIndex +import pandas._testing as tm from pandas.core.indexes.api import CategoricalIndex, Index -import pandas.util.testing as tm -from .common import Base +from ..common import Base class TestCategoricalIndex(Base): @@ -32,147 +32,6 @@ def test_can_hold_identifiers(self): key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is True - def test_construction(self): - - ci = self.create_index(categories=list("abcd")) - categories = ci.categories - - result = Index(ci) - tm.assert_index_equal(result, ci, exact=True) - assert not result.ordered - - result = Index(ci.values) - tm.assert_index_equal(result, ci, exact=True) - assert not result.ordered - - # empty - result = CategoricalIndex(categories=categories) - tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal(result.codes, np.array([], dtype="int8")) - assert not result.ordered - - # passing categories - result = CategoricalIndex(list("aabbca"), categories=categories) - tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal( - result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") - ) - - c = pd.Categorical(list("aabbca")) - result = CategoricalIndex(c) - tm.assert_index_equal(result.categories, Index(list("abc"))) - tm.assert_numpy_array_equal( - result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") - ) - assert not result.ordered - - result = CategoricalIndex(c, 
categories=categories) - tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal( - result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") - ) - assert not result.ordered - - ci = CategoricalIndex(c, categories=list("abcd")) - result = CategoricalIndex(ci) - tm.assert_index_equal(result.categories, Index(categories)) - tm.assert_numpy_array_equal( - result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") - ) - assert not result.ordered - - result = CategoricalIndex(ci, categories=list("ab")) - tm.assert_index_equal(result.categories, Index(list("ab"))) - tm.assert_numpy_array_equal( - result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") - ) - assert not result.ordered - - result = CategoricalIndex(ci, categories=list("ab"), ordered=True) - tm.assert_index_equal(result.categories, Index(list("ab"))) - tm.assert_numpy_array_equal( - result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") - ) - assert result.ordered - - result = pd.CategoricalIndex(ci, categories=list("ab"), ordered=True) - expected = pd.CategoricalIndex( - ci, categories=list("ab"), ordered=True, dtype="category" - ) - tm.assert_index_equal(result, expected, exact=True) - - # turn me to an Index - result = Index(np.array(ci)) - assert isinstance(result, Index) - assert not isinstance(result, CategoricalIndex) - - def test_construction_with_dtype(self): - - # specify dtype - ci = self.create_index(categories=list("abc")) - - result = Index(np.array(ci), dtype="category") - tm.assert_index_equal(result, ci, exact=True) - - result = Index(np.array(ci).tolist(), dtype="category") - tm.assert_index_equal(result, ci, exact=True) - - # these are generally only equal when the categories are reordered - ci = self.create_index() - - result = Index(np.array(ci), dtype="category").reorder_categories(ci.categories) - tm.assert_index_equal(result, ci, exact=True) - - # make sure indexes are handled - expected = CategoricalIndex([0, 1, 2], categories=[0, 1, 2], ordered=True) - idx = Index(range(3)) - result = CategoricalIndex(idx, categories=idx, ordered=True) - tm.assert_index_equal(result, expected, exact=True) - - def test_construction_empty_with_bool_categories(self): - # see gh-22702 - cat = pd.CategoricalIndex([], categories=[True, False]) - categories = sorted(cat.categories.tolist()) - assert categories == [False, True] - - def test_construction_with_categorical_dtype(self): - # construction with CategoricalDtype - # GH18109 - data, cats, ordered = "a a b b".split(), "c b a".split(), True - dtype = CategoricalDtype(categories=cats, ordered=ordered) - - result = CategoricalIndex(data, dtype=dtype) - expected = CategoricalIndex(data, categories=cats, ordered=ordered) - tm.assert_index_equal(result, expected, exact=True) - - # GH 19032 - result = Index(data, dtype=dtype) - tm.assert_index_equal(result, expected, exact=True) - - # error when combining categories/ordered and dtype kwargs - msg = "Cannot specify `categories` or `ordered` together with `dtype`." 
- with pytest.raises(ValueError, match=msg): - CategoricalIndex(data, categories=cats, dtype=dtype) - - with pytest.raises(ValueError, match=msg): - Index(data, categories=cats, dtype=dtype) - - with pytest.raises(ValueError, match=msg): - CategoricalIndex(data, ordered=ordered, dtype=dtype) - - with pytest.raises(ValueError, match=msg): - Index(data, ordered=ordered, dtype=dtype) - - def test_create_categorical(self): - # https://github.com/pandas-dev/pandas/pull/17513 - # The public CI constructor doesn't hit this code path with - # instances of CategoricalIndex, but we still want to test the code - ci = CategoricalIndex(["a", "b", "c"]) - # First ci is self, second ci is data. - result = CategoricalIndex._create_categorical(ci, ci) - expected = Categorical(["a", "b", "c"]) - tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize( "func,op_name", [ @@ -184,7 +43,7 @@ def test_create_categorical(self): (lambda idx: ["a", "b"] + idx, "__radd__"), ], ) - def test_disallow_set_ops(self, func, op_name): + def test_disallow_addsub_ops(self, func, op_name): # GH 10039 # set ops (+/-) raise TypeError idx = pd.Index(pd.Categorical(["a", "b"])) @@ -439,8 +298,8 @@ def test_insert(self): # invalid msg = ( - "cannot insert an item into a CategoricalIndex that is not" - " already an existing category" + "cannot insert an item into a CategoricalIndex that is not " + "already an existing category" ) with pytest.raises(TypeError, match=msg): ci.insert(0, "d") @@ -669,8 +528,8 @@ def test_get_indexer(self): tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) msg = ( - "method='pad' and method='backfill' not implemented yet for" - " CategoricalIndex" + "method='pad' and method='backfill' not implemented yet for " + "CategoricalIndex" ) with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="pad") @@ -814,8 +673,8 @@ def test_equals_categorical(self): ci1 == Index(["a", "b", "c"]) msg = ( - "categorical index comparisons must have the same categories" - " and ordered attributes" + "categorical index comparisons must have the same categories " + "and ordered attributes" "|" "Categoricals can only be compared if 'categories' are the same. 
" "Categories are different lengths" @@ -1116,3 +975,9 @@ def test_engine_type(self, dtype, engine_type): ci.values._codes = ci.values._codes.astype("int64") assert np.issubdtype(ci.codes.dtype, dtype) assert isinstance(ci._engine, engine_type) + + def test_getitem_2d_deprecated(self): + # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable + idx = self.create_index() + with pytest.raises(ValueError, match="cannot mask with array containing NA"): + idx[:, None] diff --git a/pandas/tests/indexes/categorical/test_constructors.py b/pandas/tests/indexes/categorical/test_constructors.py new file mode 100644 index 0000000000000..1df0874e2f947 --- /dev/null +++ b/pandas/tests/indexes/categorical/test_constructors.py @@ -0,0 +1,147 @@ +import numpy as np +import pytest + +from pandas import Categorical, CategoricalDtype, CategoricalIndex, Index +import pandas._testing as tm + + +class TestCategoricalIndexConstructors: + def test_construction(self): + + ci = CategoricalIndex(list("aabbca"), categories=list("abcd"), ordered=False) + categories = ci.categories + + result = Index(ci) + tm.assert_index_equal(result, ci, exact=True) + assert not result.ordered + + result = Index(ci.values) + tm.assert_index_equal(result, ci, exact=True) + assert not result.ordered + + # empty + result = CategoricalIndex(categories=categories) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal(result.codes, np.array([], dtype="int8")) + assert not result.ordered + + # passing categories + result = CategoricalIndex(list("aabbca"), categories=categories) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + + c = Categorical(list("aabbca")) + result = CategoricalIndex(c) + tm.assert_index_equal(result.categories, Index(list("abc"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + assert not result.ordered + + result = CategoricalIndex(c, categories=categories) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + assert not result.ordered + + ci = CategoricalIndex(c, categories=list("abcd")) + result = CategoricalIndex(ci) + tm.assert_index_equal(result.categories, Index(categories)) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") + ) + assert not result.ordered + + result = CategoricalIndex(ci, categories=list("ab")) + tm.assert_index_equal(result.categories, Index(list("ab"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") + ) + assert not result.ordered + + result = CategoricalIndex(ci, categories=list("ab"), ordered=True) + tm.assert_index_equal(result.categories, Index(list("ab"))) + tm.assert_numpy_array_equal( + result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") + ) + assert result.ordered + + result = CategoricalIndex(ci, categories=list("ab"), ordered=True) + expected = CategoricalIndex( + ci, categories=list("ab"), ordered=True, dtype="category" + ) + tm.assert_index_equal(result, expected, exact=True) + + # turn me to an Index + result = Index(np.array(ci)) + assert isinstance(result, Index) + assert not isinstance(result, CategoricalIndex) + + def test_construction_with_dtype(self): + + # specify dtype + ci = CategoricalIndex(list("aabbca"), categories=list("abc"), ordered=False) + + result = 
Index(np.array(ci), dtype="category") + tm.assert_index_equal(result, ci, exact=True) + + result = Index(np.array(ci).tolist(), dtype="category") + tm.assert_index_equal(result, ci, exact=True) + + # these are generally only equal when the categories are reordered + ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) + + result = Index(np.array(ci), dtype="category").reorder_categories(ci.categories) + tm.assert_index_equal(result, ci, exact=True) + + # make sure indexes are handled + expected = CategoricalIndex([0, 1, 2], categories=[0, 1, 2], ordered=True) + idx = Index(range(3)) + result = CategoricalIndex(idx, categories=idx, ordered=True) + tm.assert_index_equal(result, expected, exact=True) + + def test_construction_empty_with_bool_categories(self): + # see GH#22702 + cat = CategoricalIndex([], categories=[True, False]) + categories = sorted(cat.categories.tolist()) + assert categories == [False, True] + + def test_construction_with_categorical_dtype(self): + # construction with CategoricalDtype + # GH#18109 + data, cats, ordered = "a a b b".split(), "c b a".split(), True + dtype = CategoricalDtype(categories=cats, ordered=ordered) + + result = CategoricalIndex(data, dtype=dtype) + expected = CategoricalIndex(data, categories=cats, ordered=ordered) + tm.assert_index_equal(result, expected, exact=True) + + # GH#19032 + result = Index(data, dtype=dtype) + tm.assert_index_equal(result, expected, exact=True) + + # error when combining categories/ordered and dtype kwargs + msg = "Cannot specify `categories` or `ordered` together with `dtype`." + with pytest.raises(ValueError, match=msg): + CategoricalIndex(data, categories=cats, dtype=dtype) + + with pytest.raises(ValueError, match=msg): + Index(data, categories=cats, dtype=dtype) + + with pytest.raises(ValueError, match=msg): + CategoricalIndex(data, ordered=ordered, dtype=dtype) + + with pytest.raises(ValueError, match=msg): + Index(data, ordered=ordered, dtype=dtype) + + def test_create_categorical(self): + # GH#17513 The public CI constructor doesn't hit this code path with + # instances of CategoricalIndex, but we still want to test the code + ci = CategoricalIndex(["a", "b", "c"]) + # First ci is self, second ci is data. 
+ result = CategoricalIndex._create_categorical(ci, ci) + expected = Categorical(["a", "b", "c"]) + tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 102949fe3f05e..f3ebe8313d0c6 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -23,9 +23,9 @@ UInt64Index, isna, ) +import pandas._testing as tm from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin -import pandas.util.testing as tm class Base: @@ -37,8 +37,8 @@ class Base: def test_pickle_compat_construction(self): # need an object to create with msg = ( - r"Index\(\.\.\.\) must be called with a collection of some" - r" kind, None was passed|" + r"Index\(\.\.\.\) must be called with a collection of some " + r"kind, None was passed|" r"__new__\(\) missing 1 required positional argument: 'data'|" r"__new__\(\) takes at least 2 arguments \(1 given\)" ) @@ -103,6 +103,13 @@ def test_shift(self): with pytest.raises(NotImplementedError, match=msg): idx.shift(1, 2) + def test_constructor_name_unhashable(self): + # GH#29069 check that name is hashable + # See also same-named test in tests.series.test_constructors + idx = self.create_index() + with pytest.raises(TypeError, match="Index.name must be a hashable type"): + type(idx)(idx, name=[]) + def test_create_index_existing_name(self): # GH11193, when an existing index is passed, and a new name is not @@ -868,3 +875,19 @@ def test_engine_reference_cycle(self): nrefs_pre = len(gc.get_referrers(index)) index._engine assert len(gc.get_referrers(index)) == nrefs_pre + + def test_getitem_2d_deprecated(self): + # GH#30588 + idx = self.create_index() + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + res = idx[:, None] + + assert isinstance(res, np.ndarray), type(res) + + def test_contains_requires_hashable_raises(self): + idx = self.create_index() + with pytest.raises(TypeError, match="unhashable type"): + [] in idx + + with pytest.raises(TypeError): + {} in idx._engine diff --git a/pandas/tests/indexes/conftest.py b/pandas/tests/indexes/conftest.py index 2a9a8bf8d824f..e3e7ff4093b76 100644 --- a/pandas/tests/indexes/conftest.py +++ b/pandas/tests/indexes/conftest.py @@ -2,8 +2,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.core.indexes.api import Index, MultiIndex -import pandas.util.testing as tm indices_dict = { "unicode": tm.makeUnicodeIndex(100), diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index 6eedfca129856..3c72d34d84b28 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -3,7 +3,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from .common import Base diff --git a/pandas/tests/indexes/datetimes/test_astype.py b/pandas/tests/indexes/datetimes/test_astype.py index eabf293ae915f..6139726dc34e4 100644 --- a/pandas/tests/indexes/datetimes/test_astype.py +++ b/pandas/tests/indexes/datetimes/test_astype.py @@ -17,7 +17,7 @@ Timestamp, date_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestDatetimeIndex: diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 2f7ed3238b767..95d14ad4c86f7 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -1,4 +1,4 @@ 
-from datetime import timedelta +from datetime import datetime, timedelta from functools import partial from operator import attrgetter @@ -10,17 +10,9 @@ from pandas._libs.tslibs import OutOfBoundsDatetime, conversion import pandas as pd -from pandas import ( - DatetimeIndex, - Index, - Timestamp, - date_range, - datetime, - offsets, - to_datetime, -) +from pandas import DatetimeIndex, Index, Timestamp, date_range, offsets, to_datetime +import pandas._testing as tm from pandas.core.arrays import DatetimeArray, period_array -import pandas.util.testing as tm class TestDatetimeIndex: @@ -37,6 +29,25 @@ def test_freq_validation_with_nat(self, dt_cls): with pytest.raises(ValueError, match=msg): dt_cls([pd.NaT, pd.Timestamp("2011-01-01").value], freq="D") + # TODO: better place for tests shared by DTI/TDI? + @pytest.mark.parametrize( + "index", + [ + pd.date_range("2016-01-01", periods=5, tz="US/Pacific"), + pd.timedelta_range("1 Day", periods=5), + ], + ) + def test_shallow_copy_inherits_array_freq(self, index): + # If we pass a DTA/TDA to shallow_copy and dont specify a freq, + # we should inherit the array's freq, not our own. + array = index._data + + arr = array[[0, 3, 2, 4, 1]] + assert arr.freq is None + + result = index._shallow_copy(arr) + assert result.freq is None + def test_categorical_preserves_tz(self): # GH#18664 retain tz when going DTI-->Categorical-->DTI # TODO: parametrize over DatetimeIndex/DatetimeArray @@ -536,15 +547,15 @@ def test_constructor_coverage(self): # non-conforming msg = ( - "Inferred frequency None from passed values does not conform" - " to passed frequency D" + "Inferred frequency None from passed values does not conform " + "to passed frequency D" ) with pytest.raises(ValueError, match=msg): DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-04"], freq="D") msg = ( - "Of the four parameters: start, end, periods, and freq, exactly" - " three must be specified" + "Of the four parameters: start, end, periods, and freq, exactly " + "three must be specified" ) with pytest.raises(ValueError, match=msg): date_range(start="2011-01-01", freq="b") @@ -633,8 +644,8 @@ def test_constructor_dtype(self): ) msg = ( - "cannot supply both a tz and a timezone-naive dtype" - r" \(i\.e\. datetime64\[ns\]\)" + "cannot supply both a tz and a timezone-naive dtype " + r"\(i\.e\. datetime64\[ns\]\)" ) with pytest.raises(ValueError, match=msg): DatetimeIndex(idx, dtype="datetime64[ns]") @@ -711,7 +722,6 @@ def test_constructor_timestamp_near_dst(self): expected = DatetimeIndex([ts[0].to_pydatetime(), ts[1].to_pydatetime()]) tm.assert_index_equal(result, expected) - # TODO(GH-24559): Remove the xfail for the tz-aware case. 
@pytest.mark.parametrize("klass", [Index, DatetimeIndex]) @pytest.mark.parametrize("box", [np.array, partial(np.array, dtype=object), list]) @pytest.mark.parametrize( @@ -725,15 +735,10 @@ def test_constructor_with_int_tz(self, klass, box, tz, dtype): expected = klass([ts]) assert result == expected - # This is the desired future behavior - # Note: this xfail is not strict because the test passes with - # None or any of the UTC variants for tz_naive_fixture - @pytest.mark.xfail(reason="Future behavior", strict=False) - @pytest.mark.filterwarnings("ignore:\\n Passing:FutureWarning") def test_construction_int_rountrip(self, tz_naive_fixture): - # GH 12619 - # TODO(GH-24559): Remove xfail + # GH 12619, GH#24559 tz = tz_naive_fixture + result = 1293858000000000000 expected = DatetimeIndex([result], tz=tz).asi8[0] assert result == expected diff --git a/pandas/tests/indexes/datetimes/test_date_range.py b/pandas/tests/indexes/datetimes/test_date_range.py index 36cdaa8a6029b..4d0beecbbf5d3 100644 --- a/pandas/tests/indexes/datetimes/test_date_range.py +++ b/pandas/tests/indexes/datetimes/test_date_range.py @@ -14,7 +14,7 @@ import pandas as pd from pandas import DatetimeIndex, Timestamp, bdate_range, date_range, offsets -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import ( BDay, @@ -945,3 +945,19 @@ def test_range_with_millisecond_resolution(self, start_end): result = pd.date_range(start=start, end=end, periods=2, closed="left") expected = DatetimeIndex([start]) tm.assert_index_equal(result, expected) + + +def test_date_range_with_custom_holidays(): + # GH 30593 + freq = pd.offsets.CustomBusinessHour(start="15:00", holidays=["2020-11-26"]) + result = pd.date_range(start="2020-11-25 15:00", periods=4, freq=freq) + expected = pd.DatetimeIndex( + [ + "2020-11-25 15:00:00", + "2020-11-25 16:00:00", + "2020-11-27 15:00:00", + "2020-11-27 16:00:00", + ], + freq=freq, + ) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 03b9502be2735..ca18d6fbea11a 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Index, Timestamp, date_range, offsets -import pandas.util.testing as tm +import pandas._testing as tm randn = np.random.randn @@ -393,15 +393,13 @@ def test_asarray_tz_naive(self): # This shouldn't produce a warning. 
idx = pd.date_range("2000", periods=2) # M8[ns] by default - with tm.assert_produces_warning(None): - result = np.asarray(idx) + result = np.asarray(idx) expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) # optionally, object - with tm.assert_produces_warning(None): - result = np.asarray(idx, dtype=object) + result = np.asarray(idx, dtype=object) expected = np.array([pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-02")]) tm.assert_numpy_array_equal(result, expected) @@ -410,15 +408,12 @@ def test_asarray_tz_aware(self): tz = "US/Central" idx = pd.date_range("2000", periods=2, tz=tz) expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]") - # We warn by default and return an ndarray[M8[ns]] - with tm.assert_produces_warning(FutureWarning): - result = np.asarray(idx) + result = np.asarray(idx, dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) # Old behavior with no warning - with tm.assert_produces_warning(None): - result = np.asarray(idx, dtype="M8[ns]") + result = np.asarray(idx, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) @@ -426,8 +421,7 @@ def test_asarray_tz_aware(self): expected = np.array( [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] ) - with tm.assert_produces_warning(None): - result = np.asarray(idx, dtype=object) + result = np.asarray(idx, dtype=object) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index 2ff6853b98929..da1bd6f091d1a 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -2,7 +2,7 @@ import pytest from pandas import DatetimeIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm from ..datetimelike import DatetimeLike diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index 33a744cc25ca1..f34019e06fd5f 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import DatetimeIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm def test_to_native_types(): diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index cd5efc86320c2..f3c255d50aba1 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DatetimeIndex, Index, Timestamp, date_range, notna -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BDay, CDay @@ -86,7 +86,9 @@ def test_dti_business_getitem(self): def test_dti_business_getitem_matplotlib_hackaround(self): rng = pd.bdate_range(START, END) - values = rng[:, None] + with tm.assert_produces_warning(DeprecationWarning): + # GH#30588 multi-dimensional indexing deprecated + values = rng[:, None] expected = rng.values[:, None] tm.assert_numpy_array_equal(values, expected) @@ -110,7 +112,9 @@ def test_dti_custom_getitem(self): def test_dti_custom_getitem_matplotlib_hackaround(self): rng = pd.bdate_range(START, END, freq="C") - values = rng[:, None] + with tm.assert_produces_warning(DeprecationWarning): + # GH#30588 multi-dimensional indexing deprecated + values = rng[:, None] expected = rng.values[:, None] 
tm.assert_numpy_array_equal(values, expected) @@ -132,9 +136,32 @@ def test_where_other(self): i2 = i.copy() i2 = Index([pd.NaT, pd.NaT] + i[2:].tolist()) - result = i.where(notna(i2), i2.values) + result = i.where(notna(i2), i2._values) tm.assert_index_equal(result, i2) + def test_where_invalid_dtypes(self): + dti = pd.date_range("20130101", periods=3, tz="US/Eastern") + + i2 = dti.copy() + i2 = Index([pd.NaT, pd.NaT] + dti[2:].tolist()) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + # passing tz-naive ndarray to tzaware DTI + dti.where(notna(i2), i2.values) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + # passing tz-aware DTI to tznaive DTI + dti.tz_localize(None).where(notna(i2), i2) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + dti.where(notna(i2), i2.tz_localize(None).to_period("D")) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + dti.where(notna(i2), i2.asi8.view("timedelta64[ns]")) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + dti.where(notna(i2), i2.asi8) + def test_where_tz(self): i = pd.date_range("20130101", periods=3, tz="US/Eastern") result = i.where(notna(i)) @@ -317,7 +344,9 @@ def test_take_fill_value_with_timezone(self): class TestDatetimeIndex: - @pytest.mark.parametrize("null", [None, np.nan, pd.NaT]) + @pytest.mark.parametrize( + "null", [None, np.nan, np.datetime64("NaT"), pd.NaT, pd.NA] + ) @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) def test_insert_nat(self, tz, null): # GH#16537, GH#18295 (test missing) @@ -326,6 +355,12 @@ def test_insert_nat(self, tz, null): res = idx.insert(0, null) tm.assert_index_equal(res, expected) + @pytest.mark.parametrize("tz", [None, "UTC", "US/Eastern"]) + def test_insert_invalid_na(self, tz): + idx = pd.DatetimeIndex(["2017-01-01"], tz=tz) + with pytest.raises(TypeError, match="incompatible label"): + idx.insert(0, np.timedelta64("NaT")) + def test_insert(self): idx = DatetimeIndex(["2000-01-04", "2000-01-01", "2000-01-02"], name="idx") @@ -403,9 +438,9 @@ def test_insert(self): # see gh-7299 idx = date_range("1/1/2000", periods=3, freq="D", tz="Asia/Tokyo", name="idx") - with pytest.raises(ValueError): + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): idx.insert(3, pd.Timestamp("2000-01-04")) - with pytest.raises(ValueError): + with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"): idx.insert(3, datetime(2000, 1, 4)) with pytest.raises(ValueError): idx.insert(3, pd.Timestamp("2000-01-04", tz="US/Eastern")) @@ -457,7 +492,7 @@ def test_insert(self): def test_delete(self): idx = date_range(start="2000-01-01", periods=5, freq="M", name="idx") - # prserve freq + # preserve freq expected_0 = date_range(start="2000-02-01", periods=4, freq="M", name="idx") expected_4 = date_range(start="2000-01-01", periods=4, freq="M", name="idx") @@ -511,7 +546,7 @@ def test_delete(self): def test_delete_slice(self): idx = date_range(start="2000-01-01", periods=10, freq="D", name="idx") - # prserve freq + # preserve freq expected_0_2 = date_range(start="2000-01-04", periods=7, freq="D", name="idx") expected_7_9 = date_range(start="2000-01-01", periods=7, freq="D", name="idx") @@ -582,6 +617,27 @@ def test_delete_slice(self): assert result.freq == expected.freq assert result.tz == expected.tz + def test_get_value(self): + # specifically make sure we have test for np.datetime64 key + dti = pd.date_range("2016-01-01", periods=3) + + arr = np.arange(6, 
9) + ser = pd.Series(arr, index=dti) + + key = dti[1] + + with pytest.raises(AttributeError, match="has no attribute '_values'"): + dti.get_value(arr, key) + + result = dti.get_value(ser, key) + assert result == 7 + + result = dti.get_value(ser, key.to_pydatetime()) + assert result == 7 + + result = dti.get_value(ser, key.to_datetime64()) + assert result == 7 + def test_get_loc(self): idx = pd.date_range("2000-01-01", periods=3) diff --git a/pandas/tests/indexes/datetimes/test_misc.py b/pandas/tests/indexes/datetimes/test_misc.py index c144f2a447ed3..340f53b2868bd 100644 --- a/pandas/tests/indexes/datetimes/test_misc.py +++ b/pandas/tests/indexes/datetimes/test_misc.py @@ -1,4 +1,5 @@ import calendar +from datetime import datetime import locale import unicodedata @@ -6,8 +7,8 @@ import pytest import pandas as pd -from pandas import DatetimeIndex, Index, Timestamp, date_range, datetime, offsets -import pandas.util.testing as tm +from pandas import DatetimeIndex, Index, Timestamp, date_range, offsets +import pandas._testing as tm class TestTimeSeries: diff --git a/pandas/tests/indexes/datetimes/test_missing.py b/pandas/tests/indexes/datetimes/test_missing.py index 6d94319b33b02..3399c8eaf6750 100644 --- a/pandas/tests/indexes/datetimes/test_missing.py +++ b/pandas/tests/indexes/datetimes/test_missing.py @@ -1,7 +1,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestDatetimeIndex: diff --git a/pandas/tests/indexes/datetimes/test_ops.py b/pandas/tests/indexes/datetimes/test_ops.py index fb032947143d3..ecd4ace705e9e 100644 --- a/pandas/tests/indexes/datetimes/test_ops.py +++ b/pandas/tests/indexes/datetimes/test_ops.py @@ -16,8 +16,8 @@ bdate_range, date_range, ) +import pandas._testing as tm from pandas.tests.base.test_ops import Ops -import pandas.util.testing as tm from pandas.tseries.offsets import BDay, BMonthEnd, CDay, Day, Hour diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 8d5aa64a49cf2..946d658e90132 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -16,8 +16,8 @@ Timestamp, date_range, ) +import pandas._testing as tm from pandas.core.indexing import IndexingError -import pandas.util.testing as tm class TestSlicing: @@ -142,6 +142,26 @@ def test_slice_year(self): expected = slice(3288, 3653) assert result == expected + @pytest.mark.parametrize( + "partial_dtime", + [ + "2019", + "2019Q4", + "Dec 2019", + "2019-12-31", + "2019-12-31 23", + "2019-12-31 23:59", + ], + ) + def test_slice_end_of_period_resolution(self, partial_dtime): + # GH#31064 + dti = date_range("2019-12-31 23:59:55.999999999", periods=10, freq="s") + + ser = pd.Series(range(10), index=dti) + result = ser[partial_dtime] + expected = ser.iloc[:5] + tm.assert_series_equal(result, expected) + def test_slice_quarter(self): dti = date_range(freq="D", start=datetime(2000, 6, 1), periods=500) diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 62383555f6048..84eee2419f0b8 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DatetimeIndex, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.frequencies import to_offset diff --git 
a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 3fb39b2081d83..78188c54b1d85 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -16,7 +16,7 @@ date_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BMonthEnd, Minute, MonthEnd @@ -163,6 +163,21 @@ def test_union_freq_both_none(self, sort): tm.assert_index_equal(result, expected) assert result.freq is None + def test_union_freq_infer(self): + # When taking the union of two DatetimeIndexes, we infer + # a freq even if the arguments don't have freq. This matches + # TimedeltaIndex behavior. + dti = pd.date_range("2016-01-01", periods=5) + left = dti[[0, 1, 3, 4]] + right = dti[[2, 3, 1]] + + assert left.freq is None + assert right.freq is None + + result = left.union(right) + tm.assert_index_equal(result, dti) + assert result.freq == "D" + def test_union_dataframe_index(self): rng1 = date_range("1/1/1999", "1/1/2012", freq="MS") s1 = Series(np.random.randn(len(rng1)), rng1) diff --git a/pandas/tests/indexes/datetimes/test_shift.py b/pandas/tests/indexes/datetimes/test_shift.py index 6f8315debdfa9..1c87995931c62 100644 --- a/pandas/tests/indexes/datetimes/test_shift.py +++ b/pandas/tests/indexes/datetimes/test_shift.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import DatetimeIndex, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDatetimeIndexShift: diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index 3f942f9b79428..df64820777f3f 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -22,7 +22,7 @@ isna, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm class FixedOffset(tzinfo): @@ -573,13 +573,7 @@ def test_dti_construction_ambiguous_endpoint(self, tz): "2013-10-26 23:00", "2013-10-27 01:00", freq="H", tz=tz, ambiguous="infer" ) assert times[0] == Timestamp("2013-10-26 23:00", tz=tz, freq="H") - - if str(tz).startswith("dateutil"): - # fixed ambiguous behavior - # see GH#14621 - assert times[-1] == Timestamp("2013-10-27 01:00:00+0100", tz=tz, freq="H") - else: - assert times[-1] == Timestamp("2013-10-27 01:00:00+0000", tz=tz, freq="H") + assert times[-1] == Timestamp("2013-10-27 01:00:00+0000", tz=tz, freq="H") @pytest.mark.parametrize( "tz, option, expected", diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py index 6e919571d1423..a5332eaea0432 100644 --- a/pandas/tests/indexes/datetimes/test_tools.py +++ b/pandas/tests/indexes/datetimes/test_tools.py @@ -30,9 +30,9 @@ isna, to_datetime, ) +import pandas._testing as tm from pandas.core.arrays import DatetimeArray from pandas.core.tools import datetimes as tools -import pandas.util.testing as tm class TestTimeConversionFormats: @@ -616,8 +616,8 @@ def test_to_datetime_tz(self, cache): pd.Timestamp("2013-01-02 14:00:00", tz="US/Eastern"), ] msg = ( - "Tz-aware datetime.datetime cannot be converted to datetime64" - " unless utc=True" + "Tz-aware datetime.datetime cannot be " + "converted to datetime64 unless utc=True" ) with pytest.raises(ValueError, match=msg): pd.to_datetime(arr, cache=cache) @@ -720,13 +720,11 @@ def test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date, dtype): tm.assert_series_equal(result, expected) 
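# Illustrative sketch of the union-frequency inference exercised by the new
# test_union_freq_infer in test_setops.py earlier in this patch (assumes a
# pandas build that includes this change): DatetimeIndex.union re-infers a
# daily frequency even though neither operand carries a freq of its own.
import pandas as pd

dti = pd.date_range("2016-01-01", periods=5)
left = dti[[0, 1, 3, 4]]    # positional indexing drops the freq
right = dti[[2, 3, 1]]
assert left.freq is None and right.freq is None

result = left.union(right)
# With the change applied, result equals dti and result.freq == "D"
print(result.freq)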
@pytest.mark.parametrize("cache", [True, False]) + @td.skip_if_no("psycopg2") def test_to_datetime_tz_psycopg2(self, cache): # xref 8260 - try: - import psycopg2 - except ImportError: - pytest.skip("no psycopg2 installed") + import psycopg2 # misc cases tz1 = psycopg2.tz.FixedOffsetTimezone(offset=-300, name=None) @@ -1300,7 +1298,7 @@ def test_dataframe(self, cache): tm.assert_series_equal(result, expected) # extra columns - msg = "extra keys have been passed to the datetime assemblage: " r"\[foo\]" + msg = r"extra keys have been passed to the datetime assemblage: \[foo\]" with pytest.raises(ValueError, match=msg): df2 = df.copy() df2["foo"] = 1 @@ -1876,7 +1874,7 @@ def test_parsers(self, date_str, expected, cache): # https://github.com/dateutil/dateutil/issues/217 yearfirst = True - result1, _, _ = parsing.parse_time_string(date_str, yearfirst=yearfirst) + result1, _ = parsing.parse_time_string(date_str, yearfirst=yearfirst) result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) # result5 is used below @@ -1912,7 +1910,7 @@ def test_na_values_with_cache( def test_parsers_nat(self): # Test that each of several string-accepting methods return pd.NaT - result1, _, _ = parsing.parse_time_string("NaT") + result1, _ = parsing.parse_time_string("NaT") result2 = to_datetime("NaT") result3 = Timestamp("NaT") result4 = DatetimeIndex(["NaT"])[0] @@ -1988,7 +1986,7 @@ def test_parsers_dayfirst_yearfirst(self, cache): ) assert dateutil_result == expected - result1, _, _ = parsing.parse_time_string( + result1, _ = parsing.parse_time_string( date_str, dayfirst=dayfirst, yearfirst=yearfirst ) @@ -2018,7 +2016,7 @@ def test_parsers_timestring(self, cache): } for date_str, (exp_now, exp_def) in cases.items(): - result1, _, _ = parsing.parse_time_string(date_str) + result1, _ = parsing.parse_time_string(date_str) result2 = to_datetime(date_str) result3 = to_datetime([date_str]) result4 = Timestamp(date_str) @@ -2293,3 +2291,25 @@ def test_should_cache_errors(unique_share, check_count, err_message): with pytest.raises(AssertionError, match=err_message): tools.should_cache(arg, unique_share, check_count) + + +def test_nullable_integer_to_datetime(): + # Test for #30050 + ser = pd.Series([1, 2, None, 2 ** 61, None]) + ser = ser.astype("Int64") + ser_copy = ser.copy() + + res = pd.to_datetime(ser, unit="ns") + + expected = pd.Series( + [ + np.datetime64("1970-01-01 00:00:00.000000001"), + np.datetime64("1970-01-01 00:00:00.000000002"), + np.datetime64("NaT"), + np.datetime64("2043-01-25 23:56:49.213693952"), + np.datetime64("NaT"), + ] + ) + tm.assert_series_equal(res, expected) + # Check that ser isn't mutated + tm.assert_series_equal(ser, ser_copy) diff --git a/pandas/tests/indexes/interval/test_astype.py b/pandas/tests/indexes/interval/test_astype.py index 708cd8a4579e8..c94af6c0d533e 100644 --- a/pandas/tests/indexes/interval/test_astype.py +++ b/pandas/tests/indexes/interval/test_astype.py @@ -12,7 +12,7 @@ Timestamp, interval_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class Base: @@ -67,7 +67,7 @@ def test_astype_cannot_cast(self, index, dtype): index.astype(dtype) def test_astype_invalid_dtype(self, index): - msg = 'data type "fake_dtype" not understood' + msg = "data type [\"']fake_dtype[\"'] not understood" with pytest.raises(TypeError, match=msg): index.astype("fake_dtype") diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py index 339bdaf79c690..d8c2ba8413cfb 
100644 --- a/pandas/tests/indexes/interval/test_base.py +++ b/pandas/tests/indexes/interval/test_base.py @@ -2,8 +2,8 @@ import pytest from pandas import IntervalIndex, Series, date_range +import pandas._testing as tm from pandas.tests.indexes.common import Base -import pandas.util.testing as tm class TestBase(Base): @@ -79,3 +79,10 @@ def test_where(self, closed, klass): expected = IntervalIndex([np.nan] + idx[1:].tolist()) result = idx.where(klass(cond)) tm.assert_index_equal(result, expected) + + def test_getitem_2d_deprecated(self): + # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable + idx = self.create_index() + with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + idx[:, None] diff --git a/pandas/tests/indexes/interval/test_constructors.py b/pandas/tests/indexes/interval/test_constructors.py index 98c1f7c6c2a8a..837c124db2bed 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -19,9 +19,9 @@ period_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.arrays import IntervalArray import pandas.core.common as com -import pandas.util.testing as tm @pytest.fixture(params=[None, "foo"]) @@ -164,7 +164,7 @@ def test_generic_errors(self, constructor): constructor(dtype="int64", **filler) # invalid dtype - msg = 'data type "invalid" not understood' + msg = "data type [\"']invalid[\"'] not understood" with pytest.raises(TypeError, match=msg): constructor(dtype="invalid", **filler) diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py index dcc0c818182ab..7acf5c1e0906c 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, IntervalIndex, Series, Timedelta, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm class TestIntervalIndexRendering: diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 15ea9a6b62c20..1bfc58733a110 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -11,8 +11,8 @@ date_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.indexes.base import InvalidIndexError -import pandas.util.testing as tm class TestGetLoc: @@ -349,8 +349,8 @@ def test_slice_locs_with_interval(self): with pytest.raises( KeyError, match=re.escape( - '"Cannot get left slice bound for non-unique label:' - " Interval(0, 2, closed='right')\"" + '"Cannot get left slice bound for non-unique label: ' + "Interval(0, 2, closed='right')\"" ), ): index.slice_locs(start=Interval(0, 2), end=Interval(2, 4)) @@ -358,8 +358,8 @@ def test_slice_locs_with_interval(self): with pytest.raises( KeyError, match=re.escape( - '"Cannot get left slice bound for non-unique label:' - " Interval(0, 2, closed='right')\"" + '"Cannot get left slice bound for non-unique label: ' + "Interval(0, 2, closed='right')\"" ), ): index.slice_locs(start=Interval(0, 2)) @@ -369,8 +369,8 @@ def test_slice_locs_with_interval(self): with pytest.raises( KeyError, match=re.escape( - '"Cannot get right slice bound for non-unique label:' - " Interval(0, 2, closed='right')\"" + '"Cannot get right slice bound for non-unique label: ' + "Interval(0, 2, closed='right')\"" ), ): 
index.slice_locs(end=Interval(0, 2)) @@ -378,8 +378,8 @@ def test_slice_locs_with_interval(self): with pytest.raises( KeyError, match=re.escape( - '"Cannot get right slice bound for non-unique label:' - " Interval(0, 2, closed='right')\"" + '"Cannot get right slice bound for non-unique label: ' + "Interval(0, 2, closed='right')\"" ), ): index.slice_locs(start=Interval(2, 4), end=Interval(0, 2)) @@ -431,8 +431,8 @@ def test_slice_locs_with_ints_and_floats_errors(self, tuples, query): with pytest.raises( KeyError, match=( - "'can only get slices from an IntervalIndex if bounds are" - " non-overlapping and all monotonic increasing or decreasing'" + "'can only get slices from an IntervalIndex if bounds are " + "non-overlapping and all monotonic increasing or decreasing'" ), ): index.slice_locs(start, stop) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 6ad7dfb22f2b3..47a0ba7fe0f21 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -17,8 +17,8 @@ notna, timedelta_range, ) +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm @pytest.fixture(scope="class", params=[None, "foo"]) @@ -586,8 +586,8 @@ def test_missing_values(self, closed): assert idx.equals(idx2) msg = ( - "missing values must be missing in the same location both left" - " and right sides" + "missing values must be missing in the same location both left " + "and right sides" ) with pytest.raises(ValueError, match=msg): IntervalIndex.from_arrays( diff --git a/pandas/tests/indexes/interval/test_interval_range.py b/pandas/tests/indexes/interval/test_interval_range.py index b102444b4ec9c..2f28c33a3bbc6 100644 --- a/pandas/tests/indexes/interval/test_interval_range.py +++ b/pandas/tests/indexes/interval/test_interval_range.py @@ -15,7 +15,7 @@ interval_range, timedelta_range, ) -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import Day @@ -84,7 +84,7 @@ def test_constructor_timestamp(self, closed, name, freq, periods, tz): tm.assert_index_equal(result, expected) # GH 20976: linspace behavior defined from start/end/periods - if not breaks.freq.isAnchored() and tz is None: + if not breaks.freq.is_anchored() and tz is None: # matches expected only for non-anchored offsets and tz naive # (anchored/DST transitions cause unequal spacing in expected) result = interval_range( diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index 85dac5ea35950..476ec1dd10b4b 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ b/pandas/tests/indexes/interval/test_interval_tree.py @@ -6,7 +6,7 @@ from pandas._libs.interval import IntervalTree from pandas import compat -import pandas.util.testing as tm +import pandas._testing as tm def skipif_32bit(param): @@ -20,9 +20,7 @@ def skipif_32bit(param): return pytest.param(param, marks=marks) -@pytest.fixture( - scope="class", params=["int32", "int64", "float32", "float64", "uint64"] -) +@pytest.fixture(scope="class", params=["int64", "float64", "uint64"]) def dtype(request): return request.param @@ -39,12 +37,9 @@ def leaf_size(request): @pytest.fixture( params=[ np.arange(5, dtype="int64"), - np.arange(5, dtype="int32"), np.arange(5, dtype="uint64"), np.arange(5, dtype="float64"), - np.arange(5, dtype="float32"), np.array([0, 1, 2, 3, 4, np.nan], dtype="float64"), - np.array([0, 1, 2, 3, 4, np.nan], dtype="float32"), 
] ) def tree(request, leaf_size): @@ -53,18 +48,6 @@ def tree(request, leaf_size): class TestIntervalTree: - def test_get_loc(self, tree): - result = tree.get_loc(1) - expected = np.array([0], dtype="intp") - tm.assert_numpy_array_equal(result, expected) - - result = np.sort(tree.get_loc(2)) - expected = np.array([0, 1], dtype="intp") - tm.assert_numpy_array_equal(result, expected) - - with pytest.raises(KeyError, match="-1"): - tree.get_loc(-1) - def test_get_indexer(self, tree): result = tree.get_indexer(np.array([1.0, 5.5, 6.5])) expected = np.array([0, 4, -1], dtype="intp") @@ -75,6 +58,18 @@ def test_get_indexer(self, tree): ): tree.get_indexer(np.array([3.0])) + @pytest.mark.parametrize( + "dtype, target_value, target_dtype", + [("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")], + ) + def test_get_indexer_overflow(self, dtype, target_value, target_dtype): + left, right = np.array([0, 1], dtype=dtype), np.array([1, 2], dtype=dtype) + tree = IntervalTree(left, right) + + result = tree.get_indexer(np.array([target_value], dtype=target_dtype)) + expected = np.array([-1], dtype="intp") + tm.assert_numpy_array_equal(result, expected) + def test_get_indexer_non_unique(self, tree): indexer, missing = tree.get_indexer_non_unique(np.array([1.0, 2.0, 6.5])) @@ -94,14 +89,26 @@ def test_get_indexer_non_unique(self, tree): expected = np.array([2], dtype="intp") tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( + "dtype, target_value, target_dtype", + [("int64", 2 ** 63 + 1, "uint64"), ("uint64", -1, "int64")], + ) + def test_get_indexer_non_unique_overflow(self, dtype, target_value, target_dtype): + left, right = np.array([0, 2], dtype=dtype), np.array([1, 3], dtype=dtype) + tree = IntervalTree(left, right) + target = np.array([target_value], dtype=target_dtype) + + result_indexer, result_missing = tree.get_indexer_non_unique(target) + expected_indexer = np.array([-1], dtype="intp") + tm.assert_numpy_array_equal(result_indexer, expected_indexer) + + expected_missing = np.array([0], dtype="intp") + tm.assert_numpy_array_equal(result_missing, expected_missing) + def test_duplicates(self, dtype): left = np.array([0, 0, 0], dtype=dtype) tree = IntervalTree(left, left + 1) - result = np.sort(tree.get_loc(0.5)) - expected = np.array([0, 1, 2], dtype="intp") - tm.assert_numpy_array_equal(result, expected) - with pytest.raises( KeyError, match="'indexer does not intersect a unique set of intervals'" ): @@ -116,17 +123,6 @@ def test_duplicates(self, dtype): expected = np.array([], dtype="intp") tm.assert_numpy_array_equal(result, expected) - def test_get_loc_closed(self, closed): - tree = IntervalTree([0], [1], closed=closed) - for p, errors in [(0, tree.open_left), (1, tree.open_right)]: - if errors: - with pytest.raises(KeyError, match=str(p)): - tree.get_loc(p) - else: - result = tree.get_loc(p) - expected = np.array([0], dtype="intp") - tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize( "leaf_size", [skipif_32bit(1), skipif_32bit(10), skipif_32bit(100), 10000] ) @@ -147,10 +143,10 @@ def test_get_indexer_closed(self, closed, leaf_size): @pytest.mark.parametrize( "left, right, expected", [ - (np.array([0, 1, 4]), np.array([2, 3, 5]), True), - (np.array([0, 1, 2]), np.array([5, 4, 3]), True), + (np.array([0, 1, 4], dtype="int64"), np.array([2, 3, 5]), True), + (np.array([0, 1, 2], dtype="int64"), np.array([5, 4, 3]), True), (np.array([0, 1, np.nan]), np.array([5, 4, np.nan]), True), - (np.array([0, 2, 4]), np.array([1, 3, 5]), False), + 
(np.array([0, 2, 4], dtype="int64"), np.array([1, 3, 5]), False), (np.array([0, 2, np.nan]), np.array([1, 3, np.nan]), False), ], ) @@ -165,7 +161,7 @@ def test_is_overlapping(self, closed, order, left, right, expected): def test_is_overlapping_endpoints(self, closed, order): """shared endpoints are marked as overlapping""" # GH 23309 - left, right = np.arange(3), np.arange(1, 4) + left, right = np.arange(3, dtype="int64"), np.arange(1, 4) tree = IntervalTree(left[order], right[order], closed=closed) result = tree.is_overlapping expected = closed == "both" @@ -188,7 +184,7 @@ def test_is_overlapping_trivial(self, closed, left, right): @pytest.mark.skipif(compat.is_platform_32bit(), reason="GH 23440") def test_construction_overflow(self): # GH 25485 - left, right = np.arange(101), [np.iinfo(np.int64).max] * 101 + left, right = np.arange(101, dtype="int64"), [np.iinfo(np.int64).max] * 101 tree = IntervalTree(left, right) # pivot should be average of left/right medians diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index 89e733c30b1e3..3246ac6bafde9 100644 --- a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -2,7 +2,7 @@ import pytest from pandas import Index, IntervalIndex, Timestamp, interval_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(scope="class", params=[None, "foo"]) diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index a6d08c845d941..2db61d4f4b852 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Index, MultiIndex, date_range, period_range -import pandas.util.testing as tm +import pandas._testing as tm def test_shift(idx): @@ -218,7 +218,7 @@ def test_take_fill_value(): with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) - msg = "index -5 is out of bounds for size 4" + msg = "index -5 is out of bounds for( axis 0 with)? 
size 4" with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5])) @@ -334,8 +334,8 @@ def test_numpy_ufuncs(idx, func): else: expected_exception = TypeError msg = ( - "loop of ufunc does not support argument 0 of type tuple which" - f" has no callable {func.__name__} method" + "loop of ufunc does not support argument 0 of type tuple which " + f"has no callable {func.__name__} method" ) with pytest.raises(expected_exception, match=msg): func(idx) @@ -348,9 +348,9 @@ def test_numpy_ufuncs(idx, func): ) def test_numpy_type_funcs(idx, func): msg = ( - f"ufunc '{func.__name__}' not supported for the input types, and the inputs" - " could not be safely coerced to any supported types according to" - " the casting rule ''safe''" + f"ufunc '{func.__name__}' not supported for the input types, and the inputs " + "could not be safely coerced to any supported types according to " + "the casting rule ''safe''" ) with pytest.raises(TypeError, match=msg): func(idx) diff --git a/pandas/tests/indexes/multi/test_astype.py b/pandas/tests/indexes/multi/test_astype.py index 93fdeb10b849a..29908537fbe59 100644 --- a/pandas/tests/indexes/multi/test_astype.py +++ b/pandas/tests/indexes/multi/test_astype.py @@ -3,7 +3,7 @@ from pandas.core.dtypes.dtypes import CategoricalDtype -import pandas.util.testing as tm +import pandas._testing as tm def test_astype(idx): diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index b02f87dc4aacb..d92cff1e10496 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -2,7 +2,7 @@ import pytest from pandas import MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_numeric_compat(idx): diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 0e4d144c0fd34..2c4b3ce04f96d 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import Index, MultiIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm def test_constructor_single_level(): @@ -65,8 +65,8 @@ def test_constructor_mismatched_codes_levels(idx): MultiIndex(levels=levels, codes=codes) length_error = ( - r"On level 0, code max \(3\) >= length of level \(1\)\." - " NOTE: this index is in an inconsistent state" + r"On level 0, code max \(3\) >= length of level \(1\)\. 
" + "NOTE: this index is in an inconsistent state" ) label_error = r"Unequal code lengths: \[4, 2\]" code_value_error = r"On level 0, code value \(-2\) < -1" diff --git a/pandas/tests/indexes/multi/test_contains.py b/pandas/tests/indexes/multi/test_contains.py index 64d2859cd13db..49aa63210cd5e 100644 --- a/pandas/tests/indexes/multi/test_contains.py +++ b/pandas/tests/indexes/multi/test_contains.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_contains_top_level(): @@ -98,3 +98,27 @@ def test_isin_level_kwarg(): with pytest.raises(KeyError, match="'Level C not found'"): idx.isin(vals_1, level="C") + + +def test_contains_with_missing_value(): + # issue 19132 + idx = MultiIndex.from_arrays([[1, np.nan, 2]]) + assert np.nan in idx + + idx = MultiIndex.from_arrays([[1, 2], [np.nan, 3]]) + assert np.nan not in idx + assert (1, np.nan) in idx + + +@pytest.mark.parametrize( + "labels,expected,level", + [ + ([("b", np.nan)], np.array([False, False, True]), None,), + ([np.nan, "a"], np.array([True, True, False]), 0), + (["d", np.nan], np.array([False, True, True]), 1), + ], +) +def test_isin_multi_index_with_missing_value(labels, expected, level): + # GH 19132 + midx = MultiIndex.from_arrays([[np.nan, "a", "b"], ["c", "d", np.nan]]) + tm.assert_numpy_array_equal(midx.isin(labels, level=level), expected) diff --git a/pandas/tests/indexes/multi/test_conversion.py b/pandas/tests/indexes/multi/test_conversion.py index fab4f72dc153b..8956e6ed4996f 100644 --- a/pandas/tests/indexes/multi/test_conversion.py +++ b/pandas/tests/indexes/multi/test_conversion.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm def test_tolist(idx): diff --git a/pandas/tests/indexes/multi/test_copy.py b/pandas/tests/indexes/multi/test_copy.py index 12cd0db6936f5..1acc65aef8b8a 100644 --- a/pandas/tests/indexes/multi/test_copy.py +++ b/pandas/tests/indexes/multi/test_copy.py @@ -3,7 +3,7 @@ import pytest from pandas import MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def assert_multiindex_copied(copy, original): diff --git a/pandas/tests/indexes/multi/test_drop.py b/pandas/tests/indexes/multi/test_drop.py index 2c24c5bd57085..b909025b3f2f9 100644 --- a/pandas/tests/indexes/multi/test_drop.py +++ b/pandas/tests/indexes/multi/test_drop.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_drop(idx): @@ -108,8 +108,8 @@ def test_droplevel_list(): assert dropped.equals(expected) msg = ( - "Cannot remove 3 levels from an index with 3 levels: at least one" - " level must be left" + "Cannot remove 3 levels from an index with 3 levels: " + "at least one level must be left" ) with pytest.raises(ValueError, match=msg): index[:2].droplevel(["one", "two", "three"]) @@ -139,3 +139,52 @@ def test_drop_not_lexsorted(): tm.assert_index_equal(lexsorted_mi, not_lexsorted_mi) with tm.assert_produces_warning(PerformanceWarning): tm.assert_index_equal(lexsorted_mi.drop("a"), not_lexsorted_mi.drop("a")) + + +@pytest.mark.parametrize( + "msg,labels,level", + [ + (r"labels \[4\] not found in level", 4, "a"), + (r"labels \[7\] not found in level", 7, "b"), + ], +) +def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level): + # GH 8594 + mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = 
pd.Series([10, 20, 30], index=mi) + df = pd.DataFrame([10, 20, 30], index=mi) + + with pytest.raises(KeyError, match=msg): + s.drop(labels, level=level) + with pytest.raises(KeyError, match=msg): + df.drop(labels, level=level) + + +@pytest.mark.parametrize("labels,level", [(4, "a"), (7, "b")]) +def test_drop_errors_ignore(labels, level): + # GH 8594 + mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"]) + s = pd.Series([10, 20, 30], index=mi) + df = pd.DataFrame([10, 20, 30], index=mi) + + expected_s = s.drop(labels, level=level, errors="ignore") + tm.assert_series_equal(s, expected_s) + + expected_df = df.drop(labels, level=level, errors="ignore") + tm.assert_frame_equal(df, expected_df) + + +def test_drop_with_non_unique_datetime_index_and_invalid_keys(): + # GH 30399 + + # define dataframe with unique datetime index + df = pd.DataFrame( + np.random.randn(5, 3), + columns=["a", "b", "c"], + index=pd.date_range("2012", freq="H", periods=5), + ) + # create dataframe with non-unique datetime index + df = df.iloc[[0, 2, 2, 3]].copy() + + with pytest.raises(KeyError, match="not found in axis"): + df.drop(["a", "b"]) # Dropping with labels not exist in the index diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index ee1f068b92df1..93e1de535835f 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -6,7 +6,7 @@ from pandas._libs import hashtable from pandas import DatetimeIndex, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("names", [None, ["first", "second"]]) diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index c81af5a0c6c49..063ede028add7 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Index, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm def test_equals(idx): diff --git a/pandas/tests/indexes/multi/test_format.py b/pandas/tests/indexes/multi/test_format.py index 3a8063aed8d20..75f23fb2f32ba 100644 --- a/pandas/tests/indexes/multi/test_format.py +++ b/pandas/tests/indexes/multi/test_format.py @@ -4,7 +4,7 @@ import pandas as pd from pandas import MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_format(idx): diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index ec3c654ecb1ed..074072ae581b2 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import CategoricalIndex, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def assert_matching(actual, expected, check_dtype=False): diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 9ef2a77205acc..9070eb3deffb5 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -12,8 +12,8 @@ MultiIndex, date_range, ) +import pandas._testing as tm from pandas.core.indexes.base import InvalidIndexError -import pandas.util.testing as tm def test_slice_locs_partial(idx): @@ -396,7 +396,8 @@ def test_get_loc_missing_nan(): idx.get_loc(3) with pytest.raises(KeyError, match=r"^nan$"): idx.get_loc(np.nan) - with pytest.raises(KeyError, 
match=r"^\[nan\]$"): + with pytest.raises(TypeError, match=r"'\[nan\]' is an invalid key"): + # listlike/non-hashable raises TypeError idx.get_loc([np.nan]) @@ -437,3 +438,91 @@ def test_timestamp_multiindex_indexer(): ) should_be = pd.Series(data=np.arange(24, len(qidx) + 24), index=qidx, name="foo") tm.assert_series_equal(result, should_be) + + +def test_get_loc_with_values_including_missing_values(): + # issue 19132 + idx = MultiIndex.from_product([[np.nan, 1]] * 2) + expected = slice(0, 2, None) + assert idx.get_loc(np.nan) == expected + + idx = MultiIndex.from_arrays([[np.nan, 1, 2, np.nan]]) + expected = np.array([True, False, False, True]) + tm.assert_numpy_array_equal(idx.get_loc(np.nan), expected) + + idx = MultiIndex.from_product([[np.nan, 1]] * 3) + expected = slice(2, 4, None) + assert idx.get_loc((np.nan, 1)) == expected + + +@pytest.mark.parametrize( + "index_arr,labels,expected", + [ + ( + [[1, np.nan, 2], [3, 4, 5]], + [1, np.nan, 2], + np.array([-1, -1, -1], dtype=np.intp), + ), + ([[1, np.nan, 2], [3, 4, 5]], [(np.nan, 4)], np.array([1], dtype=np.intp)), + ([[1, 2, 3], [np.nan, 4, 5]], [(1, np.nan)], np.array([0], dtype=np.intp)), + ( + [[1, 2, 3], [np.nan, 4, 5]], + [np.nan, 4, 5], + np.array([-1, -1, -1], dtype=np.intp), + ), + ], +) +def test_get_indexer_with_missing_value(index_arr, labels, expected): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.get_indexer(labels) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "index_arr,expected,target,algo", + [ + ([[np.nan, "a", "b"], ["c", "d", "e"]], 0, np.nan, "left"), + ([[np.nan, "a", "b"], ["c", "d", "e"]], 1, (np.nan, "c"), "right"), + ([["a", "b", "c"], ["d", np.nan, "d"]], 1, ("b", np.nan), "left"), + ], +) +def test_get_slice_bound_with_missing_value(index_arr, expected, target, algo): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.get_slice_bound(target, side=algo, kind="loc") + assert result == expected + + +@pytest.mark.parametrize( + "index_arr,expected,start_idx,end_idx", + [ + ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 2, None), np.nan, 1), + ([[np.nan, 1, 2], [3, 4, 5]], slice(0, 3, None), np.nan, (2, 5)), + ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), 3), + ([[1, 2, 3], [4, np.nan, 5]], slice(1, 3, None), (2, np.nan), (3, 5)), + ], +) +def test_slice_indexer_with_missing_value(index_arr, expected, start_idx, end_idx): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.slice_indexer(start=start_idx, end=end_idx) + assert result == expected + + +@pytest.mark.parametrize( + "index_arr,expected,start_idx,end_idx", + [ + ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, None), + ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, "b"), + ([[np.nan, "a", "b"], ["c", "d", "e"]], (0, 3), np.nan, ("b", "e")), + ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), None), + ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), "c"), + ([["a", "b", "c"], ["d", np.nan, "e"]], (1, 3), ("b", np.nan), ("c", "e")), + ], +) +def test_slice_locs_with_missing_value(index_arr, expected, start_idx, end_idx): + # issue 19132 + idx = MultiIndex.from_arrays(index_arr) + result = idx.slice_locs(start=start_idx, end=end_idx) + assert result == expected diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py index a8711533e806c..f2ec15e0af88c 100644 --- a/pandas/tests/indexes/multi/test_integrity.py +++ 
b/pandas/tests/indexes/multi/test_integrity.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import IntervalIndex, MultiIndex, RangeIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_labels_dtypes(): diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index 42d8cf761842e..062fb92c44552 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( @@ -87,3 +87,19 @@ def test_join_self_unique(idx, join_type): if idx.is_unique: joined = idx.join(idx, how=join_type) assert (idx == joined).all() + + +def test_join_multi_wrong_order(): + # GH 25760 + # GH 28956 + + midx1 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"]) + midx2 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["b", "a"]) + + join_idx, lidx, ridx = midx1.join(midx2, return_indexers=False) + + exp_ridx = np.array([-1, -1, -1, -1], dtype=np.intp) + + tm.assert_index_equal(midx1, join_idx) + assert lidx is None + tm.assert_numpy_array_equal(ridx, exp_ridx) diff --git a/pandas/tests/indexes/multi/test_missing.py b/pandas/tests/indexes/multi/test_missing.py index 31de40512c474..a17e1e9928bff 100644 --- a/pandas/tests/indexes/multi/test_missing.py +++ b/pandas/tests/indexes/multi/test_missing.py @@ -5,8 +5,8 @@ import pandas as pd from pandas import Int64Index, MultiIndex, PeriodIndex, UInt64Index +import pandas._testing as tm from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin -import pandas.util.testing as tm def test_fillna(idx): @@ -101,7 +101,7 @@ def test_nulls(idx): idx.isna() -@pytest.mark.xfail +@pytest.mark.xfail(reason="isna is not defined for MultiIndex") def test_hasnans_isnans(idx): # GH 11343, added tests for hasnans / isnans index = idx.copy() diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 5c3a48c9dd481..479b5ef0211a0 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -2,7 +2,7 @@ import pandas as pd from pandas import MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def check_level_names(index, names): @@ -124,3 +124,20 @@ def test_get_names_from_levels(): assert idx.levels[0].name == "a" assert idx.levels[1].name == "b" + + +def test_setting_names_from_levels_raises(): + idx = pd.MultiIndex.from_product([["a"], [1, 2]], names=["a", "b"]) + with pytest.raises(RuntimeError, match="set_names"): + idx.levels[0].name = "foo" + + with pytest.raises(RuntimeError, match="set_names"): + idx.levels[1].name = "foo" + + new = pd.Series(1, index=idx.levels[0]) + with pytest.raises(RuntimeError, match="set_names"): + new.index.name = "bar" + + assert pd.Index._no_setting_name is False + assert pd.Int64Index._no_setting_name is False + assert pd.RangeIndex._no_setting_name is False diff --git a/pandas/tests/indexes/multi/test_partial_indexing.py b/pandas/tests/indexes/multi/test_partial_indexing.py index 5db1296d828ca..b00018d2ceb69 100644 --- a/pandas/tests/indexes/multi/test_partial_indexing.py +++ b/pandas/tests/indexes/multi/test_partial_indexing.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm def test_partial_string_timestamp_multiindex(): diff --git 
a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py index 513efa8941de8..ceb14aa82a76c 100644 --- a/pandas/tests/indexes/multi/test_reindex.py +++ b/pandas/tests/indexes/multi/test_reindex.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_reindex(idx): diff --git a/pandas/tests/indexes/multi/test_reshape.py b/pandas/tests/indexes/multi/test_reshape.py index 37df420e9ea2e..2e39c714ca7af 100644 --- a/pandas/tests/indexes/multi/test_reshape.py +++ b/pandas/tests/indexes/multi/test_reshape.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_insert(idx): diff --git a/pandas/tests/indexes/multi/test_set_ops.py b/pandas/tests/indexes/multi/test_setops.py similarity index 99% rename from pandas/tests/indexes/multi/test_set_ops.py rename to pandas/tests/indexes/multi/test_setops.py index 835784054261e..841e3b3f17b38 100644 --- a/pandas/tests/indexes/multi/test_set_ops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("case", [0.5, "xxx"]) diff --git a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index 3dee1dbecf3ba..277bd79cfe953 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import CategoricalIndex, DataFrame, Index, MultiIndex, RangeIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_sortlevel(idx): @@ -120,7 +120,7 @@ def test_unsortedindex(): def test_unsortedindex_doc_examples(): - # http://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex # noqa + # https://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex # noqa dfm = DataFrame( {"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)} ) diff --git a/pandas/tests/indexes/period/test_asfreq.py b/pandas/tests/indexes/period/test_asfreq.py index fd6013ab5ae08..88e800d66f3ad 100644 --- a/pandas/tests/indexes/period/test_asfreq.py +++ b/pandas/tests/indexes/period/test_asfreq.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, PeriodIndex, Series, period_range -import pandas.util.testing as tm +import pandas._testing as tm class TestPeriodIndex: diff --git a/pandas/tests/indexes/period/test_astype.py b/pandas/tests/indexes/period/test_astype.py index fa57ec2b1f7ca..ec386dd9dd11c 100644 --- a/pandas/tests/indexes/period/test_astype.py +++ b/pandas/tests/indexes/period/test_astype.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Index, Int64Index, NaT, Period, PeriodIndex, period_range -import pandas.util.testing as tm +import pandas._testing as tm class TestPeriodIndexAsType: diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 2adce0b7f8b44..dcd3c8e946e9a 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -7,14 +7,11 @@ import pandas as pd from pandas import Index, Period, PeriodIndex, Series, date_range, offsets, period_range -import pandas.core.indexes.period as period -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays 
import PeriodArray class TestPeriodIndex: - def setup_method(self, method): - pass - def test_construction_base_constructor(self): # GH 13664 arr = [pd.Period("2011-01", freq="M"), pd.NaT, pd.Period("2011-03", freq="M")] @@ -32,6 +29,30 @@ def test_construction_base_constructor(self): pd.Index(np.array(arr)), pd.Index(np.array(arr), dtype=object) ) + def test_base_constructor_with_period_dtype(self): + dtype = PeriodDtype("D") + values = ["2011-01-01", "2012-03-04", "2014-05-01"] + result = pd.Index(values, dtype=dtype) + + expected = pd.PeriodIndex(values, dtype=dtype) + tm.assert_index_equal(result, expected) + + @pytest.mark.parametrize( + "values_constructor", [list, np.array, PeriodIndex, PeriodArray._from_sequence] + ) + def test_index_object_dtype(self, values_constructor): + # Index(periods, dtype=object) is an Index (not an PeriodIndex) + periods = [ + pd.Period("2011-01", freq="M"), + pd.NaT, + pd.Period("2011-03", freq="M"), + ] + values = values_constructor(periods) + result = Index(values, dtype=object) + + assert type(result) is Index + tm.assert_numpy_array_equal(result.values, np.array(values)) + def test_constructor_use_start_freq(self): # GH #1118 p = Period("4/2/2012", freq="B") @@ -201,7 +222,7 @@ def test_constructor_dtype(self): assert res.dtype == "period[M]" msg = "specified freq and dtype are different" - with pytest.raises(period.IncompatibleFrequency, match=msg): + with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex(["2011-01"], freq="M", dtype="period[D]") def test_constructor_empty(self): @@ -261,12 +282,12 @@ def test_constructor_pi_nat(self): def test_constructor_incompat_freq(self): msg = "Input has different freq=D from PeriodIndex\\(freq=M\\)" - with pytest.raises(period.IncompatibleFrequency, match=msg): + with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex( [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="D")] ) - with pytest.raises(period.IncompatibleFrequency, match=msg): + with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex( np.array( [Period("2011-01", freq="M"), pd.NaT, Period("2011-01", freq="D")] @@ -274,12 +295,12 @@ def test_constructor_incompat_freq(self): ) # first element is pd.NaT - with pytest.raises(period.IncompatibleFrequency, match=msg): + with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex( [pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")] ) - with pytest.raises(period.IncompatibleFrequency, match=msg): + with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex( np.array( [pd.NaT, Period("2011-01", freq="M"), Period("2011-01", freq="D")] @@ -301,22 +322,33 @@ def test_constructor_mixed(self): def test_constructor_simple_new(self): idx = period_range("2007-01", name="p", periods=2, freq="M") - result = idx._simple_new(idx, name="p", freq=idx.freq) + + with pytest.raises(AssertionError, match=""): + idx._simple_new(idx, name="p", freq=idx.freq) + + result = idx._simple_new(idx._data, name="p", freq=idx.freq) tm.assert_index_equal(result, idx) - result = idx._simple_new(idx.astype("i8"), name="p", freq=idx.freq) + with pytest.raises(AssertionError): + # Need ndarray, not Int64Index + type(idx._data)._simple_new(idx.astype("i8"), freq=idx.freq) + + arr = type(idx._data)._simple_new(idx.asi8, freq=idx.freq) + result = idx._simple_new(arr, name="p") tm.assert_index_equal(result, idx) def test_constructor_simple_new_empty(self): # GH13079 idx = PeriodIndex([], freq="M", name="p") - result = idx._simple_new(idx, name="p", freq="M") + with 
pytest.raises(AssertionError, match=""): + idx._simple_new(idx, name="p", freq="M") + + result = idx._simple_new(idx._data, name="p", freq="M") tm.assert_index_equal(result, idx) @pytest.mark.parametrize("floats", [[1.1, 2.1], np.array([1.1, 2.1])]) def test_constructor_floats(self, floats): - msg = r"PeriodIndex\._simple_new does not accept floats" - with pytest.raises(TypeError, match=msg): + with pytest.raises(AssertionError, match=" np.ndarray: return self.array expected = pd.Index(array) @@ -1911,7 +1915,12 @@ def test_get_value(self, index): values = np.random.randn(100) value = index[67] - tm.assert_almost_equal(index.get_value(values, value), values[67]) + with pytest.raises(AttributeError, match="has no attribute '_values'"): + # Index.get_value requires a Series, not an ndarray + index.get_value(values, value) + + result = index.get_value(Series(values, index=values), value) + tm.assert_almost_equal(result, values[67]) @pytest.mark.parametrize("values", [["foo", "bar", "quux"], {"foo", "bar", "quux"}]) @pytest.mark.parametrize( @@ -2396,13 +2405,14 @@ def test_cached_properties_not_settable(self): with pytest.raises(AttributeError, match="Can't set attribute"): index.is_unique = False - def test_tab_complete_warning(self, ip): + @async_mark() + async def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; idx = pd.Index([1, 2])" - ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("idx.", 4)) @@ -2781,9 +2791,35 @@ def test_shape_of_invalid_index(): # about this). 
However, as long as this is not solved in general,this test ensures # that the returned shape is consistent with this underlying array for # compat with matplotlib (see https://github.com/pandas-dev/pandas/issues/27775) - a = np.arange(8).reshape(2, 2, 2) - idx = pd.Index(a) - assert idx.shape == a.shape - idx = pd.Index([0, 1, 2, 3]) - assert idx[:, None].shape == (4, 1) + with tm.assert_produces_warning(DeprecationWarning): + # GH#30588 multi-dimensional indexing deprecated + assert idx[:, None].shape == (4, 1) + + +def test_validate_1d_input(): + # GH#27125 check that we do not have >1-dimensional input + msg = "Index data must be 1-dimensional" + + arr = np.arange(8).reshape(2, 2, 2) + with pytest.raises(ValueError, match=msg): + pd.Index(arr) + + with pytest.raises(ValueError, match=msg): + pd.Float64Index(arr.astype(np.float64)) + + with pytest.raises(ValueError, match=msg): + pd.Int64Index(arr.astype(np.int64)) + + with pytest.raises(ValueError, match=msg): + pd.UInt64Index(arr.astype(np.uint64)) + + df = pd.DataFrame(arr.reshape(4, 2)) + with pytest.raises(ValueError, match=msg): + pd.Index(df) + + # GH#13601 trying to assign a multi-dimensional array to an index is not + # allowed + ser = pd.Series(0, range(4)) + with pytest.raises(ValueError, match=msg): + ser.index = np.array([[2, 3]] * 4) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 82ef71efa70d0..7e30233353553 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -14,7 +14,7 @@ import pandas as pd from pandas import CategoricalIndex, MultiIndex, RangeIndex -import pandas.util.testing as tm +import pandas._testing as tm class TestCommon: diff --git a/pandas/tests/indexes/test_engines.py b/pandas/tests/indexes/test_engines.py new file mode 100644 index 0000000000000..ee224c9c6ec89 --- /dev/null +++ b/pandas/tests/indexes/test_engines.py @@ -0,0 +1,57 @@ +import re + +import pytest + +import pandas as pd + + +class TestDatetimeEngine: + @pytest.mark.parametrize( + "scalar", + [ + pd.Timedelta(pd.Timestamp("2016-01-01").asm8.view("m8[ns]")), + pd.Timestamp("2016-01-01").value, + pd.Timestamp("2016-01-01").to_pydatetime(), + pd.Timestamp("2016-01-01").to_datetime64(), + ], + ) + def test_not_contains_requires_timestamp(self, scalar): + dti1 = pd.date_range("2016-01-01", periods=3) + dti2 = dti1.insert(1, pd.NaT) # non-monotonic + dti3 = dti1.insert(3, dti1[0]) # non-unique + dti4 = pd.date_range("2016-01-01", freq="ns", periods=2_000_000) + dti5 = dti4.insert(0, dti4[0]) # over size threshold, not unique + + msg = "|".join([re.escape(str(scalar)), re.escape(repr(scalar))]) + for dti in [dti1, dti2, dti3, dti4, dti5]: + with pytest.raises(TypeError, match=msg): + scalar in dti._engine + + with pytest.raises(KeyError, match=msg): + dti._engine.get_loc(scalar) + + +class TestTimedeltaEngine: + @pytest.mark.parametrize( + "scalar", + [ + pd.Timestamp(pd.Timedelta(days=42).asm8.view("datetime64[ns]")), + pd.Timedelta(days=42).value, + pd.Timedelta(days=42).to_pytimedelta(), + pd.Timedelta(days=42).to_timedelta64(), + ], + ) + def test_not_contains_requires_timestamp(self, scalar): + tdi1 = pd.timedelta_range("42 days", freq="9h", periods=1234) + tdi2 = tdi1.insert(1, pd.NaT) # non-monotonic + tdi3 = tdi1.insert(3, tdi1[0]) # non-unique + tdi4 = pd.timedelta_range("42 days", freq="ns", periods=2_000_000) + tdi5 = tdi4.insert(0, tdi4[0]) # over size threshold, not unique + + msg = "|".join([re.escape(str(scalar)), re.escape(repr(scalar))]) + 
for tdi in [tdi1, tdi2, tdi3, tdi4, tdi5]: + with pytest.raises(TypeError, match=msg): + scalar in tdi._engine + + with pytest.raises(KeyError, match=msg): + tdi._engine.get_loc(scalar) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 37976d89ecba4..12cc51222e6bb 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -8,9 +8,9 @@ import pandas as pd from pandas import Float64Index, Index, Int64Index, Series, UInt64Index +import pandas._testing as tm from pandas.api.types import pandas_dtype from pandas.tests.indexes.common import Base -import pandas.util.testing as tm class Numeric(Base): @@ -188,14 +188,14 @@ def test_constructor_invalid(self): # invalid msg = ( - r"Float64Index\(\.\.\.\) must be called with a collection of" - r" some kind, 0\.0 was passed" + r"Float64Index\(\.\.\.\) must be called with a collection of " + r"some kind, 0\.0 was passed" ) with pytest.raises(TypeError, match=msg): Float64Index(0.0) msg = ( - "String dtype not supported, you may need to explicitly cast to" - " a numeric type" + "String dtype not supported, " + "you may need to explicitly cast to a numeric type" ) with pytest.raises(TypeError, match=msg): Float64Index(["a", "b", 0.0]) @@ -389,7 +389,8 @@ def test_get_loc_missing_nan(self): idx.get_loc(3) with pytest.raises(KeyError, match="^nan$"): idx.get_loc(np.nan) - with pytest.raises(KeyError, match=r"^\[nan\]$"): + with pytest.raises(TypeError, match=r"'\[nan\]' is an invalid key"): + # listlike/non-hashable raises TypeError idx.get_loc([np.nan]) def test_contains_nans(self): @@ -570,8 +571,8 @@ def test_union_noncomparable(self): def test_cant_or_shouldnt_cast(self): msg = ( - "String dtype not supported, you may need to explicitly cast to" - " a numeric type" + "String dtype not supported, " + "you may need to explicitly cast to a numeric type" ) # can't data = ["foo", "bar", "baz"] @@ -655,8 +656,8 @@ def test_constructor(self): # scalar raise Exception msg = ( - r"Int64Index\(\.\.\.\) must be called with a collection of some" - " kind, 5 was passed" + r"Int64Index\(\.\.\.\) must be called with a collection of some " + "kind, 5 was passed" ) with pytest.raises(TypeError, match=msg): Int64Index(5) @@ -736,6 +737,12 @@ def test_get_indexer(self): expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) + def test_get_indexer_nan(self): + # GH 7820 + result = Index([1, 2, np.nan]).get_indexer([np.nan]) + expected = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + def test_intersection(self): index = self.create_index() other = Index([1, 2, 3, 4, 5]) diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 3d24c70afdda2..583556656ac87 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -12,8 +12,8 @@ _np_version_under1p17, _np_version_under1p18, ) +import pandas._testing as tm from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin -import pandas.util.testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index a7e2363ec422e..abfa413d56655 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -11,9 +11,9 @@ import pandas as pd from pandas import Float64Index, Int64Index, RangeIndex, UInt64Index +import pandas._testing as tm from pandas.api.types import 
pandas_dtype from pandas.tests.indexes.conftest import indices_dict -import pandas.util.testing as tm COMPATIBLE_INCONSISTENT_PAIRS = { (Int64Index, RangeIndex): (tm.makeIntIndex, tm.makeRangeIndex), diff --git a/pandas/tests/indexes/timedeltas/test_astype.py b/pandas/tests/indexes/timedeltas/test_astype.py index e479d93af2902..82c9d995c9c7c 100644 --- a/pandas/tests/indexes/timedeltas/test_astype.py +++ b/pandas/tests/indexes/timedeltas/test_astype.py @@ -13,7 +13,7 @@ TimedeltaIndex, timedelta_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestTimedeltaIndex: diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py index ff6ee051755bb..39abbf59d1e56 100644 --- a/pandas/tests/indexes/timedeltas/test_constructors.py +++ b/pandas/tests/indexes/timedeltas/test_constructors.py @@ -5,8 +5,8 @@ import pandas as pd from pandas import Timedelta, TimedeltaIndex, timedelta_range, to_timedelta +import pandas._testing as tm from pandas.core.arrays import TimedeltaArray -import pandas.util.testing as tm class TestTimedeltaIndex: @@ -176,15 +176,15 @@ def test_constructor_coverage(self): # non-conforming freq msg = ( - "Inferred frequency None from passed values does not conform to" - " passed frequency D" + "Inferred frequency None from passed values does not conform to " + "passed frequency D" ) with pytest.raises(ValueError, match=msg): TimedeltaIndex(["1 days", "2 days", "4 days"], freq="D") msg = ( - "Of the four parameters: start, end, periods, and freq, exactly" - " three must be specified" + "Of the four parameters: start, end, periods, and freq, exactly " + "three must be specified" ) with pytest.raises(ValueError, match=msg): timedelta_range(periods=10, freq="D") diff --git a/pandas/tests/indexes/timedeltas/test_indexing.py b/pandas/tests/indexes/timedeltas/test_indexing.py index 17ab85033acfb..e8665ee1a3555 100644 --- a/pandas/tests/indexes/timedeltas/test_indexing.py +++ b/pandas/tests/indexes/timedeltas/test_indexing.py @@ -4,8 +4,8 @@ import pytest import pandas as pd -from pandas import Index, Timedelta, TimedeltaIndex, timedelta_range -import pandas.util.testing as tm +from pandas import Index, Timedelta, TimedeltaIndex, notna, timedelta_range +import pandas._testing as tm class TestGetItem: @@ -58,8 +58,20 @@ def test_timestamp_invalid_key(self, key): class TestWhere: - # placeholder for symmetry with DatetimeIndex and PeriodIndex tests - pass + def test_where_invalid_dtypes(self): + tdi = timedelta_range("1 day", periods=3, freq="D", name="idx") + + i2 = tdi.copy() + i2 = Index([pd.NaT, pd.NaT] + tdi[2:].tolist()) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + tdi.where(notna(i2), i2.asi8) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + tdi.where(notna(i2), i2 + pd.Timestamp.now()) + + with pytest.raises(TypeError, match="Where requires matching dtype"): + tdi.where(notna(i2), (i2 + pd.Timestamp.now()).to_period("D")) class TestTake: @@ -161,6 +173,15 @@ def test_take_fill_value(self): class TestTimedeltaIndex: + def test_insert_empty(self): + # Corner case inserting with length zero doesnt raise IndexError + idx = timedelta_range("1 Day", periods=3) + td = idx[0] + + idx[:0].insert(0, td) + idx[:0].insert(1, td) + idx[:0].insert(-1, td) + def test_insert(self): idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") @@ -219,11 +240,29 @@ def test_insert(self): assert result.name == expected.name assert result.freq == expected.freq 
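# Illustrative sketch of the insert behavior covered by the TimedeltaIndex
# tests that follow (assumes a pandas build with these changes): any
# timedelta-compatible missing value is inserted as NaT, while a
# datetime64("NaT") is rejected as an incompatible label.
import numpy as np
import pandas as pd

tdi = pd.timedelta_range("1 day", "3 day")
print(tdi.insert(1, np.timedelta64("NaT")))  # NaT appears in position 1

try:
    tdi.insert(0, np.datetime64("NaT"))      # datetime-flavored NaT
except TypeError as err:
    print(err)                               # "... incompatible label ..."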
+ @pytest.mark.parametrize( + "null", [None, np.nan, np.timedelta64("NaT"), pd.NaT, pd.NA] + ) + def test_insert_nat(self, null): # GH 18295 (test missing) + idx = timedelta_range("1day", "3day") + result = idx.insert(1, null) expected = TimedeltaIndex(["1day", pd.NaT, "2day", "3day"]) - for na in (np.nan, pd.NaT, None): - result = timedelta_range("1day", "3day").insert(1, na) - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) + + def test_insert_invalid_na(self): + idx = TimedeltaIndex(["4day", "1day", "2day"], name="idx") + with pytest.raises(TypeError, match="incompatible label"): + idx.insert(0, np.datetime64("NaT")) + + def test_insert_dont_cast_strings(self): + # To match DatetimeIndex and PeriodIndex behavior, dont try to + # parse strings to Timedelta + idx = timedelta_range("1day", "3day") + + result = idx.insert(0, "1 Day") + assert result.dtype == object + assert result[0] == "1 Day" def test_delete(self): idx = timedelta_range(start="1 Days", periods=5, freq="D", name="idx") diff --git a/pandas/tests/indexes/timedeltas/test_ops.py b/pandas/tests/indexes/timedeltas/test_ops.py index 56043cf3edb2d..25f27da758ad8 100644 --- a/pandas/tests/indexes/timedeltas/test_ops.py +++ b/pandas/tests/indexes/timedeltas/test_ops.py @@ -7,8 +7,8 @@ import pandas as pd from pandas import Series, TimedeltaIndex, timedelta_range +import pandas._testing as tm from pandas.tests.base.test_ops import Ops -import pandas.util.testing as tm from pandas.tseries.offsets import Day, Hour diff --git a/pandas/tests/indexes/timedeltas/test_partial_slicing.py b/pandas/tests/indexes/timedeltas/test_partial_slicing.py index 4448b5e39684b..29e2c7dd20be0 100644 --- a/pandas/tests/indexes/timedeltas/test_partial_slicing.py +++ b/pandas/tests/indexes/timedeltas/test_partial_slicing.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Series, Timedelta, timedelta_range -import pandas.util.testing as tm +import pandas._testing as tm class TestSlicing: diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index 38f1d2c7d4a1b..44f4a2adedaad 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import Index, Series, Timedelta, TimedeltaIndex, timedelta_range -import pandas.util.testing as tm +import pandas._testing as tm class TestVectorizedTimedelta: diff --git a/pandas/tests/indexes/timedeltas/test_setops.py b/pandas/tests/indexes/timedeltas/test_setops.py index bbdd6c8c7c017..0aa784cbb7710 100644 --- a/pandas/tests/indexes/timedeltas/test_setops.py +++ b/pandas/tests/indexes/timedeltas/test_setops.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Int64Index, TimedeltaIndex, timedelta_range -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import Hour @@ -22,6 +22,22 @@ def test_union(self): i1.union(i2) # Works i2.union(i1) # Fails with "AttributeError: can't set attribute" + def test_union_sort_false(self): + tdi = timedelta_range("1day", periods=5) + + left = tdi[3:] + right = tdi[:3] + + # Check that we are testing the desired code path + assert left._can_fast_union(right) + + result = left.union(right) + tm.assert_index_equal(result, tdi) + + result = left.union(right, sort=False) + expected = pd.TimedeltaIndex(["4 Days", "5 Days", "1 Days", "2 Day", "3 Days"]) + tm.assert_index_equal(result, expected) + def test_union_coverage(self): idx = 
TimedeltaIndex(["3d", "1d", "2d"]) @@ -62,6 +78,21 @@ def test_union_bug_4564(self): exp = TimedeltaIndex(sorted(set(left) | set(right))) tm.assert_index_equal(result, exp) + def test_union_freq_infer(self): + # When taking the union of two TimedeltaIndexes, we infer + # a freq even if the arguments don't have freq. This matches + # DatetimeIndex behavior. + tdi = pd.timedelta_range("1 Day", periods=5) + left = tdi[[0, 1, 3, 4]] + right = tdi[[2, 3, 1]] + + assert left.freq is None + assert right.freq is None + + result = left.union(right) + tm.assert_index_equal(result, tdi) + assert result.freq == "D" + def test_intersection_bug_1708(self): index_1 = timedelta_range("1 day", periods=4, freq="h") index_2 = index_1 + pd.offsets.Hour(5) @@ -179,3 +210,51 @@ def test_intersection_non_monotonic(self, rng, expected, sort): assert isinstance(result.freq, Hour) else: assert result.freq is None + + +class TestTimedeltaIndexDifference: + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_freq(self, sort): + # GH14323: Difference of TimedeltaIndex should not preserve frequency + + index = timedelta_range("0 days", "5 days", freq="D") + + other = timedelta_range("1 days", "4 days", freq="D") + expected = TimedeltaIndex(["0 days", "5 days"], freq=None) + idx_diff = index.difference(other, sort) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + other = timedelta_range("2 days", "5 days", freq="D") + idx_diff = index.difference(other, sort) + expected = TimedeltaIndex(["0 days", "1 days"], freq=None) + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + @pytest.mark.parametrize("sort", [None, False]) + def test_difference_sort(self, sort): + + index = pd.TimedeltaIndex( + ["5 days", "3 days", "2 days", "4 days", "1 days", "0 days"] + ) + + other = timedelta_range("1 days", "4 days", freq="D") + idx_diff = index.difference(other, sort) + + expected = TimedeltaIndex(["5 days", "0 days"], freq=None) + + if sort is None: + expected = expected.sort_values() + + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) + + other = timedelta_range("2 days", "5 days", freq="D") + idx_diff = index.difference(other, sort) + expected = TimedeltaIndex(["1 days", "0 days"], freq=None) + + if sort is None: + expected = expected.sort_values() + + tm.assert_index_equal(idx_diff, expected) + tm.assert_attr_equal("freq", idx_diff, expected) diff --git a/pandas/tests/indexes/timedeltas/test_shift.py b/pandas/tests/indexes/timedeltas/test_shift.py index 048b29c0da501..98933ff0423ab 100644 --- a/pandas/tests/indexes/timedeltas/test_shift.py +++ b/pandas/tests/indexes/timedeltas/test_shift.py @@ -4,7 +4,7 @@ import pandas as pd from pandas import TimedeltaIndex -import pandas.util.testing as tm +import pandas._testing as tm class TestTimedeltaIndexShift: diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index 35575f3349f83..3b52b93fa6369 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -14,7 +14,7 @@ date_range, timedelta_range, ) -import pandas.util.testing as tm +import pandas._testing as tm from ..datetimelike import DatetimeLike @@ -57,52 +57,6 @@ def test_fillna_timedelta(self): ) tm.assert_index_equal(idx.fillna("x"), exp) - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_freq(self, sort): - # GH14323: Difference of 
TimedeltaIndex should not preserve frequency - - index = timedelta_range("0 days", "5 days", freq="D") - - other = timedelta_range("1 days", "4 days", freq="D") - expected = TimedeltaIndex(["0 days", "5 days"], freq=None) - idx_diff = index.difference(other, sort) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - other = timedelta_range("2 days", "5 days", freq="D") - idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["0 days", "1 days"], freq=None) - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - @pytest.mark.parametrize("sort", [None, False]) - def test_difference_sort(self, sort): - - index = pd.TimedeltaIndex( - ["5 days", "3 days", "2 days", "4 days", "1 days", "0 days"] - ) - - other = timedelta_range("1 days", "4 days", freq="D") - idx_diff = index.difference(other, sort) - - expected = TimedeltaIndex(["5 days", "0 days"], freq=None) - - if sort is None: - expected = expected.sort_values() - - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - - other = timedelta_range("2 days", "5 days", freq="D") - idx_diff = index.difference(other, sort) - expected = TimedeltaIndex(["1 days", "0 days"], freq=None) - - if sort is None: - expected = expected.sort_values() - - tm.assert_index_equal(idx_diff, expected) - tm.assert_attr_equal("freq", idx_diff, expected) - def test_isin(self): index = tm.makeTimedeltaIndex(4) @@ -247,6 +201,13 @@ def test_append_numpy_bug_1681(self): result = a.append(c) assert (result["B"] == td).all() + def test_delete_doesnt_infer_freq(self): + # GH#30655 behavior matches DatetimeIndex + + tdi = pd.TimedeltaIndex(["1 Day", "2 Days", None, "3 Days", "4 Days"]) + result = tdi.delete(2) + assert result.freq is None + def test_fields(self): rng = timedelta_range("1 days, 10:11:12.100123456", periods=2, freq="s") tm.assert_index_equal(rng.days, Index([1, 1], dtype="int64")) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 1c1d0f1a735cf..1cef9de6a3a77 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import timedelta_range, to_timedelta -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import Day, Second diff --git a/pandas/tests/indexes/timedeltas/test_tools.py b/pandas/tests/indexes/timedeltas/test_tools.py index 5bd7a2a583b84..477fc092a4e16 100644 --- a/pandas/tests/indexes/timedeltas/test_tools.py +++ b/pandas/tests/indexes/timedeltas/test_tools.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Series, TimedeltaIndex, isna, to_timedelta -import pandas.util.testing as tm +import pandas._testing as tm class TestTimedeltas: @@ -73,8 +73,7 @@ def test_to_timedelta_invalid(self): # time not supported ATM msg = ( - "Value must be Timedelta, string, integer, float, timedelta or" - " convertible" + "Value must be Timedelta, string, integer, float, timedelta or convertible" ) with pytest.raises(ValueError, match=msg): to_timedelta(time(second=1)) diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py index 08e8dbad4e102..3c027b035c2b8 100644 --- a/pandas/tests/indexing/common.py +++ b/pandas/tests/indexing/common.py @@ -7,7 +7,7 @@ from pandas.core.dtypes.common import is_scalar from pandas import DataFrame, Float64Index, MultiIndex, 
Series, UInt64Index, date_range -import pandas.util.testing as tm +import pandas._testing as tm def _mklbl(prefix, n): diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index bbce786fc07ba..634020982b1c2 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, IntervalIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestIntervalIndex: @@ -64,7 +64,7 @@ def test_non_matching(self): s = self.s # this is a departure from our current - # indexin scheme, but simpler + # indexing scheme, but simpler with pytest.raises(KeyError, match="^$"): s.loc[[-1, 3, 4, 5]] diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index a86a9d16d3f9f..43036fbbd9844 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -4,7 +4,7 @@ import pytest from pandas import Interval, IntervalIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestIntervalIndex: diff --git a/pandas/tests/indexing/multiindex/conftest.py b/pandas/tests/indexing/multiindex/conftest.py index e58e6ed0d5d83..e6d5a9eb84410 100644 --- a/pandas/tests/indexing/multiindex/conftest.py +++ b/pandas/tests/indexing/multiindex/conftest.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index e0206c8e7f6aa..8bfba8c12e934 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -2,13 +2,13 @@ import pytest from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm def test_detect_chained_assignment(): # Inplace ops, originally from: - # http://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug + # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug a = [12, 23] b = [123, None] c = [1234, 2345] diff --git a/pandas/tests/indexing/multiindex/test_getitem.py b/pandas/tests/indexing/multiindex/test_getitem.py index 519a1eb5b16d8..8ea825da8f94f 100644 --- a/pandas/tests/indexing/multiindex/test_getitem.py +++ b/pandas/tests/indexing/multiindex/test_getitem.py @@ -2,8 +2,8 @@ import pytest from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.indexing import IndexingError -import pandas.util.testing as tm # ---------------------------------------------------------------------------- # test indexing of Series with multi-level Index diff --git a/pandas/tests/indexing/multiindex/test_iloc.py b/pandas/tests/indexing/multiindex/test_iloc.py index 2c2e4d06f1ae3..9859c7235c380 100644 --- a/pandas/tests/indexing/multiindex/test_iloc.py +++ b/pandas/tests/indexing/multiindex/test_iloc.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py 
b/pandas/tests/indexing/multiindex/test_indexing_slow.py index aab44daf8d17f..8ea1cebd7bf7b 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.slow diff --git a/pandas/tests/indexing/multiindex/test_ix.py b/pandas/tests/indexing/multiindex/test_ix.py index 35f3137dac059..01b0b392d52a3 100644 --- a/pandas/tests/indexing/multiindex/test_ix.py +++ b/pandas/tests/indexing/multiindex/test_ix.py @@ -4,7 +4,7 @@ from pandas.errors import PerformanceWarning from pandas import DataFrame, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndex: diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index b6b9f7f205394..3b8aa963ac698 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -3,8 +3,8 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.indexing import IndexingError -import pandas.util.testing as tm @pytest.fixture @@ -411,3 +411,60 @@ def test_loc_setitem_single_column_slice(): df.loc[:, "B"] = np.arange(4) expected.iloc[:, 2] = np.arange(4) tm.assert_frame_equal(df, expected) + + +def test_loc_nan_multiindex(): + # GH 5286 + tups = [ + ("Good Things", "C", np.nan), + ("Good Things", "R", np.nan), + ("Bad Things", "C", np.nan), + ("Bad Things", "T", np.nan), + ("Okay Things", "N", "B"), + ("Okay Things", "N", "D"), + ("Okay Things", "B", np.nan), + ("Okay Things", "D", np.nan), + ] + df = DataFrame( + np.ones((8, 4)), + columns=Index(["d1", "d2", "d3", "d4"]), + index=MultiIndex.from_tuples(tups, names=["u1", "u2", "u3"]), + ) + result = df.loc["Good Things"].loc["C"] + expected = DataFrame( + np.ones((1, 4)), + index=Index([np.nan], dtype="object", name="u3"), + columns=Index(["d1", "d2", "d3", "d4"], dtype="object"), + ) + tm.assert_frame_equal(result, expected) + + +def test_loc_period_string_indexing(): + # GH 9892 + a = pd.period_range("2013Q1", "2013Q4", freq="Q") + i = (1111, 2222, 3333) + idx = pd.MultiIndex.from_product((a, i), names=("Periode", "CVR")) + df = pd.DataFrame( + index=idx, + columns=( + "OMS", + "OMK", + "RES", + "DRIFT_IND", + "OEVRIG_IND", + "FIN_IND", + "VARE_UD", + "LOEN_UD", + "FIN_UD", + ), + ) + result = df.loc[("2013Q1", 1111), "OMS"] + expected = pd.Series( + [np.nan], + dtype=object, + name="OMS", + index=pd.MultiIndex.from_tuples( + [(pd.Period("2013Q1"), 1111)], names=["Periode", "CVR"] + ), + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 8c6afef1234da..8163de8588232 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -1,12 +1,11 @@ import numpy as np -import pytest import pandas._libs.index as _index from pandas.errors import PerformanceWarning import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndexBasic: @@ -47,17 +46,6 @@ def test_multiindex_contains_dropped(self): assert "a" in idx.levels[0] assert "a" not in idx - @pytest.mark.parametrize( - "data, expected", - [ - (MultiIndex.from_product([(), 
()]), True), - (MultiIndex.from_product([(1, 2), (3, 4)]), True), - (MultiIndex.from_product([("a", "b"), (1, 2)]), False), - ], - ) - def test_multiindex_is_homogeneous_type(self, data, expected): - assert data._is_homogeneous_type is expected - def test_indexing_over_hashtable_size_cutoff(self): n = 10000 diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index 05ea949721b65..9d181bdcb9491 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndexPartial: diff --git a/pandas/tests/indexing/multiindex/test_set_ops.py b/pandas/tests/indexing/multiindex/test_set_ops.py index 66cb0d0d46380..f2cbfadb3cfa5 100644 --- a/pandas/tests/indexing/multiindex/test_set_ops.py +++ b/pandas/tests/indexing/multiindex/test_set_ops.py @@ -1,7 +1,7 @@ from numpy.random import randn from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndexSetOps: diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 7fc95ba62a888..aebd1ad2573ed 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -4,8 +4,8 @@ import pandas as pd from pandas import DataFrame, MultiIndex, Series, Timestamp, date_range, isna, notna +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm class TestMultiIndexSetItem: @@ -141,7 +141,7 @@ def test_multiindex_setitem(self): df.loc["bar"] *= 2 # from SO - # http://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation + # https://stackoverflow.com/questions/24572040/pandas-access-the-level-of-multiindex-for-inplace-operation df_orig = DataFrame.from_dict( { "price": { diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index ee0f160b33cf1..6fa9d3bd2cdbb 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -5,9 +5,9 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp +import pandas._testing as tm from pandas.core.indexing import _non_reducing_slice from pandas.tests.indexing.common import _mklbl -import pandas.util.testing as tm class TestMultiIndexSlicers: diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index 5b8300827609a..4bec0f429a34e 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -2,7 +2,7 @@ from numpy.random import randn from pandas import DataFrame, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestMultiIndexSorted: diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py index ffbe1bb785cda..db8c0c643a623 100644 --- a/pandas/tests/indexing/multiindex/test_xs.py +++ b/pandas/tests/indexing/multiindex/test_xs.py @@ -2,8 +2,8 @@ import pytest from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm @pytest.fixture diff --git 
a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index 81dedfdc74409..621417eb38d94 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestIndexingCallable: diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 40fd6575abf44..8c8dece53277e 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -16,8 +16,8 @@ Timestamp, conftest, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT -import pandas.util.testing as tm class TestCategoricalIndex: @@ -74,8 +74,8 @@ def test_loc_scalar(self): df.loc["d"] = 10 msg = ( - "cannot insert an item into a CategoricalIndex that is not" - " already an existing category" + "cannot insert an item into a CategoricalIndex that is not " + "already an existing category" ) with pytest.raises(TypeError, match=msg): df.loc["d", "A"] = 10 @@ -365,8 +365,9 @@ def test_loc_listlike(self): # not all labels in the categories with pytest.raises( KeyError, - match="'a list-indexer must only include values that are in the" - " categories'", + match=( + "'a list-indexer must only include values that are in the categories'" + ), ): self.df2.loc[["a", "d"]] diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 760bb655534b2..e845487ffca9a 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -3,8 +3,8 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range, option_context +import pandas._testing as tm import pandas.core.common as com -import pandas.util.testing as tm class TestCaching: @@ -273,7 +273,7 @@ def random_text(nobs=100): str(df) # from SO: - # http://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc + # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc df = DataFrame(np.arange(0, 9), columns=["count"]) df["group"] = "b" diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 256aaef8eb5a7..b904755b099d0 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -7,7 +7,7 @@ import pandas.compat as compat import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm ############################################################### # Index / Series common tests which may trigger dtype coercions @@ -432,13 +432,19 @@ def test_insert_index_datetimes(self, fill_val, exp_dtype): ) self._assert_insert_conversion(obj, fill_val, exp, exp_dtype) - msg = "Passed item and index have different timezone" if fill_val.tz: - with pytest.raises(ValueError, match=msg): + msg = "Cannot compare tz-naive and tz-aware" + with pytest.raises(TypeError, match=msg): obj.insert(1, pd.Timestamp("2012-01-01")) - with pytest.raises(ValueError, match=msg): - obj.insert(1, pd.Timestamp("2012-01-01", tz="Asia/Tokyo")) + msg = "Timezones don't match" + with pytest.raises(ValueError, match=msg): + obj.insert(1, pd.Timestamp("2012-01-01", tz="Asia/Tokyo")) + + else: + msg = "Cannot compare tz-naive and tz-aware" + with pytest.raises(TypeError, match=msg): + obj.insert(1, pd.Timestamp("2012-01-01", 
tz="Asia/Tokyo")) msg = "cannot insert DatetimeIndex with incompatible label" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/indexing/test_datetime.py b/pandas/tests/indexing/test_datetime.py index f2e3f7f6b3723..42f992339f036 100644 --- a/pandas/tests/indexing/test_datetime.py +++ b/pandas/tests/indexing/test_datetime.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestDatetimeIndex: diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 52d0e30f0bcad..5530896a90941 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, Float64Index, Index, Int64Index, RangeIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestFloatIndexers: @@ -90,11 +90,11 @@ def test_scalar_non_numeric(self): else: error = TypeError msg = ( - r"cannot do (label|index|positional) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}|" - "Cannot index by location index with a" - " non-integer key".format(klass=type(i), kind=str(float)) + r"cannot do (label|index|positional) indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}|" + "Cannot index by location index with a " + "non-integer key".format(klass=type(i), kind=str(float)) ) with pytest.raises(error, match=msg): idxr(s)[3.0] @@ -111,9 +111,9 @@ def test_scalar_non_numeric(self): else: error = TypeError msg = ( - r"cannot do (label|index) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}".format(klass=type(i), kind=str(float)) + r"cannot do (label|index) indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}".format(klass=type(i), kind=str(float)) ) with pytest.raises(error, match=msg): s.loc[3.0] @@ -123,9 +123,9 @@ def test_scalar_non_numeric(self): # setting with a float fails with iloc msg = ( - r"cannot do (label|index|positional) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}".format(klass=type(i), kind=str(float)) + r"cannot do (label|index|positional) indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}".format(klass=type(i), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s.iloc[3.0] = 0 @@ -160,9 +160,9 @@ def test_scalar_non_numeric(self): s = Series(np.arange(len(i)), index=i) s[3] msg = ( - r"cannot do (label|index) indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}".format(klass=type(i), kind=str(float)) + r"cannot do (label|index) indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}".format(klass=type(i), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[3.0] @@ -177,9 +177,9 @@ def test_scalar_with_mixed(self): for idxr in [lambda x: x, lambda x: x.iloc]: msg = ( - r"cannot do label indexing" - r" on {klass} with these indexers \[1\.0\] of" - r" {kind}|" + r"cannot do label indexing " + r"on {klass} with these indexers \[1\.0\] of " + r"{kind}|" "Cannot index by location index with a non-integer key".format( klass=str(Index), kind=str(float) ) @@ -199,9 +199,9 @@ def test_scalar_with_mixed(self): for idxr in [lambda x: x]: msg = ( - r"cannot do label indexing" - r" on {klass} with these indexers \[1\.0\] of" - r" {kind}".format(klass=str(Index), kind=str(float)) + r"cannot do label indexing " + r"on {klass} with these indexers \[1\.0\] of " + 
r"{kind}".format(klass=str(Index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): idxr(s3)[1.0] @@ -313,9 +313,9 @@ def test_scalar_float(self): s.iloc[3.0] msg = ( - r"cannot do positional indexing" - r" on {klass} with these indexers \[3\.0\] of" - r" {kind}".format(klass=str(Float64Index), kind=str(float)) + r"cannot do positional indexing " + r"on {klass} with these indexers \[3\.0\] of " + r"{kind}".format(klass=str(Float64Index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s2.iloc[3.0] = 0 @@ -344,9 +344,9 @@ def test_slice_non_numeric(self): for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s.iloc[l] @@ -354,10 +354,10 @@ def test_slice_non_numeric(self): for idxr in [lambda x: x.loc, lambda x: x.iloc, lambda x: x]: msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers" - r" \[(3|4)(\.0)?\]" - r" of ({kind_float}|{kind_int})".format( + "cannot do slice indexing " + r"on {klass} with these indexers " + r"\[(3|4)(\.0)?\] " + r"of ({kind_float}|{kind_int})".format( klass=type(index), kind_float=str(float), kind_int=str(int), @@ -370,19 +370,19 @@ def test_slice_non_numeric(self): for l in [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]: msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s.iloc[l] = 0 for idxr in [lambda x: x.loc, lambda x: x.iloc, lambda x: x]: msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers" - r" \[(3|4)(\.0)?\]" - r" of ({kind_float}|{kind_int})".format( + "cannot do slice indexing " + r"on {klass} with these indexers " + r"\[(3|4)(\.0)?\] " + r"of ({kind_float}|{kind_int})".format( klass=type(index), kind_float=str(float), kind_int=str(int), @@ -424,9 +424,9 @@ def test_slice_integer(self): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] @@ -448,9 +448,9 @@ def test_slice_integer(self): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[-6\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[-6\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[slice(-6.0, 6.0)] @@ -474,9 +474,9 @@ def test_slice_integer(self): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(2|3)\.5\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(2|3)\.5\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] @@ -492,9 +492,9 @@ def test_slice_integer(self): # 
positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] = 0 @@ -515,9 +515,9 @@ def test_integer_positional_indexing(self): klass = RangeIndex msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(2|4)\.0\] of" - " {kind}".format(klass=str(klass), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(2|4)\.0\] of " + "{kind}".format(klass=str(klass), kind=str(float)) ) with pytest.raises(TypeError, match=msg): idxr(s)[l] @@ -540,9 +540,9 @@ def f(idxr): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(0|1)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(0|1)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] @@ -555,9 +555,9 @@ def f(idxr): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[-10\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[-10\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[slice(-10.0, 10.0)] @@ -574,9 +574,9 @@ def f(idxr): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[0\.5\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[0\.5\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] @@ -591,9 +591,9 @@ def f(idxr): # positional indexing msg = ( - "cannot do slice indexing" - r" on {klass} with these indexers \[(3|4)\.0\] of" - " {kind}".format(klass=type(index), kind=str(float)) + "cannot do slice indexing " + r"on {klass} with these indexers \[(3|4)\.0\] of " + "{kind}".format(klass=type(index), kind=str(float)) ) with pytest.raises(TypeError, match=msg): s[l] = 0 diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 2f27757d6a754..48c25ec034653 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -1,5 +1,6 @@ """ test positional based indexing with iloc """ +from datetime import datetime from warnings import catch_warnings, simplefilter import numpy as np @@ -7,10 +8,10 @@ import pandas as pd from pandas import DataFrame, Series, concat, date_range, isna +import pandas._testing as tm from pandas.api.types import is_scalar from pandas.core.indexing import IndexingError from pandas.tests.indexing.common import Base -import pandas.util.testing as tm class TestiLoc(Base): @@ -122,7 +123,7 @@ def check(result, expected): [ ([slice(None), ["A", "D"]]), (["1", "2"], slice(None)), - ([pd.datetime(2019, 1, 1)], slice(None)), + ([datetime(2019, 1, 1)], slice(None)), ], ) def test_iloc_non_integer_raises(self, index, columns, index_vals, column_vals): @@ -436,9 +437,9 @@ def test_iloc_getitem_labelled_frame(self): # trying to use a label msg = ( - r"Location based indexing can only have \[integer, integer" - r" slice \(START point is INCLUDED, END point is EXCLUDED\)," - r" listlike of integers, boolean 
array\] types" + r"Location based indexing can only have \[integer, integer " + r"slice \(START point is INCLUDED, END point is EXCLUDED\), " + r"listlike of integers, boolean array\] types" ) with pytest.raises(ValueError, match=msg): df.iloc["j", "D"] diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index d75afd1540f22..1913caae93932 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -13,11 +13,11 @@ import pandas as pd from pandas import DataFrame, Index, NaT, Series +import pandas._testing as tm from pandas.core.generic import NDFrame from pandas.core.indexers import validate_indices from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice from pandas.tests.indexing.common import Base, _mklbl -import pandas.util.testing as tm # ------------------------------------------------------------------------ # Indexing test cases @@ -81,14 +81,11 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): nd3 = np.random.randint(5, size=(2, 2, 2)) msg = ( - r"Buffer has wrong number of dimensions \(expected 1," - r" got 3\)|" - "The truth value of an array with more than one element is" - " ambiguous|" + r"Buffer has wrong number of dimensions \(expected 1, " + r"got 3\)|" "Cannot index with multidimensional key|" r"Wrong number of dimensions. values.ndim != ndim \[3 != 1\]|" - "No matching signature found|" # TypeError - "unhashable type: 'numpy.ndarray'" # TypeError + "Index data must be 1-dimensional" ) if ( @@ -104,21 +101,12 @@ def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): "categorical", ] ): - idxr[nd3] - else: - if ( - isinstance(obj, DataFrame) - and idxr_id == "getitem" - and index.inferred_type == "boolean" - ): - error = TypeError - elif idxr_id == "getitem" and index.inferred_type == "interval": - error = TypeError - else: - error = ValueError - - with pytest.raises(error, match=msg): + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): idxr[nd3] + else: + with pytest.raises(ValueError, match=msg): + with tm.assert_produces_warning(DeprecationWarning): + idxr[nd3] @pytest.mark.parametrize( "index", tm.all_index_generator(5), ids=lambda x: type(x).__name__ @@ -146,16 +134,14 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): nd3 = np.random.randint(5, size=(2, 2, 2)) msg = ( - r"Buffer has wrong number of dimensions \(expected 1," - r" got 3\)|" - "The truth value of an array with more than one element is" - " ambiguous|" - "Only 1-dimensional input arrays are supported|" - "'pandas._libs.interval.IntervalTree' object has no attribute" - " 'set_value'|" # AttributeError + r"Buffer has wrong number of dimensions \(expected 1, " + r"got 3\)|" + "'pandas._libs.interval.IntervalTree' object has no attribute " + "'set_value'|" # AttributeError "unhashable type: 'numpy.ndarray'|" # TypeError "No matching signature found|" # TypeError - r"^\[\[\[" # pandas.core.indexing.IndexingError + r"^\[\[\[|" # pandas.core.indexing.IndexingError + "Index data must be 1-dimensional" ) if (idxr_id == "iloc") or ( @@ -176,10 +162,8 @@ def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): ): idxr[nd3] = 0 else: - with pytest.raises( - (ValueError, AttributeError, TypeError, pd.core.indexing.IndexingError), - match=msg, - ): + err = (ValueError, AttributeError) + with pytest.raises(err, match=msg): idxr[nd3] = 0 def test_inf_upcast(self): @@ -1190,3 +1174,13 @@ def test_duplicate_index_mistyped_key_raises_keyerror(): with 
pytest.raises(KeyError): ser.index._engine.get_loc(None) + + +def test_setitem_with_bool_mask_and_values_matching_n_trues_in_length(): + # GH 30567 + ser = pd.Series([None] * 10) + mask = [False] * 3 + [True] * 5 + [False] * 2 + ser[mask] = range(5) + result = ser + expected = pd.Series([None] * 3 + list(range(5)) + [None] * 2).astype("object") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing_engines.py b/pandas/tests/indexing/test_indexing_engines.py index 7303c1ff3d111..edb5d7d7f3a57 100644 --- a/pandas/tests/indexing/test_indexing_engines.py +++ b/pandas/tests/indexing/test_indexing_engines.py @@ -2,7 +2,7 @@ from pandas._libs import algos as libalgos, index as libindex -import pandas.util.testing as tm +import pandas._testing as tm class TestNumericEngine: diff --git a/pandas/tests/indexing/test_indexing_slow.py b/pandas/tests/indexing/test_indexing_slow.py index bf8c6afd00561..2ffa44bec14a6 100644 --- a/pandas/tests/indexing/test_indexing_slow.py +++ b/pandas/tests/indexing/test_indexing_slow.py @@ -1,7 +1,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm class TestIndexingSlow: diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 6f20ec649b200..78fcd15ab4cc1 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -7,9 +7,9 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range +import pandas._testing as tm from pandas.api.types import is_scalar from pandas.tests.indexing.common import Base -import pandas.util.testing as tm class TestLoc(Base): @@ -219,8 +219,8 @@ def test_loc_to_fail(self): # raise a KeyError? msg = ( - r"\"None of \[Int64Index\(\[1, 2\], dtype='int64'\)\] are" - r" in the \[index\]\"" + r"\"None of \[Int64Index\(\[1, 2\], dtype='int64'\)\] are " + r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): df.loc[[1, 2], [1, 2]] @@ -236,15 +236,13 @@ def test_loc_to_fail(self): s.loc[-1] msg = ( - r"\"None of \[Int64Index\(\[-1, -2\], dtype='int64'\)\] are" - r" in the \[index\]\"" + r"\"None of \[Int64Index\(\[-1, -2\], dtype='int64'\)\] are " + r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): s.loc[[-1, -2]] - msg = ( - r"\"None of \[Index\(\['4'\], dtype='object'\)\] are" r" in the \[index\]\"" - ) + msg = r"\"None of \[Index\(\['4'\], dtype='object'\)\] are in the \[index\]\"" with pytest.raises(KeyError, match=msg): s.loc[["4"]] @@ -254,8 +252,8 @@ def test_loc_to_fail(self): s["a"] = 2 msg = ( - r"\"None of \[Int64Index\(\[-2\], dtype='int64'\)\] are" - r" in the \[index\]\"" + r"\"None of \[Int64Index\(\[-2\], dtype='int64'\)\] are " + r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): s.loc[[-2]] @@ -270,8 +268,8 @@ def test_loc_to_fail(self): df = DataFrame([["a"], ["b"]], index=[1, 2], columns=["value"]) msg = ( - r"\"None of \[Int64Index\(\[3\], dtype='int64'\)\] are" - r" in the \[index\]\"" + r"\"None of \[Int64Index\(\[3\], dtype='int64'\)\] are " + r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): df.loc[[3], :] @@ -373,6 +371,9 @@ def test_loc_index(self): result = df.loc[mask.values] tm.assert_frame_equal(result, expected) + result = df.loc[pd.array(mask, dtype="boolean")] + tm.assert_frame_equal(result, expected) + def test_loc_general(self): df = DataFrame( @@ -968,3 +969,36 @@ def test_loc_getitem_label_list_integer_labels( expected = df.iloc[:, expected_columns] result = df.loc[["A", "B", "C"], 
column_key] tm.assert_frame_equal(result, expected, check_column_type=check_column_type) + + +def test_loc_setitem_float_intindex(): + # GH 8720 + rand_data = np.random.randn(8, 4) + result = pd.DataFrame(rand_data) + result.loc[:, 0.5] = np.nan + expected_data = np.hstack((rand_data, np.array([np.nan] * 8).reshape(8, 1))) + expected = pd.DataFrame(expected_data, columns=[0.0, 1.0, 2.0, 3.0, 0.5]) + tm.assert_frame_equal(result, expected) + + result = pd.DataFrame(rand_data) + result.loc[:, 0.5] = np.nan + tm.assert_frame_equal(result, expected) + + +def test_loc_axis_1_slice(): + # GH 10586 + cols = [(yr, m) for yr in [2014, 2015] for m in [7, 8, 9, 10]] + df = pd.DataFrame( + np.ones((10, 8)), + index=tuple("ABCDEFGHIJ"), + columns=pd.MultiIndex.from_tuples(cols), + ) + result = df.loc(axis=1)[(2014, 9):(2015, 8)] + expected = pd.DataFrame( + np.ones((10, 4)), + index=tuple("ABCDEFGHIJ"), + columns=pd.MultiIndex.from_tuples( + [(2014, 9), (2014, 10), (2015, 7), (2015, 8)] + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_na_indexing.py b/pandas/tests/indexing/test_na_indexing.py new file mode 100644 index 0000000000000..befe4fee8ecf8 --- /dev/null +++ b/pandas/tests/indexing/test_na_indexing.py @@ -0,0 +1,79 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "values, dtype", + [ + ([1, 2, 3], "int64"), + ([1.0, 2.0, 3.0], "float64"), + (["a", "b", "c"], "object"), + (["a", "b", "c"], "string"), + ([1, 2, 3], "datetime64[ns]"), + ([1, 2, 3], "datetime64[ns, CET]"), + ([1, 2, 3], "timedelta64[ns]"), + (["2000", "2001", "2002"], "Period[D]"), + ([1, 0, 3], "Sparse"), + ([pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(3, 4)], "interval"), + ], +) +@pytest.mark.parametrize( + "mask", [[True, False, False], [True, True, True], [False, False, False]] +) +@pytest.mark.parametrize("box_mask", [True, False]) +@pytest.mark.parametrize("frame", [True, False]) +def test_series_mask_boolean(values, dtype, mask, box_mask, frame): + ser = pd.Series(values, dtype=dtype, index=["a", "b", "c"]) + if frame: + ser = ser.to_frame() + mask = pd.array(mask, dtype="boolean") + if box_mask: + mask = pd.Series(mask, index=ser.index) + + expected = ser[mask.astype("bool")] + + result = ser[mask] + tm.assert_equal(result, expected) + + if not box_mask: + # Series.iloc[Series[bool]] isn't allowed + result = ser.iloc[mask] + tm.assert_equal(result, expected) + + result = ser.loc[mask] + tm.assert_equal(result, expected) + + # empty + mask = mask[:0] + ser = ser.iloc[:0] + expected = ser[mask.astype("bool")] + result = ser[mask] + tm.assert_equal(result, expected) + + if not box_mask: + # Series.iloc[Series[bool]] isn't allowed + result = ser.iloc[mask] + tm.assert_equal(result, expected) + + result = ser.loc[mask] + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("frame", [True, False]) +def test_indexing_with_na_raises(frame): + s = pd.Series([1, 2, 3], name="name") + + if frame: + s = s.to_frame() + mask = pd.array([True, False, None], dtype="boolean") + match = "cannot mask with array containing NA / NaN values" + with pytest.raises(ValueError, match=match): + s[mask] + + with pytest.raises(ValueError, match=match): + s.loc[mask] + + with pytest.raises(ValueError, match=match): + s.iloc[mask] diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 15c65be37e0d9..2ce07ec41758f 100644 --- a/pandas/tests/indexing/test_partial.py +++ 
b/pandas/tests/indexing/test_partial.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestPartialSetting: @@ -205,8 +205,8 @@ def test_series_partial_set(self): # raises as nothing in in the index msg = ( - r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'\)\] are" - r" in the \[index\]\"" + r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'\)\] are " + r"in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): ser.loc[[3, 3, 3]] @@ -286,8 +286,8 @@ def test_series_partial_set_with_name(self): # raises as nothing in in the index msg = ( - r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64'," - r" name='idx'\)\] are in the \[index\]\"" + r"\"None of \[Int64Index\(\[3, 3, 3\], dtype='int64', " + r"name='idx'\)\] are in the \[index\]\"" ) with pytest.raises(KeyError, match=msg): ser.loc[[3, 3, 3]] diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index ddaea5b597d6d..a567fb9b8ccc7 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -4,8 +4,8 @@ import pytest from pandas import DataFrame, Series, Timedelta, Timestamp, date_range +import pandas._testing as tm from pandas.tests.indexing.common import Base -import pandas.util.testing as tm class TestScalar(Base): @@ -132,8 +132,8 @@ def test_at_to_fail(self): result = s.at["a"] assert result == 1 msg = ( - "At based indexing on an non-integer index can only have" - " non-integer indexers" + "At based indexing on an non-integer index can only have " + "non-integer indexers" ) with pytest.raises(ValueError, match=msg): s.at[0] diff --git a/pandas/tests/indexing/test_timedelta.py b/pandas/tests/indexing/test_timedelta.py index 5c9865ddc7090..dd4750123c0b5 100644 --- a/pandas/tests/indexing/test_timedelta.py +++ b/pandas/tests/indexing/test_timedelta.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestTimedeltaIndexing: diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 551782d0b363a..aa966caa63238 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -10,19 +10,11 @@ from pandas._libs.internals import BlockPlacement import pandas as pd -from pandas import ( - Categorical, - DataFrame, - DatetimeIndex, - Index, - MultiIndex, - Series, - SparseArray, -) +from pandas import Categorical, DataFrame, DatetimeIndex, Index, MultiIndex, Series +import pandas._testing as tm import pandas.core.algorithms as algos -from pandas.core.arrays import DatetimeArray, TimedeltaArray +from pandas.core.arrays import DatetimeArray, SparseArray, TimedeltaArray from pandas.core.internals import BlockManager, SingleBlockManager, make_block -import pandas.util.testing as tm @pytest.fixture @@ -305,7 +297,8 @@ def test_delete(self): assert (newb.values[1] == 1).all() newb = self.fblock.copy() - with pytest.raises(Exception): + + with pytest.raises(IndexError, match=None): newb.delete(3) @@ -329,7 +322,12 @@ def test_can_hold_element(self): val = date(2010, 10, 10) assert not block._can_hold_element(val) - with pytest.raises(TypeError): + + msg = ( + "'value' should be a 'Timestamp', 'NaT', " + "or array of those. Got 'date' instead." 
+ ) + with pytest.raises(TypeError, match=msg): arr[0] = val @@ -358,7 +356,10 @@ def test_duplicate_ref_loc_failure(self): blocks[1].mgr_locs = np.array([0]) # test trying to create block manager with overlapping ref locs - with pytest.raises(AssertionError): + + msg = "Gaps in blk ref_locs" + + with pytest.raises(AssertionError, match=msg): BlockManager(blocks, axes) blocks[0].mgr_locs = np.array([0]) @@ -816,7 +817,11 @@ def test_validate_bool_args(self): bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2") for value in invalid_values: - with pytest.raises(ValueError): + msg = ( + 'For argument "inplace" expected type bool, ' + f"received type {type(value).__name__}." + ) + with pytest.raises(ValueError, match=msg): bm1.replace_list([1], [2], inplace=value) @@ -1035,9 +1040,11 @@ def test_slice_len(self): assert len(BlockPlacement(slice(1, 0, -1))) == 1 def test_zero_step_raises(self): - with pytest.raises(ValueError): + msg = "slice step cannot be zero" + + with pytest.raises(ValueError, match=msg): BlockPlacement(slice(1, 1, 0)) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): BlockPlacement(slice(1, 2, 0)) def test_unbounded_slice_raises(self): @@ -1140,9 +1147,11 @@ def assert_add_equals(val, inc, result): assert_add_equals(slice(1, 4), -1, [0, 1, 2]) assert_add_equals([1, 2, 4], -1, [0, 1, 3]) - with pytest.raises(ValueError): + msg = "iadd causes length change" + + with pytest.raises(ValueError, match=msg): BlockPlacement(slice(1, 4)).add(-10) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): BlockPlacement([1, 2, 4]).add(-10) @@ -1208,7 +1217,7 @@ def test_binop_other(self, op, value, dtype): (operator.pow, "bool"), } if (op, dtype) in skip: - pytest.skip("Invalid combination {},{}".format(op, dtype)) + pytest.skip(f"Invalid combination {op},{dtype}") e = DummyElement(value, dtype) s = pd.DataFrame({"A": [e.value, e.value]}, dtype=e.dtype) @@ -1224,7 +1233,17 @@ def test_binop_other(self, op, value, dtype): } if (op, dtype) in invalid: - with pytest.raises(TypeError): + msg = ( + None + if (dtype == " bool: + """ + Filter out invalid (engine, ext) pairs instead of skipping, as that + produces 500+ pytest.skips. + """ + engine = engine.values[0] + if engine == "openpyxl" and read_ext == ".xls": + return False + if engine == "odf" and read_ext != ".ods": + return False + if read_ext == ".ods" and engine != "odf": + return False + return True + + +def _transfer_marks(engine, read_ext): + """ + engine gives us a pytest.param object with some marks, read_ext is just + a string. We need to generate a new pytest.param inheriting the marks.
+ """ + values = engine.values + (read_ext,) + new_param = pytest.param(values, marks=engine.marks) + return new_param + + @pytest.fixture( + autouse=True, params=[ - # Add any engines to test here - # When defusedxml is installed it triggers deprecation warnings for - # xlrd and openpyxl, so catch those here - pytest.param( - "xlrd", - marks=[ - td.skip_if_no("xlrd"), - pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), - ], - ), - pytest.param( - "openpyxl", - marks=[ - td.skip_if_no("openpyxl"), - pytest.mark.filterwarnings("ignore:.*html argument"), - ], - ), - pytest.param( - None, - marks=[ - td.skip_if_no("xlrd"), - pytest.mark.filterwarnings("ignore:.*(tree\\.iter|html argument)"), - ], - ), - pytest.param("odf", marks=td.skip_if_no("odf")), - ] + _transfer_marks(eng, ext) + for eng in engine_params + for ext in read_ext_params + if _is_valid_engine_ext_pair(eng, ext) + ], ) -def engine(request): +def engine_and_read_ext(request): """ - A fixture for Excel reader engines. + Fixture for Excel reader engine and read_ext, only including valid pairs. """ return request.param +@pytest.fixture +def engine(engine_and_read_ext): + engine, read_ext = engine_and_read_ext + return engine + + +@pytest.fixture +def read_ext(engine_and_read_ext): + engine, read_ext = engine_and_read_ext + return read_ext + + class TestReaders: @pytest.fixture(autouse=True) - def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): + def cd_and_set_engine(self, engine, datapath, monkeypatch): """ Change directory and set engine for read_excel calls. """ - if engine == "openpyxl" and read_ext == ".xls": - pytest.skip() - if engine == "odf" and read_ext != ".ods": - pytest.skip() - if read_ext == ".ods" and engine != "odf": - pytest.skip() func = partial(pd.read_excel, engine=engine) monkeypatch.chdir(datapath("io", "data", "excel")) @@ -806,16 +847,10 @@ def test_read_excel_squeeze(self, read_ext): class TestExcelFileRead: @pytest.fixture(autouse=True) - def cd_and_set_engine(self, engine, datapath, monkeypatch, read_ext): + def cd_and_set_engine(self, engine, datapath, monkeypatch): """ Change directory and set engine for ExcelFile objects. 
""" - if engine == "odf" and read_ext != ".ods": - pytest.skip() - if read_ext == ".ods" and engine != "odf": - pytest.skip() - if engine == "openpyxl" and read_ext == ".xls": - pytest.skip() func = partial(pd.ExcelFile, engine=engine) monkeypatch.chdir(datapath("io", "data", "excel")) @@ -953,3 +988,13 @@ def test_conflicting_excel_engines(self, read_ext): with pd.ExcelFile("test1" + read_ext) as xl: with pytest.raises(ValueError, match=msg): pd.read_excel(xl, engine="foo") + + def test_excel_read_binary(self, engine, read_ext): + # GH 15914 + expected = pd.read_excel("test1" + read_ext, engine=engine) + + with open("test1" + read_ext, "rb") as f: + data = f.read() + + actual = pd.read_excel(data, engine=engine) + tm.assert_frame_equal(expected, actual) diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index 41363bf13ed4e..88f4c3736bc0d 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ExcelWriter from pandas.io.formats.excel import ExcelFormatter diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index e0cb75b0a6c99..55b987a599670 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, get_option, set_option -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ( ExcelFile, @@ -252,7 +252,7 @@ def test_read_excel_parse_dates(self, ext): res = pd.read_excel(pth, parse_dates=["date_strings"], index_col=0) tm.assert_frame_equal(df, res) - date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") + date_parser = lambda x: datetime.strptime(x, "%m/%d/%Y") res = pd.read_excel( pth, parse_dates=["date_strings"], date_parser=date_parser, index_col=0 ) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 5d011f1f843bf..f1871a080c073 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -3,7 +3,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ExcelFile diff --git a/pandas/tests/io/excel/test_xlsxwriter.py b/pandas/tests/io/excel/test_xlsxwriter.py index deb72cc230669..b6f791434a92b 100644 --- a/pandas/tests/io/excel/test_xlsxwriter.py +++ b/pandas/tests/io/excel/test_xlsxwriter.py @@ -3,7 +3,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ExcelWriter diff --git a/pandas/tests/io/excel/test_xlwt.py b/pandas/tests/io/excel/test_xlwt.py index c6af78c2704d8..01feab08eb5e3 100644 --- a/pandas/tests/io/excel/test_xlwt.py +++ b/pandas/tests/io/excel/test_xlwt.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.excel import ExcelWriter, _XlwtWriter diff --git a/pandas/tests/io/formats/data/html/render_links_false.html b/pandas/tests/io/formats/data/html/render_links_false.html index 6509a0e985597..6feb403d63051 100644 --- a/pandas/tests/io/formats/data/html/render_links_false.html +++ b/pandas/tests/io/formats/data/html/render_links_false.html @@ -11,7 +11,7 @@
- + diff --git a/pandas/tests/io/formats/data/html/render_links_true.html b/pandas/tests/io/formats/data/html/render_links_true.html index e9cb5632aad1d..3eb53f3160a77 100644 --- a/pandas/tests/io/formats/data/html/render_links_true.html +++ b/pandas/tests/io/formats/data/html/render_links_true.html @@ -11,7 +11,7 @@ - + diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index a6ad5d5edbf5f..7008cef7b28fa 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -1,6 +1,6 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.formats.css import CSSResolver, CSSWarning @@ -101,29 +101,25 @@ def test_css_side_shorthands(shorthand, expansions): top, right, bottom, left = expansions assert_resolves( - "{shorthand}: 1pt".format(shorthand=shorthand), - {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"}, + f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"}, ) assert_resolves( - "{shorthand}: 1pt 4pt".format(shorthand=shorthand), - {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"}, + f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"}, ) assert_resolves( - "{shorthand}: 1pt 4pt 2pt".format(shorthand=shorthand), + f"{shorthand}: 1pt 4pt 2pt", {top: "1pt", right: "4pt", bottom: "2pt", left: "4pt"}, ) assert_resolves( - "{shorthand}: 1pt 4pt 2pt 0pt".format(shorthand=shorthand), + f"{shorthand}: 1pt 4pt 2pt 0pt", {top: "1pt", right: "4pt", bottom: "2pt", left: "0pt"}, ) with tm.assert_produces_warning(CSSWarning): - assert_resolves( - "{shorthand}: 1pt 1pt 1pt 1pt 1pt".format(shorthand=shorthand), {} - ) + assert_resolves(f"{shorthand}: 1pt 1pt 1pt 1pt 1pt", {}) @pytest.mark.parametrize( @@ -174,10 +170,10 @@ def test_css_none_absent(style, equiv): "size,resolved", [ ("xx-small", "6pt"), - ("x-small", "{pt:f}pt".format(pt=7.5)), - ("small", "{pt:f}pt".format(pt=9.6)), + ("x-small", f"{7.5:f}pt"), + ("small", f"{9.6:f}pt"), ("medium", "12pt"), - ("large", "{pt:f}pt".format(pt=13.5)), + ("large", f"{13.5:f}pt"), ("x-large", "18pt"), ("xx-large", "24pt"), ("8px", "6pt"), @@ -196,9 +192,7 @@ def test_css_absolute_font_size(size, relative_to, resolved): else: inherited = {"font-size": relative_to} assert_resolves( - "font-size: {size}".format(size=size), - {"font-size": resolved}, - inherited=inherited, + f"font-size: {size}", {"font-size": resolved}, inherited=inherited, ) @@ -224,7 +218,7 @@ def test_css_absolute_font_size(size, relative_to, resolved): ("inherit", "16pt", "16pt"), ("smaller", None, "10pt"), ("smaller", "18pt", "15pt"), - ("larger", None, "{pt:f}pt".format(pt=14.4)), + ("larger", None, f"{14.4:f}pt"), ("larger", "15pt", "18pt"), ], ) @@ -234,7 +228,5 @@ def test_css_relative_font_size(size, relative_to, resolved): else: inherited = {"font-size": relative_to} assert_resolves( - "font-size: {size}".format(size=size), - {"font-size": resolved}, - inherited=inherited, + f"font-size: {size}", {"font-size": resolved}, inherited=inherited, ) diff --git a/pandas/tests/io/formats/test_eng_formatting.py b/pandas/tests/io/formats/test_eng_formatting.py index 2edbff3766c9d..6801316ada8a3 100644 --- a/pandas/tests/io/formats/test_eng_formatting.py +++ b/pandas/tests/io/formats/test_eng_formatting.py @@ -2,7 +2,7 @@ import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.formats.format as fmt diff --git a/pandas/tests/io/formats/test_format.py 
b/pandas/tests/io/formats/test_format.py index d8604774777a6..97956489e7da6 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -35,7 +35,7 @@ reset_option, set_option, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.formats.format as fmt import pandas.io.formats.printing as printing @@ -421,12 +421,10 @@ def test_repr_truncation_column_size(self): def test_repr_max_columns_max_rows(self): term_width, term_height = get_terminal_size() if term_width < 10 or term_height < 10: - pytest.skip( - "terminal size too small, {0} x {1}".format(term_width, term_height) - ) + pytest.skip(f"terminal size too small, {term_width} x {term_height}") def mkframe(n): - index = ["{i:05d}".format(i=i) for i in range(n)] + index = [f"{i:05d}" for i in range(n)] return DataFrame(0, index, index) df6 = mkframe(6) @@ -446,7 +444,7 @@ def mkframe(n): assert not has_truncated_repr(df6) with option_context("display.max_rows", 9, "display.max_columns", 10): - # out vertical bounds can not result in exanded repr + # out vertical bounds can not result in expanded repr assert not has_expanded_repr(df10) assert has_vertically_truncated_repr(df10) @@ -667,9 +665,9 @@ def test_to_string_with_formatters(self): ) formatters = [ - ("int", lambda x: "0x{x:x}".format(x=x)), - ("float", lambda x: "[{x: 4.1f}]".format(x=x)), - ("object", lambda x: "-{x!s}-".format(x=x)), + ("int", lambda x: f"0x{x:x}"), + ("float", lambda x: f"[{x: 4.1f}]"), + ("object", lambda x: f"-{x!s}-"), ] result = df.to_string(formatters=dict(formatters)) result2 = df.to_string(formatters=list(zip(*formatters))[1]) @@ -711,7 +709,7 @@ def format_func(x): def test_to_string_with_formatters_unicode(self): df = DataFrame({"c/\u03c3": [1, 2, 3]}) - result = df.to_string(formatters={"c/\u03c3": lambda x: "{x}".format(x=x)}) + result = df.to_string(formatters={"c/\u03c3": str}) assert result == " c/\u03c3\n" + "0 1\n1 2\n2 3" def test_east_asian_unicode_false(self): @@ -1240,7 +1238,7 @@ def test_wide_repr(self): set_option("display.expand_frame_repr", False) rep_str = repr(df) - assert "10 rows x {c} columns".format(c=max_cols - 1) in rep_str + assert f"10 rows x {max_cols - 1} columns" in rep_str set_option("display.expand_frame_repr", True) wide_repr = repr(df) assert rep_str != wide_repr @@ -1351,7 +1349,7 @@ def test_long_series(self): n = 1000 s = Series( np.random.randint(-50, 50, n), - index=["s{x:04d}".format(x=x) for x in range(n)], + index=[f"s{x:04d}" for x in range(n)], dtype="int64", ) @@ -1477,9 +1475,7 @@ def test_to_string(self): expected = ["A"] assert header == expected - biggie.to_string( - columns=["B", "A"], formatters={"A": lambda x: "{x:.1f}".format(x=x)} - ) + biggie.to_string(columns=["B", "A"], formatters={"A": lambda x: f"{x:.1f}"}) biggie.to_string(columns=["B", "A"], float_format=str) biggie.to_string(columns=["B", "A"], col_space=12, float_format=str) @@ -1610,7 +1606,7 @@ def test_to_string_small_float_values(self): result = df.to_string() # sadness per above - if "{x:.4g}".format(x=1.7e8) == "1.7e+008": + if _three_digit_exp(): expected = ( " a\n" "0 1.500000e+000\n" @@ -1922,7 +1918,7 @@ def test_repr_html_long(self): long_repr = df._repr_html_() assert ".." 
in long_repr assert str(41 + max_rows // 2) not in long_repr - assert "{h} rows ".format(h=h) in long_repr + assert f"{h} rows " in long_repr assert "2 columns" in long_repr def test_repr_html_float(self): @@ -1939,7 +1935,7 @@ def test_repr_html_float(self): ).set_index("idx") reg_repr = df._repr_html_() assert ".." not in reg_repr - assert "".format(val=str(40 + h)) in reg_repr + assert f"" in reg_repr h = max_rows + 1 df = DataFrame( @@ -1951,8 +1947,8 @@ def test_repr_html_float(self): ).set_index("idx") long_repr = df._repr_html_() assert ".." in long_repr - assert "".format(val="31") not in long_repr - assert "{h} rows ".format(h=h) in long_repr + assert "" not in long_repr + assert f"{h} rows " in long_repr assert "2 columns" in long_repr def test_repr_html_long_multiindex(self): @@ -2181,9 +2177,7 @@ def test_to_string(self): cp.name = "foo" result = cp.to_string(length=True, name=True, dtype=True) last_line = result.split("\n")[-1].strip() - assert last_line == ( - "Freq: B, Name: foo, Length: {cp}, dtype: float64".format(cp=len(cp)) - ) + assert last_line == (f"Freq: B, Name: foo, Length: {len(cp)}, dtype: float64") def test_freq_name_separation(self): s = Series( @@ -2665,14 +2659,14 @@ def test_format_explicit(self): assert exp == res res = repr(test_sers["asc"]) exp = ( - "0 a\n1 ab\n ... \n4 abcde\n5" - " abcdef\ndtype: object" + "0 a\n1 ab\n ... \n4 abcde\n5 " + "abcdef\ndtype: object" ) assert exp == res res = repr(test_sers["desc"]) exp = ( - "5 abcdef\n4 abcde\n ... \n1 ab\n0" - " a\ndtype: object" + "5 abcdef\n4 abcde\n ... \n1 ab\n0 " + "a\ndtype: object" ) assert exp == res @@ -2782,7 +2776,7 @@ def test_to_string_na_rep(self): def test_to_string_float_format(self): s = pd.Series(range(10), dtype="float64") - res = s.to_string(float_format=lambda x: "{0:2.1f}".format(x), max_rows=2) + res = s.to_string(float_format=lambda x: f"{x:2.1f}", max_rows=2) exp = "0 0.0\n ..\n9 9.0" assert res == exp @@ -2807,7 +2801,7 @@ def test_to_string_multindex_header(self): def _three_digit_exp(): - return "{x:.4g}".format(x=1.7e8) == "1.7e+008" + return f"{1.7e8:.4g}" == "1.7e+008" class TestFloatArrayFormatter: diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 5a3afb5025e51..a2659079be7c0 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm jinja2 = pytest.importorskip("jinja2") from pandas.io.formats.style import Styler, _get_level_lengths # noqa # isort:skip @@ -24,7 +24,7 @@ def setup_method(self, method): self.g = lambda x: x def h(x, foo="bar"): - return pd.Series("color: {foo}".format(foo=foo), index=x.index, name=x.name) + return pd.Series(f"color: {foo}", index=x.index, name=x.name) self.h = h self.styler = Styler(self.df) @@ -278,7 +278,7 @@ def test_numeric_columns(self): def test_apply_axis(self): df = pd.DataFrame({"A": [0, 0], "B": [1, 1]}) - f = lambda x: ["val: {max}".format(max=x.max()) for v in x] + f = lambda x: [f"val: {x.max()}" for v in x] result = df.style.apply(f, axis=1) assert len(result._todo) == 1 assert len(result.ctx) == 0 @@ -362,7 +362,7 @@ def color_negative_red(val): strings, black otherwise. 
""" color = "red" if val < 0 else "black" - return "color: {color}".format(color=color) + return f"color: {color}" dic = { ("a", "d"): [-1.12, 2.11], @@ -472,8 +472,19 @@ def test_empty(self): result = s._translate()["cellstyle"] expected = [ - {"props": [["color", " red"]], "selector": "row0_col0"}, - {"props": [["", ""]], "selector": "row1_col0"}, + {"props": [("color", " red")], "selectors": ["row0_col0"]}, + {"props": [("", "")], "selectors": ["row1_col0"]}, + ] + assert result == expected + + def test_duplicate(self): + df = pd.DataFrame({"A": [1, 0]}) + s = df.style + s.ctx = {(0, 0): ["color: red"], (1, 0): ["color: red"]} + + result = s._translate()["cellstyle"] + expected = [ + {"props": [("color", " red")], "selectors": ["row0_col0", "row1_col0"]} ] assert result == expected @@ -530,20 +541,17 @@ def test_bar_align_left_0points(self): (1, 0): [ "width: 10em", " height: 80%", - "background: linear-gradient(90deg,#d65f5f 50.0%," - " transparent 50.0%)", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", ], (1, 1): [ "width: 10em", " height: 80%", - "background: linear-gradient(90deg,#d65f5f 50.0%," - " transparent 50.0%)", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", ], (1, 2): [ "width: 10em", " height: 80%", - "background: linear-gradient(90deg,#d65f5f 50.0%," - " transparent 50.0%)", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", ], (2, 0): [ "width: 10em", @@ -572,8 +580,7 @@ def test_bar_align_left_0points(self): (0, 1): [ "width: 10em", " height: 80%", - "background: linear-gradient(90deg,#d65f5f 50.0%," - " transparent 50.0%)", + "background: linear-gradient(90deg,#d65f5f 50.0%, transparent 50.0%)", ], (0, 2): [ "width: 10em", @@ -1215,13 +1222,9 @@ def test_highlight_max(self): def test_export(self): f = lambda x: "color: red" if x > 0 else "color: blue" - g = ( - lambda x, y, z: "color: {z}".format(z=z) - if x > 0 - else "color: {z}".format(z=z) - ) + g = lambda x, z: f"color: {z}" if x > 0 else f"color: {z}" style1 = self.styler - style1.applymap(f).applymap(g, y="a", z="b").highlight_max() + style1.applymap(f).applymap(g, z="b").highlight_max() result = style1.export() style2 = self.df.style style2.use(result) @@ -1645,9 +1648,7 @@ def test_hide_columns_mult_levels(self): def test_pipe(self): def set_caption_from_template(styler, a, b): - return styler.set_caption( - "Dataframe with a = {a} and b = {b}".format(a=a, b=b) - ) + return styler.set_caption(f"Dataframe with a = {a} and b = {b}") styler = self.df.style.pipe(set_caption_from_template, "A", b="B") assert "Dataframe with a = A and b = B" in styler.render() diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 80edbd828194d..a211ac11cf725 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -1,3 +1,4 @@ +import io import os import sys @@ -6,7 +7,7 @@ import pandas as pd from pandas import DataFrame, compat -import pandas.util.testing as tm +import pandas._testing as tm class TestToCSV: @@ -204,6 +205,14 @@ def test_to_csv_na_rep(self): assert df.set_index("a").to_csv(na_rep="_") == expected assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + # GH 29975 + # Make sure full na_rep shows up when a dtype is provided + csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ") + expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) + assert expected == csv + csv = pd.Series(["a", pd.NA, "c"], 
dtype="string").to_csv(na_rep="ZZZZZ") + assert expected == csv + def test_to_csv_date_format(self): # GH 10209 df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) @@ -376,16 +385,14 @@ def test_to_csv_string_with_lf(self): assert f.read() == expected_noarg with tm.ensure_clean("lf_test.csv") as path: # case 2: LF as line terminator - expected_lf = b"int,str_lf\n" b"1,abc\n" b'2,"d\nef"\n' b'3,"g\nh\n\ni"\n' + expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n' df.to_csv(path, line_terminator="\n", index=False) with open(path, "rb") as f: assert f.read() == expected_lf with tm.ensure_clean("lf_test.csv") as path: # case 3: CRLF as line terminator # 'line_terminator' should not change inner element - expected_crlf = ( - b"int,str_lf\r\n" b"1,abc\r\n" b'2,"d\nef"\r\n' b'3,"g\nh\n\ni"\r\n' - ) + expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n' df.to_csv(path, line_terminator="\r\n", index=False) with open(path, "rb") as f: assert f.read() == expected_crlf @@ -412,9 +419,7 @@ def test_to_csv_string_with_crlf(self): assert f.read() == expected_noarg with tm.ensure_clean("crlf_test.csv") as path: # case 2: LF as line terminator - expected_lf = ( - b"int,str_crlf\n" b"1,abc\n" b'2,"d\r\nef"\n' b'3,"g\r\nh\r\n\r\ni"\n' - ) + expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n' df.to_csv(path, line_terminator="\n", index=False) with open(path, "rb") as f: assert f.read() == expected_lf @@ -490,10 +495,7 @@ def test_to_csv_compression(self, compression_only, read_infer, to_infer): compression = compression_only if compression == "zip": - pytest.skip( - "{compression} is not supported " - "for to_csv".format(compression=compression) - ) + pytest.skip(f"{compression} is not supported for to_csv") # We'll complete file extension subsequently. filename = "test." 
@@ -567,3 +569,17 @@ def test_to_csv_na_rep_long_string(self, df_new_type): result = df.to_csv(index=False, na_rep="mynull", encoding="ascii") assert expected == result + + def test_to_csv_timedelta_precision(self): + # GH 6783 + s = pd.Series([1, 1]).astype("timedelta64[ns]") + buf = io.StringIO() + s.to_csv(buf) + result = buf.getvalue() + expected_rows = [ + ",0", + "0,0 days 00:00:00.000000001", + "1,0 days 00:00:00.000000001", + ] + expected = tm.convert_rows_list_to_csv_str(expected_rows) + assert result == expected diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index 4d8edec7c7f14..883240b74c32c 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -5,7 +5,7 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.formats.css import CSSWarning from pandas.io.formats.excel import CSSToExcelConverter @@ -270,13 +270,13 @@ def test_css_to_excel_inherited(css, inherited, expected): def test_css_to_excel_good_colors(input_color, output_color): # see gh-18392 css = ( - "border-top-color: {color}; " - "border-right-color: {color}; " - "border-bottom-color: {color}; " - "border-left-color: {color}; " - "background-color: {color}; " - "color: {color}" - ).format(color=input_color) + f"border-top-color: {input_color}; " + f"border-right-color: {input_color}; " + f"border-bottom-color: {input_color}; " + f"border-left-color: {input_color}; " + f"background-color: {input_color}; " + f"color: {input_color}" + ) expected = dict() @@ -297,13 +297,13 @@ def test_css_to_excel_good_colors(input_color, output_color): def test_css_to_excel_bad_colors(input_color): # see gh-18392 css = ( - "border-top-color: {color}; " - "border-right-color: {color}; " - "border-bottom-color: {color}; " - "border-left-color: {color}; " - "background-color: {color}; " - "color: {color}" - ).format(color=input_color) + f"border-top-color: {input_color}; " + f"border-right-color: {input_color}; " + f"border-bottom-color: {input_color}; " + f"border-left-color: {input_color}; " + f"background-color: {input_color}; " + f"color: {input_color}" + ) expected = dict() diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index a2a577a0753f7..d3f044a42eb28 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -7,18 +7,18 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, option_context -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.formats.format as fmt lorem_ipsum = ( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod" - " tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim" - " veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex" - " ea commodo consequat. Duis aute irure dolor in reprehenderit in" - " voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur" - " sint occaecat cupidatat non proident, sunt in culpa qui officia" - " deserunt mollit anim id est laborum." + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod " + "tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim " + "veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex " + "ea commodo consequat. Duis aute irure dolor in reprehenderit in " + "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
Excepteur " + "sint occaecat cupidatat non proident, sunt in culpa qui officia " + "deserunt mollit anim id est laborum." ) @@ -688,7 +688,7 @@ def test_to_html_float_format_no_fixed_width(value, float_format, expected, data def test_to_html_render_links(render_links, expected, datapath): # GH 2679 data = [ - [0, "http://pandas.pydata.org/?q1=a&q2=b", "pydata.org"], + [0, "https://pandas.pydata.org/?q1=a&q2=b", "pydata.org"], [0, "www.pydata.org", "pydata.org"], ] df = DataFrame(data, columns=["foo", "bar", None]) diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index ea8688517bd93..bd681032f155d 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestToLatex: diff --git a/pandas/tests/io/formats/test_to_markdown.py b/pandas/tests/io/formats/test_to_markdown.py new file mode 100644 index 0000000000000..8893e4294353f --- /dev/null +++ b/pandas/tests/io/formats/test_to_markdown.py @@ -0,0 +1,55 @@ +from io import StringIO + +import pytest + +import pandas as pd + +pytest.importorskip("tabulate") + + +def test_simple(): + buf = StringIO() + df = pd.DataFrame([1, 2, 3]) + df.to_markdown(buf=buf) + result = buf.getvalue() + assert ( + result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) + + +def test_other_tablefmt(): + buf = StringIO() + df = pd.DataFrame([1, 2, 3]) + df.to_markdown(buf=buf, tablefmt="jira") + result = buf.getvalue() + assert result == "|| || 0 ||\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + + +def test_other_headers(): + buf = StringIO() + df = pd.DataFrame([1, 2, 3]) + df.to_markdown(buf=buf, headers=["foo", "bar"]) + result = buf.getvalue() + assert result == ( + "| foo | bar |\n|------:|------:|\n| 0 " + "| 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) + + +def test_series(): + buf = StringIO() + s = pd.Series([1, 2, 3], name="foo") + s.to_markdown(buf=buf) + result = buf.getvalue() + assert result == ( + "| | foo |\n|---:|------:|\n| 0 | 1 " + "|\n| 1 | 2 |\n| 2 | 3 |" + ) + + +def test_no_buf(capsys): + df = pd.DataFrame([1, 2, 3]) + result = df.to_markdown() + assert ( + result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py index 6ef0e0457e2e2..67b767a337a89 100755 --- a/pandas/tests/io/generate_legacy_storage_files.py +++ b/pandas/tests/io/generate_legacy_storage_files.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ self-contained to write legacy storage pickle files diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 5c5c04c35d6b7..182c21ed1d416 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -3,7 +3,7 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def test_compression_roundtrip(compression): diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index fba74d8ebcf97..2ac2acc6748d1 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.json._table_schema 
import ( as_json_table_type, diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 038dd2df4d632..efb95a0cb2a42 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -4,7 +4,7 @@ import pytest from pandas import DataFrame, Index, json_normalize -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.json._normalize import nested_to_record @@ -462,6 +462,30 @@ def test_nested_flattening_consistent(self): # They should be the same. tm.assert_frame_equal(df1, df2) + def test_nonetype_record_path(self, nulls_fixture): + # see gh-30148 + # should not raise TypeError + result = json_normalize( + [ + {"state": "Texas", "info": nulls_fixture}, + {"state": "Florida", "info": [{"i": 2}]}, + ], + record_path=["info"], + ) + expected = DataFrame({"i": 2}, index=[0]) + tm.assert_equal(result, expected) + + def test_non_interable_record_path_errors(self): + # see gh-30148 + test_input = {"state": "Texas", "info": 1} + test_path = "info" + msg = ( + f"{test_input} has non iterable value 1 for path {test_path}. " + "Must be iterable or null." + ) + with pytest.raises(TypeError, match=msg): + json_normalize([test_input], record_path=[test_path]) + class TestNestedToRecord: def test_flat_stays_flat(self): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 6489fedad03e3..bb873c71e8a35 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1,4 +1,5 @@ from collections import OrderedDict +import datetime from datetime import timedelta from io import StringIO import json @@ -12,7 +13,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Series, Timestamp, read_json -import pandas.util.testing as tm +import pandas._testing as tm _seriesd = tm.getSeriesData() _tsd = tm.getTimeSeriesData() @@ -39,6 +40,7 @@ def assert_json_roundtrip_equal(result, expected, orient): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings("ignore:the 'numpy' keyword is deprecated:FutureWarning") class TestPandasContainer: @pytest.fixture(scope="function", autouse=True) def setup(self, datapath): @@ -809,6 +811,31 @@ def test_convert_dates(self): result = read_json(json, typ="series") tm.assert_series_equal(result, ts) + @pytest.mark.parametrize("date_format", ["epoch", "iso"]) + @pytest.mark.parametrize("as_object", [True, False]) + @pytest.mark.parametrize( + "date_typ", [datetime.date, datetime.datetime, pd.Timestamp] + ) + def test_date_index_and_values(self, date_format, as_object, date_typ): + data = [date_typ(year=2020, month=1, day=1), pd.NaT] + if as_object: + data.append("a") + + ser = pd.Series(data, index=data) + result = ser.to_json(date_format=date_format) + + if date_format == "epoch": + expected = '{"1577836800000":1577836800000,"null":null}' + else: + expected = ( + '{"2020-01-01T00:00:00.000Z":"2020-01-01T00:00:00.000Z","null":null}' + ) + + if as_object: + expected = expected.replace("}", ',"a":"a"}') + + assert result == expected + @pytest.mark.parametrize( "infer_word", [ @@ -854,7 +881,7 @@ def test_date_format_frame(self, date, date_unit): json = df.to_json(date_format="iso") result = read_json(json) expected = df.copy() - # expected.index = expected.index.tz_localize("UTC") + expected.index = expected.index.tz_localize("UTC") expected["date"] = expected["date"].dt.tz_localize("UTC") tm.assert_frame_equal(result, expected) @@ -884,7 +911,7 @@ def test_date_format_series(self, date, 
date_unit): json = ts.to_json(date_format="iso") result = read_json(json, typ="series") expected = ts.copy() - # expected.index = expected.index.tz_localize("UTC") + expected.index = expected.index.tz_localize("UTC") expected = expected.dt.tz_localize("UTC") tm.assert_series_equal(result, expected) @@ -1244,7 +1271,7 @@ def test_to_jsonl(self): # GH15096: escaped characters in columns and data df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) result = df.to_json(orient="records", lines=True) - expected = '{"a\\\\":"foo\\\\","b":"bar"}\n' '{"a\\\\":"foo\\"","b":"bar"}' + expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}' assert result == expected tm.assert_frame_equal(pd.read_json(result, lines=True), df) @@ -1597,3 +1624,19 @@ def test_json_indent_all_orients(self, orient, expected): def test_json_negative_indent_raises(self): with pytest.raises(ValueError, match="must be a nonnegative integer"): pd.DataFrame().to_json(indent=-1) + + def test_emca_262_nan_inf_support(self): + # GH 12213 + data = '["a", NaN, "NaN", Infinity, "Infinity", -Infinity, "-Infinity"]' + result = pd.read_json(data) + expected = pd.DataFrame( + ["a", np.nan, "NaN", np.inf, "Infinity", -np.inf, "-Infinity"] + ) + tm.assert_frame_equal(result, expected) + + def test_deprecate_numpy_argument_read_json(self): + # GH 28512 + expected = DataFrame([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning): + result = read_json(expected.to_json(), numpy=True) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index b85032904c5ec..e531457627342 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -4,7 +4,7 @@ import pandas as pd from pandas import DataFrame, read_json -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.json._json import JsonReader @@ -56,7 +56,7 @@ def test_to_jsonl(): # GH15096: escaped characters in columns and data df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"]) result = df.to_json(orient="records", lines=True) - expected = '{"a\\\\":"foo\\\\","b":"bar"}\n' '{"a\\\\":"foo\\"","b":"bar"}' + expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}' assert result == expected tm.assert_frame_equal(read_json(result, lines=True), df) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 6008f6b651c2a..bedd60084124c 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -17,7 +17,7 @@ import pandas.compat as compat from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm def _clean_dict(d): @@ -111,9 +111,9 @@ def test_encode_decimal(self): @pytest.mark.parametrize("ensure_ascii", [True, False]) def test_encode_string_conversion(self, ensure_ascii): string_input = "A string \\ / \b \f \n \r \t &" - not_html_encoded = '"A string \\\\ \\/ \\b \\f \\n ' '\\r \\t <\\/script> &"' + not_html_encoded = '"A string \\\\ \\/ \\b \\f \\n \\r \\t <\\/script> &"' html_encoded = ( - '"A string \\\\ \\/ \\b \\f \\n \\r \\t ' '\\u003c\\/script\\u003e \\u0026"' + '"A string \\\\ \\/ \\b \\f \\n \\r \\t \\u003c\\/script\\u003e \\u0026"' ) def helper(expected_output, **encode_kwargs): @@ -559,11 +559,6 @@ def test_loads_non_str_bytes_raises(self): with pytest.raises(TypeError, match=msg): ujson.loads(None) - def test_version(self): - 
assert re.match( - r"^\d+\.\d+(\.\d+)?$", ujson.__version__ - ), "ujson.__version__ must be a string like '1.4.0'" - def test_encode_numeric_overflow(self): with pytest.raises(OverflowError): ujson.encode(12839128391289382193812939) @@ -816,7 +811,7 @@ def test_array_numpy_labelled(self): # see gh-10837: write out the dump explicitly # so there is no dependency on iteration order - input_dumps = '[{"a": 42, "b":31}, {"a": 24, "c": 99}, ' '{"a": 2.4, "b": 78}]' + input_dumps = '[{"a": 42, "b":31}, {"a": 24, "c": 99}, {"a": 2.4, "b": 78}]' output = ujson.loads(input_dumps, numpy=True, labelled=True) expected_vals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) assert (expected_vals == output[0]).all() diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index a87e1e796c194..15967e3be176a 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -46,11 +46,17 @@ class PythonParser(BaseParser): @pytest.fixture def csv_dir_path(datapath): + """ + The directory path to the data files needed for parser tests. + """ return datapath("io", "parser", "data") @pytest.fixture def csv1(csv_dir_path): + """ + The path to the data file "test1.csv" needed for parser tests. + """ return os.path.join(csv_dir_path, "test1.csv") @@ -69,14 +75,49 @@ def csv1(csv_dir_path): @pytest.fixture(params=_all_parsers, ids=_all_parser_ids) def all_parsers(request): + """ + Fixture all of the CSV parsers. + """ return request.param @pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids) def c_parser_only(request): + """ + Fixture all of the CSV parsers using the C engine. + """ return request.param @pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids) def python_parser_only(request): + """ + Fixture all of the CSV parsers using the Python engine. + """ + return request.param + + +_utf_values = [8, 16, 32] + +_encoding_seps = ["", "-", "_"] +_encoding_prefixes = ["utf", "UTF"] + +_encoding_fmts = [ + f"{prefix}{sep}" + "{0}" for sep in _encoding_seps for prefix in _encoding_prefixes +] + + +@pytest.fixture(params=_utf_values) +def utf_value(request): + """ + Fixture for all possible integer values for a UTF encoding. + """ + return request.param + + +@pytest.fixture(params=_encoding_fmts) +def encoding_fmt(request): + """ + Fixture for all possible string formats of a UTF encoding. 
+ """ return request.param diff --git a/pandas/tests/io/parser/data/utf32_ex_small.zip b/pandas/tests/io/parser/data/utf32_ex_small.zip new file mode 100644 index 0000000000000..9a6d5c08da9db Binary files /dev/null and b/pandas/tests/io/parser/data/utf32_ex_small.zip differ diff --git a/pandas/tests/io/parser/data/utf8_ex_small.zip b/pandas/tests/io/parser/data/utf8_ex_small.zip new file mode 100644 index 0000000000000..a4c5440bdffa7 Binary files /dev/null and b/pandas/tests/io/parser/data/utf8_ex_small.zip differ diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 77b52eb90d61f..1737f14e7adf9 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -17,7 +17,7 @@ import pandas.util._test_decorators as td from pandas import DataFrame, concat -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( @@ -597,3 +597,14 @@ def test_file_binary_mode(c_parser_only): with open(path, "rb") as f: result = parser.read_csv(f, header=None) tm.assert_frame_equal(result, expected) + + +def test_unix_style_breaks(c_parser_only): + # GH 11020 + parser = c_parser_only + with tm.ensure_clean() as path: + with open(path, "w", newline="\n") as f: + f.write("blah\n\ncol_1,col_2,col_3\n\n") + result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c") + expected = DataFrame(columns=["col_1", "col_2", "col_3"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py index e1d422142ab0b..60e32d7c27200 100644 --- a/pandas/tests/io/parser/test_comment.py +++ b/pandas/tests/io/parser/test_comment.py @@ -8,7 +8,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("na_values", [None, ["NaN"]]) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index fe360f1346c7c..6c17f40b790ac 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -3,13 +3,11 @@ specific classification into the other test modules. 
""" import codecs -from collections import OrderedDict import csv from datetime import datetime -from io import BytesIO, StringIO +from io import StringIO import os import platform -from tempfile import TemporaryFile from urllib.error import URLError import numpy as np @@ -19,7 +17,7 @@ from pandas.errors import DtypeWarning, EmptyDataError, ParserError from pandas import DataFrame, Index, MultiIndex, Series, compat, concat -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser @@ -70,17 +68,6 @@ def _set_noconvert_columns(self): tm.assert_frame_equal(result, expected) -def test_bytes_io_input(all_parsers): - encoding = "cp1255" - parser = all_parsers - - data = BytesIO("שלום:1234\n562:123".encode(encoding)) - result = parser.read_csv(data, sep=":", encoding=encoding) - - expected = DataFrame([[562, 123]], columns=["שלום", "1234"]) - tm.assert_frame_equal(result, expected) - - def test_empty_decimal_marker(all_parsers): data = """A|B|C 1|2,334|5 @@ -317,15 +304,6 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): tm.assert_frame_equal(result, expected) -def test_read_csv_unicode(all_parsers): - parser = all_parsers - data = BytesIO("\u0141aski, Jan;1".encode("utf-8")) - - result = parser.read_csv(data, sep=";", encoding="utf-8", header=None) - expected = DataFrame([["\u0141aski, Jan", 1]]) - tm.assert_frame_equal(result, expected) - - def test_read_csv_wrong_num_columns(all_parsers): # Too few columns. data = """A,B,C,D,E,F @@ -1065,59 +1043,6 @@ def test_skip_initial_space(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("sep", [",", "\t"]) -@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) -def test_utf16_bom_skiprows(all_parsers, sep, encoding): - # see gh-2298 - parser = all_parsers - data = """skip this -skip this too -A,B,C -1,2,3 -4,5,6""".replace( - ",", sep - ) - path = "__{}__.csv".format(tm.rands(10)) - kwargs = dict(sep=sep, skiprows=2) - utf8 = "utf-8" - - with tm.ensure_clean(path) as path: - from io import TextIOWrapper - - bytes_data = data.encode(encoding) - - with open(path, "wb") as f: - f.write(bytes_data) - - bytes_buffer = BytesIO(data.encode(utf8)) - bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8) - - result = parser.read_csv(path, encoding=encoding, **kwargs) - expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs) - - bytes_buffer.close() - tm.assert_frame_equal(result, expected) - - -def test_utf16_example(all_parsers, csv_dir_path): - path = os.path.join(csv_dir_path, "utf16_ex.txt") - parser = all_parsers - result = parser.read_csv(path, encoding="utf-16", sep="\t") - assert len(result) == 50 - - -def test_unicode_encoding(all_parsers, csv_dir_path): - path = os.path.join(csv_dir_path, "unicode_series.csv") - parser = all_parsers - - result = parser.read_csv(path, header=None, encoding="latin-1") - result = result.set_index(0) - got = result[1][1632] - - expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)" - assert got == expected - - def test_trailing_delimiters(all_parsers): # see gh-2442 data = """A,B,C @@ -1132,7 +1057,7 @@ def test_trailing_delimiters(all_parsers): def test_escapechar(all_parsers): - # http://stackoverflow.com/questions/13824840/feature-request-for- + # https://stackoverflow.com/questions/13824840/feature-request-for- # pandas-read-csv data = '''SEARCH_TERM,ACTUAL_URL "bra tv 
bord","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord" @@ -1144,9 +1069,8 @@ def test_escapechar(all_parsers): StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8" ) - assert result["SEARCH_TERM"][2] == ( - 'SLAGBORD, "Bergslagen", ' "IKEA:s 1700-tals serie" - ) + assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals serie' + tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"])) @@ -1317,9 +1241,7 @@ def test_float_parser(all_parsers): def test_scientific_no_exponent(all_parsers): # see gh-12215 - df = DataFrame.from_dict( - OrderedDict([("w", ["2e"]), ("x", ["3E"]), ("y", ["42e"]), ("z", ["632E"])]) - ) + df = DataFrame.from_dict({"w": ["2e"], "x": ["3E"], "y": ["42e"], "z": ["632E"]}) data = df.to_csv(index=False) parser = all_parsers @@ -1919,68 +1841,20 @@ def test_null_byte_char(all_parsers): parser.read_csv(StringIO(data), names=names) -@pytest.mark.parametrize( - "data,kwargs,expected", - [ - # Basic test - ("a\n1", dict(), DataFrame({"a": [1]})), - # "Regular" quoting - ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})), - # Test in a data row instead of header - ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})), - # Test in empty data row with skipping - ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})), - # Test in empty data row without skipping - ( - "\n1", - dict(names=["a"], skip_blank_lines=False), - DataFrame({"a": [np.nan, 1]}), - ), - ], -) -def test_utf8_bom(all_parsers, data, kwargs, expected): - # see gh-4793 - parser = all_parsers - bom = "\ufeff" - utf8 = "utf-8" - - def _encode_data_with_bom(_data): - bom_data = (bom + _data).encode(utf8) - return BytesIO(bom_data) - - result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) - tm.assert_frame_equal(result, expected) - - def test_temporary_file(all_parsers): # see gh-13398 parser = all_parsers data = "0 0" - new_file = TemporaryFile("w+") - new_file.write(data) - new_file.flush() - new_file.seek(0) - - result = parser.read_csv(new_file, sep=r"\s+", header=None) - new_file.close() - - expected = DataFrame([[0, 0]]) - tm.assert_frame_equal(result, expected) - + with tm.ensure_clean(mode="w+", return_filelike=True) as new_file: + new_file.write(data) + new_file.flush() + new_file.seek(0) -@pytest.mark.parametrize("byte", [8, 16]) -@pytest.mark.parametrize("fmt", ["utf-{0}", "utf_{0}", "UTF-{0}", "UTF_{0}"]) -def test_read_csv_utf_aliases(all_parsers, byte, fmt): - # see gh-13549 - expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) - parser = all_parsers - - encoding = fmt.format(byte) - data = "mb_num,multibyte\n4.8,test".encode(encoding) + result = parser.read_csv(new_file, sep=r"\s+", header=None) - result = parser.read_csv(BytesIO(data), encoding=encoding) - tm.assert_frame_equal(result, expected) + expected = DataFrame([[0, 0]]) + tm.assert_frame_equal(result, expected) def test_internal_eof_byte(all_parsers): @@ -2042,30 +1916,6 @@ def test_file_handles_with_open(all_parsers, csv1): assert not f.closed -@pytest.mark.parametrize( - "fname,encoding", - [ - ("test1.csv", "utf-8"), - ("unicode_series.csv", "latin-1"), - ("sauron.SHIFT_JIS.csv", "shiftjis"), - ], -) -def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): - # gh-23779: Python csv engine shouldn't error on files opened in binary. 
- parser = all_parsers - - fpath = os.path.join(csv_dir_path, fname) - expected = parser.read_csv(fpath, encoding=encoding) - - with open(fpath, mode="r", encoding=encoding) as fa: - result = parser.read_csv(fa) - tm.assert_frame_equal(expected, result) - - with open(fpath, mode="rb") as fb: - result = parser.read_csv(fb, encoding=encoding) - tm.assert_frame_equal(expected, result) - - def test_invalid_file_buffer_class(all_parsers): # see gh-15337 class InvalidBuffer: @@ -2208,3 +2058,13 @@ def test_first_row_bom(all_parsers): result = parser.read_csv(StringIO(data), delimiter="\t") expected = DataFrame(columns=["Head1", "Head2", "Head3"]) tm.assert_frame_equal(result, expected) + + +def test_integer_precision(all_parsers): + # Gh 7072 + s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 +5,1;0;0;0;1;1;843;843;843;1;1;1;1;1;1;0;0;1;1;0;0,64.0,;,4321113141090630389""" + parser = all_parsers + result = parser.read_csv(StringIO(s), header=None)[4] + expected = Series([4321583677327450765, 4321113141090630389], name=4) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py index 9d0eab0b9a907..dc03370daa1e2 100644 --- a/pandas/tests/io/parser/test_compression.py +++ b/pandas/tests/io/parser/test_compression.py @@ -9,7 +9,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[True, False]) @@ -123,12 +123,13 @@ def test_infer_compression(all_parsers, csv1, buffer, ext): tm.assert_frame_equal(result, expected) -def test_compression_utf16_encoding(all_parsers, csv_dir_path): - # see gh-18071 +def test_compression_utf_encoding(all_parsers, csv_dir_path, utf_value, encoding_fmt): + # see gh-18071, gh-24130 parser = all_parsers - path = os.path.join(csv_dir_path, "utf16_ex_small.zip") + encoding = encoding_fmt.format(utf_value) + path = os.path.join(csv_dir_path, f"utf{utf_value}_ex_small.zip") - result = parser.read_csv(path, encoding="utf-16", compression="zip", sep="\t") + result = parser.read_csv(path, encoding=encoding, compression="zip", sep="\t") expected = pd.DataFrame( { "Country": ["Venezuela", "Venezuela"], diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 2a3b1dc82fc59..88b400d9a11df 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, Index -import pandas.util.testing as tm +import pandas._testing as tm def test_converters_type_must_be_dict(all_parsers): diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py index dc10352bc6460..cc65def0fd096 100644 --- a/pandas/tests/io/parser/test_dialect.py +++ b/pandas/tests/io/parser/test_dialect.py @@ -11,7 +11,7 @@ from pandas.errors import ParserWarning from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index a68d46e8a6c15..d08c86bf2ae75 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -14,7 +14,7 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timestamp, concat -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("dtype", [str, object]) @@ -79,7 +79,7 @@ def 
test_invalid_dtype_per_column(all_parsers): 3,4.5 4,5.5""" - with pytest.raises(TypeError, match='data type "foo" not understood'): + with pytest.raises(TypeError, match="data type [\"']foo[\"'] not understood"): parser.read_csv(StringIO(data), dtype={"one": "foo", 1: "int"}) diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py new file mode 100644 index 0000000000000..406e7bedfd298 --- /dev/null +++ b/pandas/tests/io/parser/test_encoding.py @@ -0,0 +1,171 @@ +""" +Tests encoding functionality during parsing +for all of the parsers defined in parsers.py +""" + +from io import BytesIO +import os + +import numpy as np +import pytest + +from pandas import DataFrame +import pandas._testing as tm + + +def test_bytes_io_input(all_parsers): + encoding = "cp1255" + parser = all_parsers + + data = BytesIO("שלום:1234\n562:123".encode(encoding)) + result = parser.read_csv(data, sep=":", encoding=encoding) + + expected = DataFrame([[562, 123]], columns=["שלום", "1234"]) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_unicode(all_parsers): + parser = all_parsers + data = BytesIO("\u0141aski, Jan;1".encode("utf-8")) + + result = parser.read_csv(data, sep=";", encoding="utf-8", header=None) + expected = DataFrame([["\u0141aski, Jan", 1]]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("sep", [",", "\t"]) +@pytest.mark.parametrize("encoding", ["utf-16", "utf-16le", "utf-16be"]) +def test_utf16_bom_skiprows(all_parsers, sep, encoding): + # see gh-2298 + parser = all_parsers + data = """skip this +skip this too +A,B,C +1,2,3 +4,5,6""".replace( + ",", sep + ) + path = "__{}__.csv".format(tm.rands(10)) + kwargs = dict(sep=sep, skiprows=2) + utf8 = "utf-8" + + with tm.ensure_clean(path) as path: + from io import TextIOWrapper + + bytes_data = data.encode(encoding) + + with open(path, "wb") as f: + f.write(bytes_data) + + bytes_buffer = BytesIO(data.encode(utf8)) + bytes_buffer = TextIOWrapper(bytes_buffer, encoding=utf8) + + result = parser.read_csv(path, encoding=encoding, **kwargs) + expected = parser.read_csv(bytes_buffer, encoding=utf8, **kwargs) + + bytes_buffer.close() + tm.assert_frame_equal(result, expected) + + +def test_utf16_example(all_parsers, csv_dir_path): + path = os.path.join(csv_dir_path, "utf16_ex.txt") + parser = all_parsers + result = parser.read_csv(path, encoding="utf-16", sep="\t") + assert len(result) == 50 + + +def test_unicode_encoding(all_parsers, csv_dir_path): + path = os.path.join(csv_dir_path, "unicode_series.csv") + parser = all_parsers + + result = parser.read_csv(path, header=None, encoding="latin-1") + result = result.set_index(0) + got = result[1][1632] + + expected = "\xc1 k\xf6ldum klaka (Cold Fever) (1994)" + assert got == expected + + +@pytest.mark.parametrize( + "data,kwargs,expected", + [ + # Basic test + ("a\n1", dict(), DataFrame({"a": [1]})), + # "Regular" quoting + ('"a"\n1', dict(quotechar='"'), DataFrame({"a": [1]})), + # Test in a data row instead of header + ("b\n1", dict(names=["a"]), DataFrame({"a": ["b", "1"]})), + # Test in empty data row with skipping + ("\n1", dict(names=["a"], skip_blank_lines=True), DataFrame({"a": [1]})), + # Test in empty data row without skipping + ( + "\n1", + dict(names=["a"], skip_blank_lines=False), + DataFrame({"a": [np.nan, 1]}), + ), + ], +) +def test_utf8_bom(all_parsers, data, kwargs, expected): + # see gh-4793 + parser = all_parsers + bom = "\ufeff" + utf8 = "utf-8" + + def _encode_data_with_bom(_data): + bom_data = (bom + 
_data).encode(utf8) + return BytesIO(bom_data) + + result = parser.read_csv(_encode_data_with_bom(data), encoding=utf8, **kwargs) + tm.assert_frame_equal(result, expected) + + +def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): + # see gh-13549 + expected = DataFrame({"mb_num": [4.8], "multibyte": ["test"]}) + parser = all_parsers + + encoding = encoding_fmt.format(utf_value) + data = "mb_num,multibyte\n4.8,test".encode(encoding) + + result = parser.read_csv(BytesIO(data), encoding=encoding) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "fname,encoding", + [ + ("test1.csv", "utf-8"), + ("unicode_series.csv", "latin-1"), + ("sauron.SHIFT_JIS.csv", "shiftjis"), + ], +) +def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): + # gh-23779: Python csv engine shouldn't error on files opened in binary. + parser = all_parsers + + fpath = os.path.join(csv_dir_path, fname) + expected = parser.read_csv(fpath, encoding=encoding) + + with open(fpath, mode="r", encoding=encoding) as fa: + result = parser.read_csv(fa) + tm.assert_frame_equal(expected, result) + + with open(fpath, mode="rb") as fb: + result = parser.read_csv(fb, encoding=encoding) + tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize("pass_encoding", [True, False]) +def test_encoding_temp_file(all_parsers, utf_value, encoding_fmt, pass_encoding): + # see gh-24130 + parser = all_parsers + encoding = encoding_fmt.format(utf_value) + + expected = DataFrame({"foo": ["bar"]}) + + with tm.ensure_clean(mode="w+", encoding=encoding, return_filelike=True) as f: + f.write("foo\nbar") + f.seek(0) + + result = parser.read_csv(f, encoding=encoding if pass_encoding else None) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 214b93b6f0628..7dc106ef0c186 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -12,7 +12,7 @@ from pandas.errors import ParserError from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_read_with_bad_header(all_parsers): diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 66e00f4eb6c1c..f67a658cadfa2 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -5,10 +5,11 @@ """ from io import StringIO +import numpy as np import pytest from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("with_header", [True, False]) @@ -172,3 +173,14 @@ def test_multi_index_naming_not_all_at_beginning(all_parsers): ), ) tm.assert_frame_equal(result, expected) + + +def test_no_multi_index_level_names_empty(all_parsers): + # GH 10984 + parser = all_parsers + midx = MultiIndex.from_tuples([("A", 1, 2), ("A", 1, 2), ("B", 1, 2)]) + expected = DataFrame(np.random.randn(3, 3), index=midx, columns=["x", "y", "z"]) + with tm.ensure_clean() as path: + expected.to_csv(path) + result = parser.read_csv(path, index_col=[0, 1, 2]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index d144421090274..5c4e642115798 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -8,7 +8,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing 
as tm +import pandas._testing as tm @pytest.mark.parametrize("kwargs", [dict(), dict(mangle_dupe_cols=True)]) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index c94adf9da0bf3..64ccaf60ec230 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm def _construct_dataframe(num_rows): diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 353d309a84823..f9a083d7f5d22 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -10,7 +10,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_string_nas(all_parsers): @@ -89,6 +89,7 @@ def test_default_na_values(all_parsers): "N/A", "n/a", "NA", + "", "#NA", "NULL", "null", diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 57e2950b06ce8..b8d66874bc660 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -11,7 +11,7 @@ import pandas.util._test_decorators as td from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import read_csv diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 36391e19a102e..b01b22e811ee3 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -20,8 +20,8 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm import pandas.io.date_converters as conv diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 5b381e43e3e19..7367b19b40dc3 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -13,7 +13,7 @@ from pandas.errors import ParserError from pandas import DataFrame, Index, MultiIndex -import pandas.util.testing as tm +import pandas._testing as tm def test_default_separator(python_parser_only): diff --git a/pandas/tests/io/parser/test_quoting.py b/pandas/tests/io/parser/test_quoting.py index 94858226d0b44..14773dfbea20e 100644 --- a/pandas/tests/io/parser/test_quoting.py +++ b/pandas/tests/io/parser/test_quoting.py @@ -11,7 +11,7 @@ from pandas.errors import ParserError from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 9ddaccc4d38b7..27aef2376e87d 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -12,7 +12,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import EmptyDataError, read_csv, read_fwf diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index d4f219d13ac53..fdccef1127c7e 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ 
-12,7 +12,7 @@ from pandas.errors import EmptyDataError from pandas import DataFrame, Index -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("skiprows", [list(range(6)), 6]) diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 75a5b7cd53ddb..8d5af85c20d33 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -12,7 +12,7 @@ from pandas._libs.parsers import TextReader from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parsers import TextFileReader, read_csv @@ -179,7 +179,7 @@ def test_header_not_enough_lines(self): assert_array_dicts_equal(recs, expected) def test_escapechar(self): - data = '\\"hello world"\n' '\\"hello world"\n' '\\"hello world"' + data = '\\"hello world"\n\\"hello world"\n\\"hello world"' reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\") result = reader.read() diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 07ab41b47bf27..267fae760398a 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -12,7 +12,7 @@ from pandas.errors import ParserError -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.parsers as parsers from pandas.io.parsers import read_csv diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index 539fdf2470c51..979eb4702cc84 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -10,7 +10,7 @@ from pandas._libs.tslib import Timestamp from pandas import DataFrame, Index -import pandas.util.testing as tm +import pandas._testing as tm _msg_validate_usecols_arg = ( "'usecols' must either be list-like " diff --git a/pandas/tests/io/pytables/conftest.py b/pandas/tests/io/pytables/conftest.py index 6164f5d0722cc..214f95c6fb441 100644 --- a/pandas/tests/io/pytables/conftest.py +++ b/pandas/tests/io/pytables/conftest.py @@ -1,6 +1,6 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/io/pytables/test_compat.py b/pandas/tests/io/pytables/test_compat.py index a82e21532eddb..c7200385aa998 100644 --- a/pandas/tests/io/pytables/test_compat.py +++ b/pandas/tests/io/pytables/test_compat.py @@ -1,8 +1,8 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.tests.io.pytables.common import ensure_clean_path -import pandas.util.testing as tm tables = pytest.importorskip("tables") diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index 91ee1061a5ef1..543940e674dba 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -7,8 +7,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.tests.io.pytables.common import ensure_clean_path, ensure_clean_store -import pandas.util.testing as tm from pandas.io.pytables import read_hdf diff --git a/pandas/tests/io/pytables/test_pytables_missing.py b/pandas/tests/io/pytables/test_pytables_missing.py index 4ceb80889c989..9adb0a6d227da 100644 --- a/pandas/tests/io/pytables/test_pytables_missing.py +++ b/pandas/tests/io/pytables/test_pytables_missing.py @@ -3,7 +3,7 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm 
+import pandas._testing as tm @td.skip_if_installed("tables") diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 3cd9d9cdd67d2..64c4ad800f49d 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -33,6 +33,7 @@ isna, timedelta_range, ) +import pandas._testing as tm from pandas.tests.io.pytables.common import ( _maybe_remove, create_tempfile, @@ -42,7 +43,6 @@ safe_remove, tables, ) -import pandas.util.testing as tm from pandas.io.pytables import ( ClosedFileError, @@ -66,8 +66,11 @@ class TestHDFStore: def test_format_kwarg_in_constructor(self, setup_path): # GH 13291 + + msg = "format is not a defined argument for HDFStore" + with ensure_clean_path(setup_path) as path: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): HDFStore(path, format="table") def test_context(self, setup_path): @@ -203,21 +206,27 @@ def test_api(self, setup_path): # Invalid. df = tm.makeDataFrame() - with pytest.raises(ValueError): + msg = "Can only append to Tables" + + with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df", append=True, format="f") - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df", append=True, format="fixed") - with pytest.raises(TypeError): + msg = r"invalid HDFStore format specified \[foo\]" + + with pytest.raises(TypeError, match=msg): df.to_hdf(path, "df", append=True, format="foo") - with pytest.raises(TypeError): - df.to_hdf(path, "df", append=False, format="bar") + with pytest.raises(TypeError, match=msg): + df.to_hdf(path, "df", append=False, format="foo") # File path doesn't exist path = "" - with pytest.raises(FileNotFoundError): + msg = f"File {path} does not exist" + + with pytest.raises(FileNotFoundError, match=msg): read_hdf(path, "df") def test_api_default_format(self, setup_path): @@ -230,7 +239,10 @@ def test_api_default_format(self, setup_path): _maybe_remove(store, "df") store.put("df", df) assert not store.get_storer("df").is_table - with pytest.raises(ValueError): + + msg = "Can only append to Tables" + + with pytest.raises(ValueError, match=msg): store.append("df2", df) pd.set_option("io.hdf.default_format", "table") @@ -251,7 +263,7 @@ def test_api_default_format(self, setup_path): df.to_hdf(path, "df") with HDFStore(path) as store: assert not store.get_storer("df").is_table - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df2", append=True) pd.set_option("io.hdf.default_format", "table") @@ -384,7 +396,10 @@ def test_versioning(self, setup_path): # this is an error because its table_type is appendable, but no # version info store.get_node("df2")._v_attrs.pandas_version = None - with pytest.raises(Exception): + + msg = "'NoneType' object has no attribute 'startswith'" + + with pytest.raises(Exception, match=msg): store.select("df2") def test_mode(self, setup_path): @@ -428,7 +443,11 @@ def check(mode): # conv read if mode in ["w"]: - with pytest.raises(ValueError): + msg = ( + "mode w is not allowed while performing a read. " + r"Allowed modes are r, r\+ and a." 
+ ) + with pytest.raises(ValueError, match=msg): read_hdf(path, "df", mode=mode) else: result = read_hdf(path, "df", mode=mode) @@ -1273,7 +1292,7 @@ def test_append_with_different_block_ordering(self, setup_path): with pytest.raises(ValueError): store.append("df", df) - # store multile additional fields in different blocks + # store multiple additional fields in different blocks df["float_3"] = Series([1.0] * len(df), dtype="float64") with pytest.raises(ValueError): store.append("df", df) @@ -3214,7 +3233,7 @@ def test_frame_select_complex(self, setup_path): tm.assert_frame_equal(result, expected) result = store.select( - "df", "(index>df.index[3] & " 'index<=df.index[6]) | string="bar"' + "df", '(index>df.index[3] & index<=df.index[6]) | string="bar"' ) expected = df.loc[ ((df.index > df.index[3]) & (df.index <= df.index[6])) diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 1acb0ac6e06d2..2bf22d982e5fe 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -7,12 +7,12 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Series, Timestamp, date_range +import pandas._testing as tm from pandas.tests.io.pytables.common import ( _maybe_remove, ensure_clean_path, ensure_clean_store, ) -import pandas.util.testing as tm def _compare_with_tz(a, b): diff --git a/pandas/tests/io/sas/test_sas.py b/pandas/tests/io/sas/test_sas.py index fcd2e0e35ad9e..5d2643c20ceb2 100644 --- a/pandas/tests/io/sas/test_sas.py +++ b/pandas/tests/io/sas/test_sas.py @@ -3,7 +3,7 @@ import pytest from pandas import read_sas -import pandas.util.testing as tm +import pandas._testing as tm class TestSas: diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 49af18d2935ef..62e9ac6929c8e 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -1,3 +1,4 @@ +from datetime import datetime import io import os from pathlib import Path @@ -9,7 +10,7 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm # https://github.com/cython/cython/issues/1720 @@ -21,9 +22,9 @@ def setup_method(self, datapath): self.data = [] self.test_ix = [list(range(1, 16)), [16]] for j in 1, 2: - fname = os.path.join(self.dirpath, "test_sas7bdat_{j}.csv".format(j=j)) + fname = os.path.join(self.dirpath, f"test_sas7bdat_{j}.csv") df = pd.read_csv(fname) - epoch = pd.datetime(1960, 1, 1) + epoch = datetime(1960, 1, 1) t1 = pd.to_timedelta(df["Column4"], unit="d") df["Column4"] = epoch + t1 t2 = pd.to_timedelta(df["Column12"], unit="d") @@ -38,7 +39,7 @@ def test_from_file(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) @@ -46,7 +47,7 @@ def test_from_buffer(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") with open(fname, "rb") as f: byts = f.read() buf = io.BytesIO(byts) @@ -61,7 +62,7 @@ def test_from_iterator(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") rdr = 
pd.read_sas(fname, iterator=True, encoding="utf-8") df = rdr.read(2) tm.assert_frame_equal(df, df0.iloc[0:2, :]) @@ -73,7 +74,7 @@ def test_path_pathlib(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = Path(os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k))) + fname = Path(os.path.join(self.dirpath, f"test{k}.sas7bdat")) df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) @@ -84,9 +85,7 @@ def test_path_localpath(self): for j in 0, 1: df0 = self.data[j] for k in self.test_ix[j]: - fname = LocalPath( - os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) - ) + fname = LocalPath(os.path.join(self.dirpath, f"test{k}.sas7bdat")) df = pd.read_sas(fname, encoding="utf-8") tm.assert_frame_equal(df, df0) @@ -95,7 +94,7 @@ def test_iterator_loop(self): for j in 0, 1: for k in self.test_ix[j]: for chunksize in 3, 5, 10, 11: - fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") rdr = pd.read_sas(fname, chunksize=10, encoding="utf-8") y = 0 for x in rdr: @@ -106,7 +105,7 @@ def test_iterator_loop(self): def test_iterator_read_too_much(self): # github #14734 k = self.test_ix[0][0] - fname = os.path.join(self.dirpath, "test{k}.sas7bdat".format(k=k)) + fname = os.path.join(self.dirpath, f"test{k}.sas7bdat") rdr = pd.read_sas(fname, format="sas7bdat", iterator=True, encoding="utf-8") d1 = rdr.read(rdr.row_count + 20) rdr.close() diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index a52b22122ba81..ee97f08ef9400 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -4,7 +4,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.sas.sasreader import read_sas diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py index 666dfd245acaa..652cacaf14ffb 100644 --- a/pandas/tests/io/test_clipboard.py +++ b/pandas/tests/io/test_clipboard.py @@ -6,15 +6,9 @@ import pandas as pd from pandas import DataFrame, get_option, read_clipboard -import pandas.util.testing as tm +import pandas._testing as tm -from pandas.io.clipboard import PyperclipException, clipboard_get, clipboard_set - -try: - DataFrame({"A": [1, 2]}).to_clipboard() - _DEPS_INSTALLED = 1 -except (PyperclipException, RuntimeError): - _DEPS_INSTALLED = 0 +from pandas.io.clipboard import clipboard_get, clipboard_set def build_kwargs(sep, excel): @@ -148,7 +142,6 @@ def test_mock_clipboard(mock_clipboard): @pytest.mark.single @pytest.mark.clipboard -@pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") @pytest.mark.usefixtures("mock_clipboard") class TestClipboard: def check_round_trip_frame(self, data, excel=None, sep=None, encoding=None): @@ -256,9 +249,7 @@ def test_round_trip_valid_encodings(self, enc, df): @pytest.mark.single @pytest.mark.clipboard -@pytest.mark.skipif(not _DEPS_INSTALLED, reason="clipboard primitives not installed") @pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑´...", "abcd..."]) -@pytest.mark.xfail(reason="flaky in CI", strict=False) def test_raw_roundtrip(data): # PR #25040 wide unicode wasn't copied correctly on PY3 on windows clipboard_set(data) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index f4efbbeda6311..22aa78919ef0f 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -12,7 +12,7 @@ import pandas.util._test_decorators as td import 
pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.common as icom @@ -142,17 +142,19 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): path = os.path.join(HERE, "data", "does_not_exist." + fn_ext) msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) - msg2 = ( - r"\[Errno 2\] No such file or directory: '.+does_not_exist" r"\.{}'" - ).format(fn_ext) + msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'" msg3 = "Expected object or value" msg4 = "path_or_buf needs to be a string file path or file-like" msg5 = ( - r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" - r" '.+does_not_exist\.{}'" - ).format(fn_ext, fn_ext) + fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist: " + fr"'.+does_not_exist\.{fn_ext}'" + ) + msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'" + msg7 = ( + fr"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'" + ) with pytest.raises( - error_class, match=r"({}|{}|{}|{}|{})".format(msg1, msg2, msg3, msg4, msg5) + error_class, match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7})" ): reader(path) @@ -179,19 +181,21 @@ def test_read_expands_user_home_dir( path = os.path.join("~", "does_not_exist." + fn_ext) monkeypatch.setattr(icom, "_expand_user", lambda x: os.path.join("foo", x)) - msg1 = r"File (b')?.+does_not_exist\.{}'? does not exist".format(fn_ext) - msg2 = ( - r"\[Errno 2\] No such file or directory:" r" '.+does_not_exist\.{}'" - ).format(fn_ext) + msg1 = fr"File (b')?.+does_not_exist\.{fn_ext}'? does not exist" + msg2 = fr"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'" msg3 = "Unexpected character found when decoding 'false'" msg4 = "path_or_buf needs to be a string file path or file-like" msg5 = ( - r"\[Errno 2\] File .+does_not_exist\.{} does not exist:" - r" '.+does_not_exist\.{}'" - ).format(fn_ext, fn_ext) + fr"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist: " + fr"'.+does_not_exist\.{fn_ext}'" + ) + msg6 = fr"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'" + msg7 = ( + fr"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'" + ) with pytest.raises( - error_class, match=r"({}|{}|{}|{}|{})".format(msg1, msg2, msg3, msg4, msg5) + error_class, match=fr"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7})" ): reader(path) diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index e17a32cbc8b68..fb81e57912dac 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -6,7 +6,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.common as icom diff --git a/pandas/tests/io/test_date_converters.py b/pandas/tests/io/test_date_converters.py index 2fa5e3b30d6af..cdb8eca02a3e5 100644 --- a/pandas/tests/io/test_date_converters.py +++ b/pandas/tests/io/test_date_converters.py @@ -2,7 +2,7 @@ import numpy as np -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.date_converters as conv diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index e06f2c31a2870..0038df78dd866 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -5,7 +5,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.feather_format import read_feather, to_feather # noqa: E402 isort:skip @@ -136,7 +136,7 
@@ def test_write_with_index(self): # column multi-index df.index = [0, 1, 2] - df.columns = (pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]),) + df.columns = pd.MultiIndex.from_tuples([("a", 1)]) self.check_error_on_write(df, ValueError) def test_path_pathlib(self): diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index ab27ea7098b08..7a5eba5264421 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -1,6 +1,9 @@ +from contextlib import ExitStack as does_not_raise from datetime import datetime import os import platform +import random +import string import numpy as np import pytest @@ -18,11 +21,6 @@ PRIVATE_KEY_JSON_PATH = None PRIVATE_KEY_JSON_CONTENTS = None -DATASET_ID = "pydata_pandas_bq_testing_py3" - -TABLE_ID = "new_test" -DESTINATION_TABLE = "{0}.{1}".format(DATASET_ID + "1", TABLE_ID) - VERSION = platform.python_version() @@ -70,6 +68,10 @@ def _get_client(): return bigquery.Client(project=project_id, credentials=credentials) +def generate_rand_str(length: int = 10) -> str: + return "".join(random.choices(string.ascii_lowercase, k=length)) + + def make_mixed_dataframe_v2(test_size): # create df to test for all BQ datatypes except RECORD bools = np.random.randint(2, size=(1, test_size)).astype(bool) @@ -149,34 +151,29 @@ def mock_read_gbq(sql, **kwargs): @pytest.mark.single class TestToGBQIntegrationWithServiceAccountKeyPath: - @classmethod - def setup_class(cls): - # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *BEFORE* - # executing *ALL* tests described below. - + @pytest.fixture() + def gbq_dataset(self): + # Setup Dataset _skip_if_no_project_id() _skip_if_no_private_key_path() - cls.client = _get_client() - cls.dataset = cls.client.dataset(DATASET_ID + "1") - try: - # Clean-up previous test runs. - cls.client.delete_dataset(cls.dataset, delete_contents=True) - except api_exceptions.NotFound: - pass # It's OK if the dataset doesn't already exist. + dataset_id = "pydata_pandas_bq_testing_" + generate_rand_str() + + self.client = _get_client() + self.dataset = self.client.dataset(dataset_id) + + # Create the dataset + self.client.create_dataset(bigquery.Dataset(self.dataset)) - cls.client.create_dataset(bigquery.Dataset(cls.dataset)) + table_name = generate_rand_str() + destination_table = f"{dataset_id}.{table_name}" + yield destination_table - @classmethod - def teardown_class(cls): - # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *AFTER* - # executing all tests. 
- cls.client.delete_dataset(cls.dataset, delete_contents=True) + # Teardown Dataset + self.client.delete_dataset(self.dataset, delete_contents=True) - def test_roundtrip(self): - destination_table = DESTINATION_TABLE + "1" + def test_roundtrip(self, gbq_dataset): + destination_table = gbq_dataset test_size = 20001 df = make_mixed_dataframe_v2(test_size) @@ -189,9 +186,50 @@ def test_roundtrip(self): ) result = pd.read_gbq( - "SELECT COUNT(*) AS num_rows FROM {0}".format(destination_table), + f"SELECT COUNT(*) AS num_rows FROM {destination_table}", project_id=_get_project_id(), credentials=_get_credentials(), dialect="standard", ) assert result["num_rows"][0] == test_size + + @pytest.mark.parametrize( + "if_exists, expected_num_rows, expectation", + [ + ("append", 300, does_not_raise()), + ("fail", 200, pytest.raises(pandas_gbq.gbq.TableCreationError)), + ("replace", 100, does_not_raise()), + ], + ) + def test_gbq_if_exists( + self, if_exists, expected_num_rows, expectation, gbq_dataset + ): + # GH 29598 + destination_table = gbq_dataset + + test_size = 200 + df = make_mixed_dataframe_v2(test_size) + + df.to_gbq( + destination_table, + _get_project_id(), + chunksize=None, + credentials=_get_credentials(), + ) + + with expectation: + df.iloc[:100].to_gbq( + destination_table, + _get_project_id(), + if_exists=if_exists, + chunksize=None, + credentials=_get_credentials(), + ) + + result = pd.read_gbq( + f"SELECT COUNT(*) AS num_rows FROM {destination_table}", + project_id=_get_project_id(), + credentials=_get_credentials(), + dialect="standard", + ) + assert result["num_rows"][0] == expected_num_rows diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 85ac56c8193a6..557a9d5c13987 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -5,8 +5,8 @@ import pytest from pandas import DataFrame, date_range, read_csv +import pandas._testing as tm from pandas.util import _test_decorators as td -import pandas.util.testing as tm from pandas.io.common import is_gcs_url diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index bc26615d1aad5..7a814ce82fd73 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -15,7 +15,7 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range, read_csv -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.common import file_path_to_url import pandas.io.html @@ -383,7 +383,15 @@ def test_thousands_macau_stats(self, datapath): assert not any(s.isna().any() for _, s in df.items()) @pytest.mark.slow - def test_thousands_macau_index_col(self, datapath): + def test_thousands_macau_index_col(self, datapath, request): + # https://github.com/pandas-dev/pandas/issues/29622 + # This tests fails for bs4 >= 4.8.0 - so handle xfail accordingly + if self.read_html.keywords.get("flavor") == "bs4" and td.safe_import( + "bs4", "4.8.0" + ): + reason = "fails for bs4 version >= 4.8.0" + request.node.add_marker(pytest.mark.xfail(reason=reason)) + all_non_nan_table_index = -2 macau_data = datapath("io", "data", "html", "macau.html") dfs = self.read_html(macau_data, index_col=0, header=0) @@ -1150,9 +1158,9 @@ def test_displayed_only(self, displayed_only, exp0, exp1): assert len(dfs) == 1 # Should not parse hidden table def test_encode(self, html_encoding_file): - _, encoding = os.path.splitext(os.path.basename(html_encoding_file))[0].split( - "_" - ) + base_path = os.path.basename(html_encoding_file) + 
root = os.path.splitext(base_path)[0] + _, encoding = root.split("_") try: with open(html_encoding_file, "rb") as fobj: @@ -1175,7 +1183,7 @@ def test_encode(self, html_encoding_file): if is_platform_windows(): if "16" in encoding or "32" in encoding: pytest.skip() - raise + raise def test_parse_failure_unseekable(self): # Issue #17975 diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 9f3ec274007d0..a1f9c6f6af51a 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -7,7 +7,7 @@ import pandas as pd from pandas import read_orc -import pandas.util.testing as tm +import pandas._testing as tm pytest.importorskip("pyarrow", minversion="0.13.0") pytest.importorskip("pyarrow.orc") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index fc3d55e110d69..d51c712ed5abd 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -10,7 +10,7 @@ import pandas.util._test_decorators as td import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.parquet import ( FastParquetImpl, @@ -443,11 +443,12 @@ def test_duplicate_columns(self, pa): self.check_error_on_write(df, pa, ValueError) def test_unsupported(self, pa): - # period - df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) - # pyarrow 0.11 raises ArrowTypeError - # older pyarrows raise ArrowInvalid - self.check_error_on_write(df, pa, Exception) + if LooseVersion(pyarrow.__version__) < LooseVersion("0.15.1.dev"): + # period - will be supported using an extension type with pyarrow 1.0 + df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) + # pyarrow 0.11 raises ArrowTypeError + # older pyarrows raise ArrowInvalid + self.check_error_on_write(df, pa, Exception) # timedelta df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)}) @@ -550,6 +551,19 @@ def test_additional_extension_arrays(self, pa): expected = df.assign(a=df.a.astype("float64")) check_round_trip(df, pa, expected=expected) + @td.skip_if_no("pyarrow", min_version="0.15.1.dev") + def test_additional_extension_types(self, pa): + # test additional ExtensionArrays that are supported through the + # __arrow_array__ protocol + by defining a custom ExtensionType + df = pd.DataFrame( + { + # Arrow does not yet support struct in writing to Parquet (ARROW-1644) + # "c": pd.arrays.IntervalArray.from_tuples([(0, 1), (1, 2), (3, 4)]), + "d": pd.period_range("2012-01-01", periods=3, freq="D"), + } + ) + check_round_trip(df, pa) + class TestParquetFastParquet(Base): @td.skip_if_no("fastparquet", min_version="0.3.2") diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 3be966edef080..3d427dde573af 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -22,10 +22,11 @@ import pytest from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian +import pandas.util._test_decorators as td import pandas as pd from pandas import Index -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import Day, MonthEnd @@ -381,12 +382,108 @@ def test_read(self, protocol, get_random_path): tm.assert_frame_equal(df, df2) -def test_unicode_decode_error(): +def test_unicode_decode_error(datapath): # pickle file written with py27, should be readable without raising # UnicodeDecodeError, see GH#28645 - path = os.path.join(os.path.dirname(__file__), "data", "pickle", "test_py27.pkl") + path = datapath("io", "data", "pickle", 
"test_py27.pkl") df = pd.read_pickle(path) # just test the columns are correct since the values are random excols = pd.Index(["a", "b", "c"]) tm.assert_index_equal(df.columns, excols) + + +# --------------------- +# tests for buffer I/O +# --------------------- + + +def test_pickle_buffer_roundtrip(): + with tm.ensure_clean() as path: + df = tm.makeDataFrame() + with open(path, "wb") as fh: + df.to_pickle(fh) + with open(path, "rb") as fh: + result = pd.read_pickle(fh) + tm.assert_frame_equal(df, result) + + +# --------------------- +# tests for URL I/O +# --------------------- + + +@pytest.mark.parametrize( + "mockurl", ["http://url.com", "ftp://test.com", "http://gzip.com"] +) +def test_pickle_generalurl_read(monkeypatch, mockurl): + def python_pickler(obj, path): + with open(path, "wb") as fh: + pickle.dump(obj, fh, protocol=-1) + + class MockReadResponse: + def __init__(self, path): + self.file = open(path, "rb") + if "gzip" in path: + self.headers = {"Content-Encoding": "gzip"} + else: + self.headers = {"Content-Encoding": None} + + def read(self): + return self.file.read() + + def close(self): + return self.file.close() + + with tm.ensure_clean() as path: + + def mock_urlopen_read(*args, **kwargs): + return MockReadResponse(path) + + df = tm.makeDataFrame() + python_pickler(df, path) + monkeypatch.setattr("urllib.request.urlopen", mock_urlopen_read) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) + + +@td.skip_if_no("gcsfs") +@pytest.mark.parametrize("mockurl", ["gs://gcs.com", "gcs://gcs.com"]) +def test_pickle_gcsurl_roundtrip(monkeypatch, mockurl): + with tm.ensure_clean() as path: + + class MockGCSFileSystem: + def __init__(self, *args, **kwargs): + pass + + def open(self, *args): + mode = args[1] or None + f = open(path, mode) + return f + + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df = tm.makeDataFrame() + df.to_pickle(mockurl) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) + + +@td.skip_if_no("s3fs") +@pytest.mark.parametrize("mockurl", ["s3://s3.com", "s3n://s3.com", "s3a://s3.com"]) +def test_pickle_s3url_roundtrip(monkeypatch, mockurl): + with tm.ensure_clean() as path: + + class MockS3FileSystem: + def __init__(self, *args, **kwargs): + pass + + def open(self, *args): + mode = args[1] or None + f = open(path, mode) + return f + + monkeypatch.setattr("s3fs.S3FileSystem", MockS3FileSystem) + df = tm.makeDataFrame() + df.to_pickle(mockurl) + result = pd.read_pickle(mockurl) + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index ccf3167d49371..013f56f83c5ec 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm pyreadstat = pytest.importorskip("pyreadstat") diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index fe65820a7c975..45b3e839a08d1 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -41,7 +41,7 @@ to_datetime, to_timedelta, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.io.sql as sql from pandas.io.sql import read_sql_query, read_sql_table @@ -215,9 +215,7 @@ def teardown_method(self, method): class MySQLMixIn(MixInBase): def drop_table(self, table_name): cur = self.conn.cursor() - cur.execute( - "DROP TABLE IF EXISTS {}".format(sql._get_valid_mysql_name(table_name)) - ) + cur.execute(f"DROP TABLE IF EXISTS 
{sql._get_valid_mysql_name(table_name)}") self.conn.commit() def _get_all_tables(self): @@ -237,7 +235,7 @@ def _close_conn(self): class SQLiteMixIn(MixInBase): def drop_table(self, table_name): self.conn.execute( - "DROP TABLE IF EXISTS {}".format(sql._get_valid_sqlite_name(table_name)) + f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}" ) self.conn.commit() @@ -405,11 +403,7 @@ def _load_raw_sql(self): def _count_rows(self, table_name): result = ( self._get_exec() - .execute( - "SELECT count(*) AS count_1 FROM {table_name}".format( - table_name=table_name - ) - ) + .execute(f"SELECT count(*) AS count_1 FROM {table_name}") .fetchone() ) return result[0] @@ -1207,7 +1201,7 @@ def _get_sqlite_column_type(self, schema, column): for col in schema.split("\n"): if col.split()[0].strip('""') == column: return col.split()[1] - raise ValueError("Column {column} not found".format(column=column)) + raise ValueError(f"Column {column} not found") def test_sqlite_type_mapping(self): @@ -1272,7 +1266,7 @@ def setup_connect(self): # to test if connection can be made: self.conn.connect() except sqlalchemy.exc.OperationalError: - pytest.skip("Can't connect to {0} server".format(self.flavor)) + pytest.skip(f"Can't connect to {self.flavor} server") def test_read_sql(self): self._read_sql_iris() @@ -1414,7 +1408,7 @@ def check(col): else: raise AssertionError( - "DateCol loaded with incorrect type -> {0}".format(col.dtype) + f"DateCol loaded with incorrect type -> {col.dtype}" ) # GH11216 @@ -2051,15 +2045,13 @@ def psql_insert_copy(table, conn, keys, data_iter): writer.writerows(data_iter) s_buf.seek(0) - columns = ", ".join('"{}"'.format(k) for k in keys) + columns = ", ".join(f'"{k}"' for k in keys) if table.schema: - table_name = "{}.{}".format(table.schema, table.name) + table_name = f"{table.schema}.{table.name}" else: table_name = table.name - sql_query = "COPY {} ({}) FROM STDIN WITH CSV".format( - table_name, columns - ) + sql_query = f"COPY {table_name} ({columns}) FROM STDIN WITH CSV" cur.copy_expert(sql=sql_query, file=s_buf) expected = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]}) @@ -2199,14 +2191,12 @@ def test_datetime_time(self): def _get_index_columns(self, tbl_name): ixs = sql.read_sql_query( "SELECT * FROM sqlite_master WHERE type = 'index' " - + "AND tbl_name = '{tbl_name}'".format(tbl_name=tbl_name), + + f"AND tbl_name = '{tbl_name}'", self.conn, ) ix_cols = [] for ix_name in ixs.name: - ix_info = sql.read_sql_query( - "PRAGMA index_info({ix_name})".format(ix_name=ix_name), self.conn - ) + ix_info = sql.read_sql_query(f"PRAGMA index_info({ix_name})", self.conn) ix_cols.append(ix_info.name.tolist()) return ix_cols @@ -2217,15 +2207,11 @@ def test_transactions(self): self._transaction_test() def _get_sqlite_column_type(self, table, column): - recs = self.conn.execute("PRAGMA table_info({table})".format(table=table)) + recs = self.conn.execute(f"PRAGMA table_info({table})") for cid, name, ctype, not_null, default, pk in recs: if name == column: return ctype - raise ValueError( - "Table {table}, column {column} not found".format( - table=table, column=column - ) - ) + raise ValueError(f"Table {table}, column {column} not found") def test_dtype(self): if self.flavor == "mysql": @@ -2295,7 +2281,7 @@ def test_illegal_names(self): sql.table_exists(weird_name, self.conn) df2 = DataFrame([[1, 2], [3, 4]], columns=["a", weird_name]) - c_tbl = "test_weird_col_name{ndx:d}".format(ndx=ndx) + c_tbl = f"test_weird_col_name{ndx:d}" df2.to_sql(c_tbl, self.conn) 
sql.table_exists(c_tbl, self.conn) @@ -2500,7 +2486,7 @@ def test_if_exists(self): df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) table_name = "table_if_exists" - sql_select = "SELECT * FROM {table_name}".format(table_name=table_name) + sql_select = f"SELECT * FROM {table_name}" def clean_up(test_table_to_drop): """ @@ -2788,7 +2774,7 @@ def test_if_exists(self): df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]}) df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]}) table_name = "table_if_exists" - sql_select = "SELECT * FROM {table_name}".format(table_name=table_name) + sql_select = f"SELECT * FROM {table_name}" def clean_up(test_table_to_drop): """ diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index cbc5ebd986c15..edb766a67af89 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -12,8 +12,8 @@ from pandas.core.dtypes.common import is_categorical_dtype import pandas as pd +import pandas._testing as tm from pandas.core.frame import DataFrame, Series -import pandas.util.testing as tm from pandas.io.parsers import read_csv from pandas.io.stata import ( @@ -21,10 +21,22 @@ PossiblePrecisionLoss, StataMissingValue, StataReader, + StataWriterUTF8, read_stata, ) +@pytest.fixture() +def mixed_frame(): + return pd.DataFrame( + { + "a": [1, 2, 3, 4], + "b": [1.0, 3.0, 27.0, 81.0], + "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"], + } + ) + + @pytest.fixture def dirpath(datapath): return datapath("io", "data", "stata") @@ -111,7 +123,7 @@ def read_dta(self, file): def read_csv(self, file): return read_csv(file, parse_dates=True) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_empty_dta(self, version): empty_ds = DataFrame(columns=["unit"]) # GH 7369, make sure can read a 0-obs dta file @@ -331,7 +343,7 @@ def test_write_dta6(self): check_index_type=False, ) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_write_dta10(self, version): original = DataFrame( data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]], @@ -367,7 +379,7 @@ def test_write_preserves_original(self): df.to_stata(path, write_index=False) tm.assert_frame_equal(df, df_copy) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_encoding(self, version): # GH 4626, proper encoding handling @@ -408,7 +420,7 @@ def test_read_write_dta11(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_read_write_dta12(self, version): original = DataFrame( [(1, 2, 3, 4, 5, 6)], @@ -460,7 +472,7 @@ def test_read_write_dta13(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), formatted) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @pytest.mark.parametrize( "file", ["dta14_113", "dta14_114", "dta14_115", "dta14_117"] ) @@ -503,7 +515,7 @@ def test_read_write_reread_dta15(self, file): tm.assert_frame_equal(expected, parsed) - @pytest.mark.parametrize("version", [114, 117]) + 
@pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_timestamp_and_label(self, version): original = DataFrame([(1,)], columns=["variable"]) time_stamp = datetime(2000, 2, 29, 14, 21) @@ -517,7 +529,7 @@ def test_timestamp_and_label(self, version): assert reader.time_stamp == "29 Feb 2000 14:21" assert reader.data_label == data_label - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_invalid_timestamp(self, version): original = DataFrame([(1,)], columns=["variable"]) time_stamp = "01 Jan 2000, 00:00:00" @@ -541,7 +553,7 @@ def test_numeric_column_names(self): written_and_read_again.columns = map(convert_col_name, columns) tm.assert_frame_equal(original, written_and_read_again) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_nan_to_missing_value(self, version): s1 = Series(np.arange(4.0), dtype=np.float32) s2 = Series(np.arange(4.0), dtype=np.float64) @@ -661,7 +673,7 @@ def test_write_missing_strings(self): written_and_read_again = self.read_dta(path) tm.assert_frame_equal(written_and_read_again.set_index("index"), expected) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @pytest.mark.parametrize("byteorder", [">", "<"]) def test_bool_uint(self, byteorder, version): s0 = Series([0, 1, True], dtype=np.bool) @@ -736,10 +748,10 @@ def test_excessively_long_string(self): ) original = DataFrame(s) msg = ( - r"Fixed width strings in Stata \.dta files are limited to 244" - r" \(or fewer\)\ncharacters\. Column 's500' does not satisfy" - r" this restriction\. Use the\n'version=117' parameter to write" - r" the newer \(Stata 13 and later\) format\." + r"Fixed width strings in Stata \.dta files are limited to 244 " + r"\(or fewer\)\ncharacters\. Column 's500' does not satisfy " + r"this restriction\. Use the\n'version=117' parameter to write " + r"the newer \(Stata 13 and later\) format\." ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: @@ -907,7 +919,7 @@ def test_drop_column(self): columns = ["byte_", "int_", "long_", "not_found"] read_stata(self.dta15_117, convert_dates=True, columns=columns) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) @pytest.mark.filterwarnings( "ignore:\\nStata value:pandas.io.stata.ValueLabelTypeMismatch" ) @@ -967,8 +979,8 @@ def test_categorical_warnings_and_errors(self): ) with tm.ensure_clean() as path: msg = ( - "Stata value labels for a single variable must have" - r" a combined length less than 32,000 characters\." + "Stata value labels for a single variable must have " + r"a combined length less than 32,000 characters\." 
) with pytest.raises(ValueError, match=msg): original.to_stata(path) @@ -984,7 +996,7 @@ def test_categorical_warnings_and_errors(self): original.to_stata(path) # should get a warning for mixed content - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_categorical_with_stata_missing_values(self, version): values = [["a" + str(i)] for i in range(120)] values.append([np.nan]) @@ -1220,20 +1232,13 @@ def test_read_chunks_columns(self): tm.assert_frame_equal(from_frame, chunk, check_dtype=False) pos += chunksize - @pytest.mark.parametrize("version", [114, 117]) - def test_write_variable_labels(self, version): + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_write_variable_labels(self, version, mixed_frame): # GH 13631, add support for writing variable labels - original = pd.DataFrame( - { - "a": [1, 2, 3, 4], - "b": [1.0, 3.0, 27.0, 81.0], - "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"], - } - ) - original.index.name = "index" + mixed_frame.index.name = "index" variable_labels = {"a": "City Rank", "b": "City Exponent", "c": "City"} with tm.ensure_clean() as path: - original.to_stata(path, variable_labels=variable_labels, version=version) + mixed_frame.to_stata(path, variable_labels=variable_labels, version=version) with StataReader(path) as sr: read_labels = sr.variable_labels() expected_labels = { @@ -1246,48 +1251,36 @@ def test_write_variable_labels(self, version): variable_labels["index"] = "The Index" with tm.ensure_clean() as path: - original.to_stata(path, variable_labels=variable_labels, version=version) + mixed_frame.to_stata(path, variable_labels=variable_labels, version=version) with StataReader(path) as sr: read_labels = sr.variable_labels() assert read_labels == variable_labels - @pytest.mark.parametrize("version", [114, 117]) - def test_invalid_variable_labels(self, version): - original = pd.DataFrame( - { - "a": [1, 2, 3, 4], - "b": [1.0, 3.0, 27.0, 81.0], - "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"], - } - ) - original.index.name = "index" + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_invalid_variable_labels(self, version, mixed_frame): + mixed_frame.index.name = "index" variable_labels = {"a": "very long" * 10, "b": "City Exponent", "c": "City"} with tm.ensure_clean() as path: msg = "Variable labels must be 80 characters or fewer" with pytest.raises(ValueError, match=msg): - original.to_stata( + mixed_frame.to_stata( path, variable_labels=variable_labels, version=version ) + @pytest.mark.parametrize("version", [114, 117]) + def test_invalid_variable_label_encoding(self, version, mixed_frame): + mixed_frame.index.name = "index" + variable_labels = {"a": "very long" * 10, "b": "City Exponent", "c": "City"} variable_labels["a"] = "invalid character Œ" with tm.ensure_clean() as path: - msg = ( - "Variable labels must contain only characters that can be" - " encoded in Latin-1" - ) - with pytest.raises(ValueError, match=msg): - original.to_stata( + with pytest.raises( + ValueError, match="Variable labels must contain only characters" + ): + mixed_frame.to_stata( path, variable_labels=variable_labels, version=version ) - def test_write_variable_label_errors(self): - original = pd.DataFrame( - { - "a": [1, 2, 3, 4], - "b": [1.0, 3.0, 27.0, 81.0], - "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"], - } - ) + def test_write_variable_label_errors(self, mixed_frame): values = ["\u03A1", "\u0391", "\u039D", "\u0394", "\u0391", 
"\u03A3"] variable_labels_utf8 = { @@ -1297,12 +1290,12 @@ def test_write_variable_label_errors(self): } msg = ( - "Variable labels must contain only characters that can be" - " encoded in Latin-1" + "Variable labels must contain only characters that can be " + "encoded in Latin-1" ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: - original.to_stata(path, variable_labels=variable_labels_utf8) + mixed_frame.to_stata(path, variable_labels=variable_labels_utf8) variable_labels_long = { "a": "City Rank", @@ -1315,7 +1308,7 @@ def test_write_variable_label_errors(self): msg = "Variable labels must be 80 characters or fewer" with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: - original.to_stata(path, variable_labels=variable_labels_long) + mixed_frame.to_stata(path, variable_labels=variable_labels_long) def test_default_date_conversion(self): # GH 12259 @@ -1425,8 +1418,8 @@ def test_out_of_range_double(self): } ) msg = ( - r"Column ColumnTooBig has a maximum value \(.+\)" - r" outside the range supported by Stata \(.+\)" + r"Column ColumnTooBig has a maximum value \(.+\) outside the range " + r"supported by Stata \(.+\)" ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: @@ -1434,8 +1427,8 @@ def test_out_of_range_double(self): df.loc[2, "ColumnTooBig"] = np.inf msg = ( - "Column ColumnTooBig has a maximum value of infinity which" - " is outside the range supported by Stata" + "Column ColumnTooBig has a maximum value of infinity which is outside " + "the range supported by Stata" ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: @@ -1468,8 +1461,8 @@ def test_out_of_range_float(self): original.loc[2, "ColumnTooBig"] = np.inf msg = ( - "Column ColumnTooBig has a maximum value of infinity which" - " is outside the range supported by Stata" + "Column ColumnTooBig has a maximum value of infinity which " + "is outside the range supported by Stata" ) with pytest.raises(ValueError, match=msg): with tm.ensure_clean() as path: @@ -1637,7 +1630,7 @@ def test_invalid_date_conversion(self): with pytest.raises(ValueError, match=msg): original.to_stata(path, convert_dates={"wrong_name": "tc"}) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_nonfile_writing(self, version): # GH 21041 bio = io.BytesIO() @@ -1700,35 +1693,27 @@ def test_mixed_string_strl(self): expected = output.fillna("") tm.assert_frame_equal(reread, expected) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_all_none_exception(self, version): output = [{"none": "none", "number": 0}, {"none": None, "number": 1}] output = pd.DataFrame(output) output.loc[:, "none"] = None with tm.ensure_clean() as path: - msg = ( - r"Column `none` cannot be exported\.\n\n" - "Only string-like object arrays containing all strings or a" - r" mix of strings and None can be exported\. Object arrays" - r" containing only null values are prohibited\. Other" - " object typescannot be exported and must first be" - r" converted to one of the supported types\." 
- ) - with pytest.raises(ValueError, match=msg): + with pytest.raises(ValueError, match="Column `none` cannot be exported"): output.to_stata(path, version=version) - @pytest.mark.parametrize("version", [114, 117]) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) def test_invalid_file_not_written(self, version): content = "Here is one __�__ Another one __·__ Another one __½__" df = DataFrame([content], columns=["invalid"]) with tm.ensure_clean() as path: msg1 = ( - r"'latin-1' codec can't encode character '\\ufffd'" - r" in position 14: ordinal not in range\(256\)" + r"'latin-1' codec can't encode character '\\ufffd' " + r"in position 14: ordinal not in range\(256\)" ) msg2 = ( - "'ascii' codec can't decode byte 0xef in position 14:" - r" ordinal not in range\(128\)" + "'ascii' codec can't decode byte 0xef in position 14: " + r"ordinal not in range\(128\)" ) with pytest.raises(UnicodeEncodeError, match=r"{}|{}".format(msg1, msg2)): with tm.assert_produces_warning(ResourceWarning): @@ -1778,3 +1763,56 @@ def test_stata_119(self): assert df.iloc[0, 7] == 3.14 assert df.iloc[0, -1] == 1 assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21)) + + @pytest.mark.parametrize("version", [118, 119, None]) + def test_utf8_writer(self, version): + cat = pd.Categorical(["a", "β", "ĉ"], ordered=True) + data = pd.DataFrame( + [ + [1.0, 1, "ᴬ", "ᴀ relatively long ŝtring"], + [2.0, 2, "ᴮ", ""], + [3.0, 3, "ᴰ", None], + ], + columns=["a", "β", "ĉ", "strls"], + ) + data["ᴐᴬᵀ"] = cat + variable_labels = { + "a": "apple", + "β": "ᵈᵉᵊ", + "ĉ": "ᴎტჄႲႳႴႶႺ", + "strls": "Long Strings", + "ᴐᴬᵀ": "", + } + data_label = "ᴅaᵀa-label" + data["β"] = data["β"].astype(np.int32) + with tm.ensure_clean() as path: + writer = StataWriterUTF8( + path, + data, + data_label=data_label, + convert_strl=["strls"], + variable_labels=variable_labels, + write_index=False, + version=version, + ) + writer.write_file() + reread_encoded = read_stata(path) + # Missing is intentionally converted to empty strl + data["strls"] = data["strls"].fillna("") + tm.assert_frame_equal(data, reread_encoded) + reader = StataReader(path) + assert reader.data_label == data_label + assert reader.variable_labels() == variable_labels + + data.to_stata(path, version=version, write_index=False) + reread_to_stata = read_stata(path) + tm.assert_frame_equal(data, reread_to_stata) + + def test_writer_118_exceptions(self): + df = DataFrame(np.zeros((1, 33000), dtype=np.int8)) + with tm.ensure_clean() as path: + with pytest.raises(ValueError, match="version must be either 118 or 119."): + StataWriterUTF8(path, df, version=117) + with tm.ensure_clean() as path: + with pytest.raises(ValueError, match="You must use version 119"): + StataWriterUTF8(path, df, version=118) diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 86cb7fc57b225..a604d90acc854 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # coding: utf-8 import os @@ -14,7 +14,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm """ diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py index 116d924f5a596..8ee279f0e1f38 100644 --- a/pandas/tests/plotting/test_boxplot_method.py +++ b/pandas/tests/plotting/test_boxplot_method.py @@ -10,8 +10,8 @@ import pandas.util._test_decorators as td from pandas import DataFrame, MultiIndex, 
Series, date_range, timedelta_range +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm import pandas.plotting as plotting diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index 71a186dc2f3b0..9cd3ccbf9214e 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -10,7 +10,7 @@ from pandas.compat.numpy import np_datetime64_compat from pandas import Index, Period, Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm from pandas.plotting import ( deregister_matplotlib_converters, @@ -66,11 +66,10 @@ def test_registering_no_warning(self): # Set to the "warn" state, in case this isn't the first test run register_matplotlib_converters() - with tm.assert_produces_warning(None) as w: + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + # GH#30588 DeprecationWarning from 2D indexing ax.plot(s.index, s.values) - assert len(w) == 0 - def test_pandas_plots_register(self): pytest.importorskip("matplotlib.pyplot") s = Series(range(12), index=date_range("2017", periods=12)) @@ -101,19 +100,16 @@ def test_option_no_warning(self): # Test without registering first, no warning with ctx: - with tm.assert_produces_warning(None) as w: + # GH#30588 DeprecationWarning from 2D indexing on Index + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): ax.plot(s.index, s.values) - assert len(w) == 0 - # Now test with registering register_matplotlib_converters() with ctx: - with tm.assert_produces_warning(None) as w: + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): ax.plot(s.index, s.values) - assert len(w) == 0 - def test_registry_resets(self): units = pytest.importorskip("matplotlib.units") dates = pytest.importorskip("matplotlib.dates") diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 8456f095e5868..fb86b600d3d3c 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -9,12 +9,12 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Index, NaT, Series, isna +import pandas._testing as tm from pandas.core.indexes.datetimes import bdate_range, date_range from pandas.core.indexes.period import Period, PeriodIndex, period_range from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.resample import DatetimeIndex from pandas.tests.plotting.common import TestPlotBase -import pandas.util.testing as tm from pandas.tseries.offsets import DateOffset @@ -121,8 +121,8 @@ def test_both_style_and_color(self): ts = tm.makeTimeSeries() msg = ( "Cannot pass 'style' string with a color symbol and 'color' " - "keyword argument. Please use one or the other or pass 'style'" - " without a color symbol" + "keyword argument. 
Please use one or the other or pass 'style' " + "without a color symbol" ) with pytest.raises(ValueError, match=msg): ts.plot(style="b-", color="#000099") diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index fd66888fc30e4..1c429bafa9a19 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -17,9 +17,9 @@ import pandas as pd from pandas import DataFrame, MultiIndex, PeriodIndex, Series, bdate_range, date_range +import pandas._testing as tm from pandas.core.arrays import integer_array from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm from pandas.io.formats.printing import pprint_thing import pandas.plotting as plotting @@ -555,14 +555,14 @@ def test_subplots_timeseries_y_axis_not_supported(self): period: since period isn't yet implemented in ``select_dtypes`` and because it will need a custom value converter + - tick formater (as was done for x-axis plots) + tick formatter (as was done for x-axis plots) categorical: because it will need a custom value converter + - tick formater (also doesn't work for x-axis, as of now) + tick formatter (also doesn't work for x-axis, as of now) datetime_mixed_tz: - because of the way how pandas handels ``Series`` of + because of the way how pandas handles ``Series`` of ``datetime`` objects with different timezone, generally converting ``datetime`` objects in a tz-aware form could help with this problem @@ -1162,6 +1162,36 @@ def test_plot_scatter(self): axes = df.plot(x="x", y="y", kind="scatter", subplots=True) self._check_axes_shape(axes, axes_num=1, layout=(1, 1)) + def test_raise_error_on_datetime_time_data(self): + # GH 8113, datetime.time type is not supported by matplotlib in scatter + df = pd.DataFrame(np.random.randn(10), columns=["a"]) + df["dtime"] = pd.date_range(start="2014-01-01", freq="h", periods=10).time + msg = "must be a string or a number, not 'datetime.time'" + + with pytest.raises(TypeError, match=msg): + df.plot(kind="scatter", x="dtime", y="a") + + def test_scatterplot_datetime_data(self): + # GH 30391 + dates = pd.date_range(start=date(2019, 1, 1), periods=12, freq="W") + vals = np.random.normal(0, 1, len(dates)) + df = pd.DataFrame({"dates": dates, "vals": vals}) + + _check_plot_works(df.plot.scatter, x="dates", y="vals") + _check_plot_works(df.plot.scatter, x=0, y=1) + + def test_scatterplot_object_data(self): + # GH 18755 + df = pd.DataFrame(dict(a=["A", "B", "C"], b=[2, 3, 4])) + + _check_plot_works(df.plot.scatter, x="a", y="b") + _check_plot_works(df.plot.scatter, x=0, y=1) + + df = pd.DataFrame(dict(a=["A", "B", "C"], b=["a", "b", "c"])) + + _check_plot_works(df.plot.scatter, x="a", y="b") + _check_plot_works(df.plot.scatter, x=0, y=1) + @pytest.mark.slow def test_if_scatterplot_colorbar_affects_xaxis_visibility(self): # addressing issue #10611, to ensure colobar does not @@ -1216,24 +1246,15 @@ def test_if_scatterplot_colorbars_are_next_to_parent_axes(self): colorbar_distance = axes_x_coords[3, :] - axes_x_coords[2, :] assert np.isclose(parent_distance, colorbar_distance, atol=1e-7).all() + @pytest.mark.parametrize("x, y", [("x", "y"), ("y", "x"), ("y", "y")]) @pytest.mark.slow - def test_plot_scatter_with_categorical_data(self): - # GH 16199 + def test_plot_scatter_with_categorical_data(self, x, y): + # after fixing GH 18755, should be able to plot categorical data df = pd.DataFrame( {"x": [1, 2, 3, 4], "y": pd.Categorical(["a", "b", "a", "c"])} ) - with pytest.raises(ValueError) as ve: - 
df.plot(x="x", y="y", kind="scatter") - ve.match("requires y column to be numeric") - - with pytest.raises(ValueError) as ve: - df.plot(x="y", y="x", kind="scatter") - ve.match("requires x column to be numeric") - - with pytest.raises(ValueError) as ve: - df.plot(x="y", y="y", kind="scatter") - ve.match("requires x column to be numeric") + _check_plot_works(df.plot.scatter, x=x, y=y) @pytest.mark.slow def test_plot_scatter_with_c(self): @@ -3250,6 +3271,34 @@ def test_plot_no_numeric_data(self): with pytest.raises(TypeError): df.plot() + def test_missing_markers_legend(self): + # 14958 + df = pd.DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"]) + ax = df.plot(y=["A"], marker="x", linestyle="solid") + df.plot(y=["B"], marker="o", linestyle="dotted", ax=ax) + df.plot(y=["C"], marker="<", linestyle="dotted", ax=ax) + + self._check_legend_labels(ax, labels=["A", "B", "C"]) + self._check_legend_marker(ax, expected_markers=["x", "o", "<"]) + + def test_missing_markers_legend_using_style(self): + # 14563 + df = pd.DataFrame( + { + "A": [1, 2, 3, 4, 5, 6], + "B": [2, 4, 1, 3, 2, 4], + "C": [3, 3, 2, 6, 4, 2], + "X": [1, 2, 3, 4, 5, 6], + } + ) + + fig, ax = self.plt.subplots() + for kind in "ABC": + df.plot("X", kind, label=kind, ax=ax, style=".") + + self._check_legend_labels(ax, labels=["A", "B", "C"]) + self._check_legend_marker(ax, expected_markers=[".", ".", "."]) + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py index bb1747710fe18..8fec4bb134cb4 100644 --- a/pandas/tests/plotting/test_groupby.py +++ b/pandas/tests/plotting/test_groupby.py @@ -8,8 +8,8 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Series +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase -import pandas.util.testing as tm @td.skip_if_no_mpl diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 6c1c7dfd1a4a4..50ebbc22f2739 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -9,8 +9,8 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Series +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm @td.skip_if_no_mpl @@ -253,6 +253,24 @@ def test_tight_layout(self): tm.close() + def test_hist_subplot_xrot(self): + # GH 30288 + df = DataFrame( + { + "length": [1.5, 0.5, 1.2, 0.9, 3], + "animal": ["pig", "rabbit", "pig", "pig", "rabbit"], + } + ) + axes = _check_plot_works( + df.hist, + filterwarnings="always", + column="length", + by="animal", + bins=5, + xrot=0, + ) + self._check_ticks_props(axes, xrot=0) + @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index eadcc12d8428c..228c84528e882 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -10,8 +10,8 @@ import pandas.util._test_decorators as td from pandas import DataFrame, Series +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm import pandas.plotting as plotting @@ -319,8 +319,8 @@ def test_subplot_titles(self, iris): # Case len(title) > len(df) msg = ( - "The length of `title` must equal the number of columns if" - " using `title` of type `list` and `subplots=True`" 
+ "The length of `title` must equal the number of columns if " + "using `title` of type `list` and `subplots=True`" ) with pytest.raises(ValueError, match=msg): df.plot(subplots=True, title=title + ["kittens > puppies"]) @@ -331,8 +331,8 @@ def test_subplot_titles(self, iris): # Case subplots=False and title is of type list msg = ( - "Using `title` of type `list` is not supported unless" - " `subplots=True` is passed" + "Using `title` of type `list` is not supported unless " + "`subplots=True` is passed" ) with pytest.raises(ValueError, match=msg): df.plot(subplots=False, title=title) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 61722d726b28b..8463f30bee8f0 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -14,8 +14,8 @@ import pandas as pd from pandas import DataFrame, Series, date_range +import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works -import pandas.util.testing as tm import pandas.plotting as plotting diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index d66472b1c2054..7400b049961d5 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -21,8 +21,8 @@ timedelta_range, to_timedelta, ) +import pandas._testing as tm from pandas.core import nanops -import pandas.util.testing as tm def get_objs(): diff --git a/pandas/tests/reductions/test_stat_reductions.py b/pandas/tests/reductions/test_stat_reductions.py index 432811b5a8264..59dbcb9ab9fa0 100644 --- a/pandas/tests/reductions/test_stat_reductions.py +++ b/pandas/tests/reductions/test_stat_reductions.py @@ -10,8 +10,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray -import pandas.util.testing as tm class TestDatetimeLikeStatReductions: diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 02203f476af8e..f8a1810e66219 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -5,12 +5,12 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.groupby.groupby import DataError from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import PeriodIndex, period_range from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range -import pandas.util.testing as tm # a fixture value can be overridden by the test parameter value. 
Note that the # value of the fixture can be overridden this way even if the test doesn't use @@ -84,8 +84,8 @@ def test_raises_on_non_datetimelike_index(): # this is a non datetimelike index xp = DataFrame() msg = ( - "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," - " but got an instance of 'Index'" + "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, " + "but got an instance of 'Index'" ) with pytest.raises(TypeError, match=msg): xp.resample("A").mean() diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 5837d526e3978..4860329718f54 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -10,11 +10,11 @@ import pandas as pd from pandas import DataFrame, Series, Timedelta, Timestamp, isna, notna +import pandas._testing as tm from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, period_range from pandas.core.resample import DatetimeIndex, _get_timestamp_range_edges -import pandas.util.testing as tm import pandas.tseries.offsets as offsets from pandas.tseries.offsets import BDay, Minute @@ -1564,3 +1564,20 @@ def test_get_timestamp_range_edges(first, last, offset, exp_first, exp_last): result = _get_timestamp_range_edges(first, last, offset) expected = (exp_first, exp_last) assert result == expected + + +def test_resample_apply_product(): + # GH 5586 + index = date_range(start="2012-01-31", freq="M", periods=12) + + ts = Series(range(12), index=index) + df = DataFrame(dict(A=ts, B=ts + 2)) + result = df.resample("Q").apply(np.product) + expected = DataFrame( + np.array([[0, 24], [60, 210], [336, 720], [990, 1716]], dtype=np.int64), + index=DatetimeIndex( + ["2012-03-31", "2012-06-30", "2012-09-30", "2012-12-31"], freq="Q-DEC" + ), + columns=["A", "B"], + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py index 0ed0bf18a82ee..955f8c7482937 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -10,11 +10,11 @@ import pandas as pd from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, PeriodIndex, period_range from pandas.core.resample import _get_period_range_edges -import pandas.util.testing as tm import pandas.tseries.offsets as offsets @@ -82,9 +82,9 @@ def test_selection(self, index, freq, kind, kwargs): index=pd.MultiIndex.from_arrays([rng, index], names=["v", "d"]), ) msg = ( - "Resampling from level= or on= selection with a PeriodIndex is" - r" not currently supported, use \.set_index\(\.\.\.\) to" - " explicitly set index" + "Resampling from level= or on= selection with a PeriodIndex is " + r"not currently supported, use \.set_index\(\.\.\.\) to " + "explicitly set index" ) with pytest.raises(NotImplementedError, match=msg): df.resample(freq, kind=kind, **kwargs) @@ -130,8 +130,8 @@ def test_not_subperiod(self, simple_period_range_series, rule, expected_error_ms # These are incompatible period rules for resampling ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="w-wed") msg = ( - "Frequency cannot be resampled to {}, as they" - " are not sub or super periods" + "Frequency cannot be resampled to {}, as they " + "are not 
sub or super periods" ).format(expected_error_msg) with pytest.raises(IncompatibleFrequency, match=msg): ts.resample(rule).mean() @@ -236,8 +236,8 @@ def test_resample_same_freq(self, resample_method): def test_resample_incompat_freq(self): msg = ( - "Frequency cannot be resampled to ," - " as they are not sub or super periods" + "Frequency cannot be resampled to , " + "as they are not sub or super periods" ) with pytest.raises(IncompatibleFrequency, match=msg): Series( diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 7852afcdbfea9..d552241f9126f 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm dti = date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="Min") @@ -257,8 +257,8 @@ def test_fillna(): tm.assert_series_equal(result, expected) msg = ( - r"Invalid fill method\. Expecting pad \(ffill\), backfill" - r" \(bfill\) or nearest\. Got 0" + r"Invalid fill method\. Expecting pad \(ffill\), backfill " + r"\(bfill\) or nearest\. Got 0" ) with pytest.raises(ValueError, match=msg): r.fillna(0) @@ -519,8 +519,8 @@ def test_selection_api_validation(): # non DatetimeIndex msg = ( - "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex," - " but got an instance of 'Int64Index'" + "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, " + "but got an instance of 'Int64Index'" ) with pytest.raises(TypeError, match=msg): df.resample("2D", level="v") @@ -539,8 +539,8 @@ def test_selection_api_validation(): # upsampling not allowed msg = ( - "Upsampling from level= or on= selection is not supported, use" - r" \.set_index\(\.\.\.\) to explicitly set index to datetime-like" + "Upsampling from level= or on= selection is not supported, use " + r"\.set_index\(\.\.\.\) to explicitly set index to datetime-like" ) with pytest.raises(ValueError, match=msg): df.resample("2D", level="d").asfreq() diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 7efc6b0d466b9..4e3585c0be884 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -2,10 +2,12 @@ import numpy as np +from pandas.util._test_decorators import async_mark + import pandas as pd from pandas import DataFrame, Series, Timestamp +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm test_frame = DataFrame( {"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}, @@ -13,17 +15,18 @@ ) -def test_tab_complete_ipython6_warning(ip): +@async_mark() +async def test_tab_complete_ipython6_warning(ip): from IPython.core.completer import provisionalcompleter code = dedent( """\ - import pandas.util.testing as tm + import pandas._testing as tm s = tm.makeTimeSeries() rs = s.resample("D") """ ) - ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 4c27d48cff6fd..3aa7765954634 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -6,9 +6,9 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm 
from pandas.core.groupby.grouper import Grouper from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm test_series = Series(np.random.randn(1000), index=date_range("1/1/2000", periods=1000)) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 7a6ebf826ca4d..d1bcdc55cb509 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -4,8 +4,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.indexes.timedeltas import timedelta_range -import pandas.util.testing as tm def test_asfreq_bug(): diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index e477b7608ab93..7020d373caf82 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, merge +import pandas._testing as tm from pandas.tests.reshape.merge.test_merge import NGROUPS, N, get_test_data -import pandas.util.testing as tm a_ = np.array @@ -212,8 +212,8 @@ def test_join_on(self): source_copy = source.copy() source_copy["A"] = 0 msg = ( - "You are trying to merge on float64 and object columns. If" - " you wish to proceed you should use pd.concat" + "You are trying to merge on float64 and object columns. If " + "you wish to proceed you should use pd.concat" ) with pytest.raises(ValueError, match=msg): target.join(source_copy, on="A") @@ -226,9 +226,7 @@ def test_join_on_fails_with_different_right_index(self): {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}, index=tm.makeCustomIndex(10, 2), ) - msg = ( - r"len\(left_on\) must equal the number of levels in the index" ' of "right"' - ) + msg = r'len\(left_on\) must equal the number of levels in the index of "right"' with pytest.raises(ValueError, match=msg): merge(df, df2, left_on="a", right_index=True) @@ -240,9 +238,7 @@ def test_join_on_fails_with_different_left_index(self): df2 = DataFrame( {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)} ) - msg = ( - r"len\(right_on\) must equal the number of levels in the index" ' of "left"' - ) + msg = r'len\(right_on\) must equal the number of levels in the index of "left"' with pytest.raises(ValueError, match=msg): merge(df, df2, right_on="b", left_index=True) @@ -737,9 +733,7 @@ def test_join_multi_to_multi(self, join_type): ) tm.assert_frame_equal(expected, result) - msg = ( - r"len\(left_on\) must equal the number of levels in the index" ' of "right"' - ) + msg = r'len\(left_on\) must equal the number of levels in the index of "right"' with pytest.raises(ValueError, match=msg): left.join(right, on="xy", how=join_type) @@ -770,6 +764,35 @@ def test_join_on_tz_aware_datetimeindex(self): expected["vals_2"] = pd.Series([np.nan] * 2 + list("tuv"), dtype=object) tm.assert_frame_equal(result, expected) + def test_join_datetime_string(self): + # GH 5647 + dfa = DataFrame( + [ + ["2012-08-02", "L", 10], + ["2012-08-02", "J", 15], + ["2013-04-06", "L", 20], + ["2013-04-06", "J", 25], + ], + columns=["x", "y", "a"], + ) + dfa["x"] = pd.to_datetime(dfa["x"]) + dfb = DataFrame( + [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]], + columns=["x", "y", "z"], + index=[2, 4], + ) + dfb["x"] = pd.to_datetime(dfb["x"]) + result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"]) + expected = DataFrame( + [ + [pd.Timestamp("2012-08-02 00:00:00"), "J", 1, 15], + 
[pd.Timestamp("2013-04-06 00:00:00"), "L", 2, 20], + ], + index=[2, 4], + columns=["x", "y", "z", "a"], + ) + tm.assert_frame_equal(result, expected) + def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix="_y"): diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 5f4e8323c7127..30c440035d48e 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -25,10 +25,10 @@ TimedeltaIndex, UInt64Index, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import MergeError, merge -import pandas.util.testing as tm N = 50 NGROUPS = 8 @@ -201,8 +201,8 @@ def test_merge_misspecified(self): merge(self.left, self.right, right_index=True) msg = ( - 'Can only pass argument "on" OR "left_on" and "right_on", not' - " a combination of both" + 'Can only pass argument "on" OR "left_on" and "right_on", not ' + "a combination of both" ) with pytest.raises(pd.errors.MergeError, match=msg): merge(self.left, self.left, left_on="key", on="key") @@ -744,7 +744,7 @@ def test_overlapping_columns_error_message(self): # #2649, #10639 df2.columns = ["key1", "foo", "foo"] - msg = r"Data columns not unique: Index\(\['foo', 'foo'\]," r" dtype='object'\)" + msg = r"Data columns not unique: Index\(\['foo', 'foo'\], dtype='object'\)" with pytest.raises(MergeError, match=msg): merge(df, df2) @@ -1013,10 +1013,9 @@ def test_indicator(self): df_badcolumn = DataFrame({"col1": [1, 2], i: [2, 2]}) msg = ( - "Cannot use `indicator=True` option when data contains a" - " column named {}|" - "Cannot use name of an existing column for indicator" - " column" + "Cannot use `indicator=True` option when data contains a " + "column named {}|" + "Cannot use name of an existing column for indicator column" ).format(i) with pytest.raises(ValueError, match=msg): merge(df1, df_badcolumn, on="col1", how="outer", indicator=True) @@ -1235,8 +1234,8 @@ def test_validation(self): ) msg = ( - "Merge keys are not unique in either left or right dataset;" - " not a one-to-one merge" + "Merge keys are not unique in either left or right dataset; " + "not a one-to-one merge" ) with pytest.raises(MergeError, match=msg): merge(left, right, on="a", validate="1:1") diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index b2e764c5463fa..8037095aff0b9 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -6,8 +6,8 @@ import pandas as pd from pandas import Timedelta, merge_asof, read_csv, to_datetime +import pandas._testing as tm from pandas.core.reshape.merge import MergeError -import pandas.util.testing as tm class TestAsOfMerge: @@ -1185,6 +1185,13 @@ def test_merge_datatype_categorical_error_raises(self): with pytest.raises(MergeError, match=msg): merge_asof(left, right, on="a") + def test_merge_groupby_multiple_column_with_categorical_column(self): + # GH 16454 + df = pd.DataFrame({"x": [0], "y": [0], "z": pd.Categorical([0])}) + result = merge_asof(df, df, on="x", by=["y", "z"]) + expected = pd.DataFrame({"x": [0], "y": [0], "z": pd.Categorical([0])}) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "func", [lambda x: x, lambda x: to_datetime(x)], ids=["numeric", "datetime"] ) diff --git a/pandas/tests/reshape/merge/test_merge_index_as_string.py 
b/pandas/tests/reshape/merge/test_merge_index_as_string.py index 4e0f570567c07..691f2549c0ece 100644 --- a/pandas/tests/reshape/merge/test_merge_index_as_string.py +++ b/pandas/tests/reshape/merge/test_merge_index_as_string.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture diff --git a/pandas/tests/reshape/merge/test_merge_ordered.py b/pandas/tests/reshape/merge/test_merge_ordered.py index 6d6429fb4e6b5..e0063925a03e1 100644 --- a/pandas/tests/reshape/merge/test_merge_ordered.py +++ b/pandas/tests/reshape/merge/test_merge_ordered.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, merge_ordered -import pandas.util.testing as tm +import pandas._testing as tm class TestMergeOrdered: diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py index bce62571d55ec..1f78c1900d237 100644 --- a/pandas/tests/reshape/merge/test_multi.py +++ b/pandas/tests/reshape/merge/test_multi.py @@ -1,14 +1,12 @@ -from collections import OrderedDict - import numpy as np from numpy.random import randn import pytest import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.reshape.concat import concat from pandas.core.reshape.merge import merge -import pandas.util.testing as tm @pytest.fixture @@ -474,17 +472,13 @@ def test_merge_datetime_index(self, klass): if klass is not None: on_vector = klass(on_vector) - expected = DataFrame( - OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])]) - ) + expected = DataFrame({"a": [1, 2, 3], "key_1": [2016, 2017, 2018]}) result = df.merge(df, on=["a", on_vector], how="inner") tm.assert_frame_equal(result, expected) expected = DataFrame( - OrderedDict( - [("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])] - ) + {"key_0": [2016, 2017, 2018], "a_x": [1, 2, 3], "a_y": [1, 2, 3]} ) result = df.merge(df, on=[df.index.year], how="inner") @@ -788,17 +782,13 @@ def test_merge_datetime_index(self, box): if box is not None: on_vector = box(on_vector) - expected = DataFrame( - OrderedDict([("a", [1, 2, 3]), ("key_1", [2016, 2017, 2018])]) - ) + expected = DataFrame({"a": [1, 2, 3], "key_1": [2016, 2017, 2018]}) result = df.merge(df, on=["a", on_vector], how="inner") tm.assert_frame_equal(result, expected) expected = DataFrame( - OrderedDict( - [("key_0", [2016, 2017, 2018]), ("a_x", [1, 2, 3]), ("a_y", [1, 2, 3])] - ) + {"key_0": [2016, 2017, 2018], "a_x": [1, 2, 3], "a_y": [1, 2, 3]} ) result = df.merge(df, on=[df.index.year], how="inner") @@ -828,3 +818,22 @@ def test_single_common_level(self): ).set_index(["key", "X", "Y"]) tm.assert_frame_equal(result, expected) + + def test_join_multi_wrong_order(self): + # GH 25760 + # GH 28956 + + midx1 = pd.MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"]) + midx3 = pd.MultiIndex.from_tuples([(4, 1), (3, 2), (3, 1)], names=["b", "a"]) + + left = pd.DataFrame(index=midx1, data={"x": [10, 20, 30, 40]}) + right = pd.DataFrame(index=midx3, data={"y": ["foo", "bar", "fing"]}) + + result = left.join(right) + + expected = pd.DataFrame( + index=midx1, + data={"x": [10, 20, 30, 40], "y": ["fing", "foo", "bar", np.nan]}, + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 05193c00f0649..b3b2c5a05c6ad 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -27,9 +27,10 @@ isna, read_csv, 
) +import pandas._testing as tm +from pandas.core.arrays import SparseArray from pandas.core.construction import create_series_with_explicit_dtype from pandas.tests.extension.decimal import to_decimal -import pandas.util.testing as tm @pytest.fixture(params=[True, False]) @@ -197,8 +198,8 @@ def test_concatlike_same_dtypes(self): # cannot append non-index msg = ( - r"cannot concatenate object of type '.+';" - " only Series and DataFrame objs are valid" + r"cannot concatenate object of type '.+'; " + "only Series and DataFrame objs are valid" ) with pytest.raises(TypeError, match=msg): pd.Series(vals1).append(vals2) @@ -1865,8 +1866,8 @@ def test_concat_invalid(self): # trying to concat a ndframe with a non-ndframe df1 = tm.makeCustomDataframe(10, 2) msg = ( - "cannot concatenate object of type '{}';" - " only Series and DataFrame objs are valid" + "cannot concatenate object of type '{}'; " + "only Series and DataFrame objs are valid" ) for obj in [1, dict(), [1, 2], (1, 2)]: with pytest.raises(TypeError, match=msg.format(type(obj))): @@ -2730,3 +2731,22 @@ def test_concat_datetimeindex_freq(): expected = pd.DataFrame(data[50:] + data[:50], index=dr[50:].append(dr[:50])) expected.index._data.freq = None tm.assert_frame_equal(result, expected) + + +def test_concat_empty_df_object_dtype(): + # GH 9149 + df_1 = pd.DataFrame({"Row": [0, 1, 1], "EmptyCol": np.nan, "NumberCol": [1, 2, 3]}) + df_2 = pd.DataFrame(columns=df_1.columns) + result = pd.concat([df_1, df_2], axis=0) + expected = df_1.astype(object) + tm.assert_frame_equal(result, expected) + + +def test_concat_sparse(): + # GH 23557 + a = pd.Series(SparseArray([0, 1, 2])) + expected = pd.DataFrame(data=[[0, 0], [1, 1], [2, 2]]).astype( + pd.SparseDtype(np.int64, 0) + ) + result = pd.concat([a, a], axis=1) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 611c3272c123f..13b6f05ed304a 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -19,9 +19,9 @@ timedelta_range, to_datetime, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT import pandas.core.reshape.tile as tmod -import pandas.util.testing as tm def test_simple(): @@ -603,3 +603,12 @@ def test_cut_bool_coercion_to_int(bins, box, compare): expected = cut(data_expected, bins, duplicates="drop") result = cut(data_result, bins, duplicates="drop") compare(result, expected) + + +@pytest.mark.parametrize("labels", ["foo", 1, True]) +def test_cut_incorrect_labels(labels): + # GH 13318 + values = range(5) + msg = "Bin labels must either be False, None or passed in as a list-like argument" + with pytest.raises(ValueError, match=msg): + cut(values, 4, labels=labels) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 2c03c48209fea..814325844cb4c 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, lreshape, melt, wide_to_long -import pandas.util.testing as tm +import pandas._testing as tm class TestMelt: diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index bd1d3d2d5bb63..44073f56abfa1 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1,4 +1,3 @@ -from collections import OrderedDict from datetime import date, datetime, timedelta from itertools import product @@ -16,9 +15,9 @@ concat, date_range, ) +import pandas._testing as tm from 
pandas.api.types import CategoricalDtype as CDT from pandas.core.reshape.pivot import crosstab, pivot_table -import pandas.util.testing as tm @pytest.fixture(params=[True, False]) @@ -782,6 +781,15 @@ def test_pivot_with_list_like_values_nans(self, values, method): expected = DataFrame(data=data, index=index, columns=columns, dtype="object") tm.assert_frame_equal(result, expected) + def test_pivot_columns_none_raise_error(self): + # GH 30924 + df = pd.DataFrame( + {"col1": ["a", "b", "c"], "col2": [1, 2, 3], "col3": [1, 2, 3]} + ) + msg = r"pivot\(\) missing 1 required argument: 'columns'" + with pytest.raises(TypeError, match=msg): + df.pivot(index="col1", values="col3") + @pytest.mark.xfail( reason="MultiIndexed unstack with tuple names fails with KeyError GH#19966" ) @@ -897,12 +905,6 @@ def _check_output( totals = table.loc[("All", ""), value_col] assert totals == self.data[value_col].mean() - # no rows - rtable = self.data.pivot_table( - columns=["AA", "BB"], margins=True, aggfunc=np.mean - ) - assert isinstance(rtable, Series) - table = self.data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean") for item in ["DD", "EE", "FF"]: totals = table.loc[("All", ""), item] @@ -952,6 +954,20 @@ def test_margins_dtype_len(self): tm.assert_frame_equal(expected, result) + @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) + def test_pivot_table_multiindex_only(self, cols): + # GH 17038 + df2 = DataFrame({cols[0]: [1, 2, 3], cols[1]: [1, 2, 3], "v": [4, 5, 6]}) + + result = df2.pivot_table(values="v", columns=cols) + expected = DataFrame( + [[4, 5, 6]], + columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), + index=Index(["v"]), + ) + + tm.assert_frame_equal(result, expected) + def test_pivot_integer_columns(self): # caused by upstream bug in unstack @@ -1044,7 +1060,7 @@ def test_pivot_columns_lexsorted(self): assert pivoted.columns.is_monotonic def test_pivot_complex_aggfunc(self): - f = OrderedDict([("D", ["std"]), ("E", ["sum"])]) + f = {"D": ["std"], "E": ["sum"]} expected = self.data.groupby(["A", "B"]).agg(f).unstack("B") result = self.data.pivot_table(index="A", columns="B", aggfunc=f) @@ -1966,6 +1982,31 @@ def test_pivot_table_aggfunc_scalar_dropna(self, dropna): tm.assert_frame_equal(result, expected) + def test_pivot_table_empty_aggfunc(self): + # GH 9186 + df = pd.DataFrame( + { + "A": [2, 2, 3, 3, 2], + "id": [5, 6, 7, 8, 9], + "C": ["p", "q", "q", "p", "q"], + "D": [None, None, None, None, None], + } + ) + result = df.pivot_table(index="A", columns="D", values="id", aggfunc=np.size) + expected = pd.DataFrame() + tm.assert_frame_equal(result, expected) + + def test_pivot_table_no_column_raises(self): + # GH 10326 + def agg(l): + return np.mean(l) + + foo = pd.DataFrame( + {"X": [0, 0, 1, 1], "Y": [0, 1, 0, 1], "Z": [10, 20, 30, 40]} + ) + with pytest.raises(KeyError, match="notpresent"): + foo.pivot_table("notpresent", "X", "Y", aggfunc=agg) + class TestCrosstab: def setup_method(self, method): @@ -2525,6 +2566,19 @@ def test_crosstab_tuple_name(self, names): result = pd.crosstab(s1, s2) tm.assert_frame_equal(result, expected) + def test_crosstab_both_tuple_names(self): + # GH 18321 + s1 = pd.Series(range(3), name=("a", "b")) + s2 = pd.Series(range(3), name=("c", "d")) + + expected = pd.DataFrame( + np.eye(3, dtype="int64"), + index=pd.Index(range(3), name=("a", "b")), + columns=pd.Index(range(3), name=("c", "d")), + ) + result = crosstab(s1, s2) + tm.assert_frame_equal(result, expected) + def test_crosstab_unsorted_order(self): df 
= pd.DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"]) result = pd.crosstab(df.index, [df.b, df.a]) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index eca9b11bd4364..95406a5ebf4f7 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -18,9 +18,9 @@ qcut, timedelta_range, ) +import pandas._testing as tm from pandas.api.types import CategoricalDtype as CDT from pandas.core.algorithms import quantile -import pandas.util.testing as tm from pandas.tseries.offsets import Day, Nano @@ -130,6 +130,38 @@ def test_qcut_return_intervals(): tm.assert_series_equal(res, exp) +@pytest.mark.parametrize("labels", ["foo", 1, True]) +def test_qcut_incorrect_labels(labels): + # GH 13318 + values = range(5) + msg = "Bin labels must either be False, None or passed in as a list-like argument" + with pytest.raises(ValueError, match=msg): + qcut(values, 4, labels=labels) + + +@pytest.mark.parametrize("labels", [["a", "b", "c"], list(range(3))]) +def test_qcut_wrong_length_labels(labels): + # GH 13318 + values = range(10) + msg = "Bin labels must be one fewer than the number of bin edges" + with pytest.raises(ValueError, match=msg): + qcut(values, 4, labels=labels) + + +@pytest.mark.parametrize( + "labels, expected", + [ + (["a", "b", "c"], Categorical(["a", "b", "c"], ordered=True)), + (list(range(3)), Categorical([0, 1, 2], ordered=True)), + ], +) +def test_qcut_list_like_labels(labels, expected): + # GH 13318 + values = range(3) + result = qcut(values, 3, labels=labels) + tm.assert_categorical_equal(result, expected) + + @pytest.mark.parametrize( "kwargs,msg", [ diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index b695b05c7c7db..f25291f4aef12 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -7,8 +7,8 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, Series, get_dummies +import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray, SparseDtype -import pandas.util.testing as tm class TestGetDummies: @@ -45,7 +45,7 @@ def test_basic(self, sparse, dtype): dtype=self.effective_dtype(dtype), ) if sparse: - expected = expected.apply(pd.SparseArray, fill_value=0.0) + expected = expected.apply(SparseArray, fill_value=0.0) result = get_dummies(s_list, sparse=sparse, dtype=dtype) tm.assert_frame_equal(result, expected) @@ -132,7 +132,7 @@ def test_include_na(self, sparse, dtype): {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype) ) if sparse: - exp = exp.apply(pd.SparseArray, fill_value=0.0) + exp = exp.apply(SparseArray, fill_value=0.0) tm.assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 @@ -145,7 +145,7 @@ def test_include_na(self, sparse, dtype): # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns if sparse: - exp_na = exp_na.apply(pd.SparseArray, fill_value=0.0) + exp_na = exp_na.apply(SparseArray, fill_value=0.0) tm.assert_frame_equal(res_na, exp_na) res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype) @@ -167,7 +167,7 @@ def test_unicode(self, sparse): dtype=np.uint8, ) if sparse: - exp = exp.apply(pd.SparseArray, fill_value=0) + exp = exp.apply(SparseArray, fill_value=0) tm.assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self, df, sparse): @@ -180,10 +180,10 @@ def test_dataframe_dummies_all_obj(self, df, sparse): if sparse: expected = pd.DataFrame( { - "A_a": 
pd.SparseArray([1, 0, 1], dtype="uint8"), - "A_b": pd.SparseArray([0, 1, 0], dtype="uint8"), - "B_b": pd.SparseArray([1, 1, 0], dtype="uint8"), - "B_c": pd.SparseArray([0, 0, 1], dtype="uint8"), + "A_a": SparseArray([1, 0, 1], dtype="uint8"), + "A_b": SparseArray([0, 1, 0], dtype="uint8"), + "B_b": SparseArray([1, 1, 0], dtype="uint8"), + "B_c": SparseArray([0, 0, 1], dtype="uint8"), } ) @@ -226,7 +226,7 @@ def test_dataframe_dummies_prefix_list(self, df, sparse): cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] expected = expected[["C"] + cols] - typ = pd.SparseArray if sparse else pd.Series + typ = SparseArray if sparse else pd.Series expected[cols] = expected[cols].apply(lambda x: typ(x)) tm.assert_frame_equal(result, expected) @@ -423,7 +423,7 @@ def test_basic_drop_first(self, sparse): result = get_dummies(s_list, drop_first=True, sparse=sparse) if sparse: - expected = expected.apply(pd.SparseArray, fill_value=0) + expected = expected.apply(SparseArray, fill_value=0) tm.assert_frame_equal(result, expected) result = get_dummies(s_series, drop_first=True, sparse=sparse) @@ -457,7 +457,7 @@ def test_basic_drop_first_NA(self, sparse): res = get_dummies(s_NA, drop_first=True, sparse=sparse) exp = DataFrame({"b": [0, 1, 0]}, dtype=np.uint8) if sparse: - exp = exp.apply(pd.SparseArray, fill_value=0) + exp = exp.apply(SparseArray, fill_value=0) tm.assert_frame_equal(res, exp) @@ -466,7 +466,7 @@ def test_basic_drop_first_NA(self, sparse): ["b", np.nan], axis=1 ) if sparse: - exp_na = exp_na.apply(pd.SparseArray, fill_value=0) + exp_na = exp_na.apply(SparseArray, fill_value=0) tm.assert_frame_equal(res_na, exp_na) res_just_na = get_dummies( @@ -480,7 +480,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse): result = get_dummies(df, drop_first=True, sparse=sparse) expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=np.uint8) if sparse: - expected = expected.apply(pd.SparseArray, fill_value=0) + expected = expected.apply(SparseArray, fill_value=0) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): @@ -494,7 +494,7 @@ def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): expected = expected[["C", "A_b", "B_c", "cat_y"]] if sparse: for col in cols: - expected[col] = pd.SparseArray(expected[col]) + expected[col] = SparseArray(expected[col]) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_na(self, df, sparse): @@ -516,7 +516,7 @@ def test_dataframe_dummies_drop_first_with_na(self, df, sparse): expected = expected.sort_index(axis=1) if sparse: for col in cols: - expected[col] = pd.SparseArray(expected[col]) + expected[col] = SparseArray(expected[col]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py index 9b56c4df6d7de..a503173bd74b1 100644 --- a/pandas/tests/reshape/test_union_categoricals.py +++ b/pandas/tests/reshape/test_union_categoricals.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestUnionCategoricals: diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/reshape/test_util.py index 60c6d7ec3017b..cd518dda4edbf 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/reshape/test_util.py @@ -2,8 +2,8 @@ import pytest from pandas import Index, date_range +import pandas._testing as tm from 
pandas.core.reshape.util import cartesian_product -import pandas.util.testing as tm class TestCartesianProduct: diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 73371c48f9370..6af9c9884589c 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -16,7 +16,7 @@ import pandas as pd from pandas import NaT, Period, Timedelta, Timestamp, offsets -import pandas.util.testing as tm +import pandas._testing as tm class TestPeriodConstruction: diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 82d5b097733f1..dcb9d66708724 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.common import is_scalar import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def test_singleton(): @@ -16,8 +16,8 @@ def test_singleton(): def test_repr(): - assert repr(NA) == "NA" - assert str(NA) == "NA" + assert repr(NA) == "<NA>" + assert str(NA) == "<NA>" def test_truthiness(): @@ -58,12 +58,6 @@ def test_comparison_ops(): assert (NA >= other) is NA assert (NA < other) is NA assert (NA <= other) is NA - - if isinstance(other, (np.int64, np.bool_)): - # for numpy scalars we get a deprecation warning and False as result - # for equality or error for larger/lesser than - continue - assert (other == NA) is NA assert (other != NA) is NA assert (other > NA) is NA @@ -87,33 +81,52 @@ np.float_(-0), ], ) -def test_pow_special(value): +@pytest.mark.parametrize("asarray", [True, False]) +def test_pow_special(value, asarray): + if asarray: + value = np.array([value]) result = pd.NA ** value - assert isinstance(result, type(value)) + + if asarray: + result = result[0] + else: + # this assertion isn't possible for ndarray.
+ assert isinstance(result, type(value)) assert result == 1 @pytest.mark.parametrize( - "value", - [ - 1, - 1.0, - -1, - -1.0, - True, - np.bool_(True), - np.int_(1), - np.float_(1), - np.int_(-1), - np.float_(-1), - ], + "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float_(1)], ) -def test_rpow_special(value): +@pytest.mark.parametrize("asarray", [True, False]) +def test_rpow_special(value, asarray): + if asarray: + value = np.array([value]) result = value ** pd.NA - assert result == value - if not isinstance(value, (np.float_, np.bool_, np.int_)): + + if asarray: + result = result[0] + elif not isinstance(value, (np.float_, np.bool_, np.int_)): + # this assertion isn't possible with asarray=True assert isinstance(result, type(value)) + assert result == value + + +@pytest.mark.parametrize( + "value", [-1, -1.0, np.int_(-1), np.float_(-1)], +) +@pytest.mark.parametrize("asarray", [True, False]) +def test_rpow_minus_one(value, asarray): + if asarray: + value = np.array([value]) + result = value ** pd.NA + + if asarray: + result = result[0] + + assert pd.isna(result) + def test_unary_ops(): assert +NA is NA @@ -162,6 +175,19 @@ def test_logical_not(): assert ~NA is NA +@pytest.mark.parametrize( + "shape", [(3,), (3, 3), (1, 2, 3)], +) +def test_arithmetic_ndarray(shape, all_arithmetic_functions): + op = all_arithmetic_functions + a = np.zeros(shape) + if op.__name__ == "pow": + a += 5 + result = op(pd.NA, a) + expected = np.full(a.shape, pd.NA, dtype=object) + tm.assert_numpy_array_equal(result, expected) + + def test_is_scalar(): assert is_scalar(NA) is True @@ -177,6 +203,55 @@ def test_series_isna(): tm.assert_series_equal(s.isna(), expected) +def test_ufunc(): + assert np.log(pd.NA) is pd.NA + assert np.add(pd.NA, 1) is pd.NA + result = np.divmod(pd.NA, 1) + assert result[0] is pd.NA and result[1] is pd.NA + + result = np.frexp(pd.NA) + assert result[0] is pd.NA and result[1] is pd.NA + + +def test_ufunc_raises(): + with pytest.raises(ValueError, match="ufunc method 'at'"): + np.log.at(pd.NA, 0) + + +def test_binary_input_not_dunder(): + a = np.array([1, 2, 3]) + expected = np.array([pd.NA, pd.NA, pd.NA], dtype=object) + result = np.logaddexp(a, pd.NA) + tm.assert_numpy_array_equal(result, expected) + + result = np.logaddexp(pd.NA, a) + tm.assert_numpy_array_equal(result, expected) + + # all NA, multiple inputs + assert np.logaddexp(pd.NA, pd.NA) is pd.NA + + result = np.modf(pd.NA, pd.NA) + assert len(result) == 2 + assert all(x is pd.NA for x in result) + + +def test_divmod_ufunc(): + # binary in, binary out. 
+ a = np.array([1, 2, 3]) + expected = np.array([pd.NA, pd.NA, pd.NA], dtype=object) + + result = np.divmod(a, pd.NA) + assert isinstance(result, tuple) + for arr in result: + tm.assert_numpy_array_equal(arr, expected) + tm.assert_numpy_array_equal(arr, expected) + + result = np.divmod(pd.NA, a) + for arr in result: + tm.assert_numpy_array_equal(arr, expected) + tm.assert_numpy_array_equal(arr, expected) + + def test_integer_hash_collision_dict(): # GH 30013 result = {NA: "foo", hash(NA): "bar"} diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index e709db980b721..a537f000959e3 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -21,9 +21,9 @@ Timestamp, isna, ) +import pandas._testing as tm from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray from pandas.core.ops import roperator -import pandas.util.testing as tm @pytest.mark.parametrize( @@ -123,6 +123,13 @@ def test_round_nat(klass, method, freq): "dst", "fromordinal", "fromtimestamp", + pytest.param( + "fromisocalendar", + marks=pytest.mark.skipif( + not compat.PY38, + reason="'fromisocalendar' was added in stdlib datetime in python 3.8", + ), + ), "isocalendar", "strftime", "strptime", @@ -297,6 +304,8 @@ def test_overlap_public_nat_methods(klass, expected): # "fromisoformat" was introduced in 3.7 if klass is Timestamp and not compat.PY37: expected.remove("fromisoformat") + + # "fromisocalendar" was introduced in 3.8 if klass is Timestamp and not compat.PY38: expected.remove("fromisocalendar") diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index fed613b910c55..ce08a47f824ee 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -9,8 +9,8 @@ import pandas as pd from pandas import NaT, Timedelta, Timestamp, offsets +import pandas._testing as tm from pandas.core import ops -import pandas.util.testing as tm class TestTimedeltaAdditionSubtraction: @@ -271,8 +271,8 @@ def test_ops_ndarray(self): tm.assert_numpy_array_equal(td * np.array([2]), expected) tm.assert_numpy_array_equal(np.array([2]) * td, expected) msg = ( - "ufunc '?multiply'? cannot use operands with types" - r" dtype\(' 5] @@ -393,8 +393,8 @@ def test_2d_to_1d_assignment_raises(): y = pd.Series(range(2)) msg = ( - r"shape mismatch: value array of shape \(2,2\) could not be" - r" broadcast to indexing result of shape \(2,\)" + r"shape mismatch: value array of shape \(2,2\) could not be " + r"broadcast to indexing result of shape \(2,\)" ) with pytest.raises(ValueError, match=msg): y.loc[range(2)] = x @@ -894,7 +894,7 @@ def test_take(): expected = Series([4, 2, 4], index=[4, 3, 4]) tm.assert_series_equal(actual, expected) - msg = "index {} is out of bounds for size 5" + msg = "index {} is out of bounds for( axis 0 with)? 
size 5" with pytest.raises(IndexError, match=msg.format(10)): s.take([1, 10]) with pytest.raises(IndexError, match=msg.format(5)): diff --git a/pandas/tests/series/indexing/test_loc.py b/pandas/tests/series/indexing/test_loc.py index e6b5b5df2b000..7d6b6c78cc492 100644 --- a/pandas/tests/series/indexing/test_loc.py +++ b/pandas/tests/series/indexing/test_loc.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize("val,expected", [(2 ** 63 - 1, 3), (2 ** 63, 4)]) diff --git a/pandas/tests/series/indexing/test_numeric.py b/pandas/tests/series/indexing/test_numeric.py index a641b47f2e690..3684ca00c2f17 100644 --- a/pandas/tests/series/indexing/test_numeric.py +++ b/pandas/tests/series/indexing/test_numeric.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import DataFrame, Index, Series -import pandas.util.testing as tm +import pandas._testing as tm def test_get(): @@ -261,8 +261,8 @@ def test_setitem_float_labels(): def test_slice_float_get_set(datetime_series): msg = ( r"cannot do slice indexing on with these indexers \[{key}\]" - r" of " + r"\.datetimes\.DatetimeIndex'> with these indexers \[{key}\] " + r"of " ) with pytest.raises(TypeError, match=msg.format(key=r"4\.0")): datetime_series[4.0:10.0] diff --git a/pandas/tests/series/methods/test_append.py b/pandas/tests/series/methods/test_append.py new file mode 100644 index 0000000000000..dc0fca4bba067 --- /dev/null +++ b/pandas/tests/series/methods/test_append.py @@ -0,0 +1,158 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, DatetimeIndex, Series, date_range +import pandas._testing as tm + + +class TestSeriesAppend: + def test_append(self, datetime_series, string_series, object_series): + appended_series = string_series.append(object_series) + for idx, value in appended_series.items(): + if idx in string_series.index: + assert value == string_series[idx] + elif idx in object_series.index: + assert value == object_series[idx] + else: + raise AssertionError("orphaned index!") + + msg = "Indexes have overlapping values:" + with pytest.raises(ValueError, match=msg): + datetime_series.append(datetime_series, verify_integrity=True) + + def test_append_many(self, datetime_series): + pieces = [datetime_series[:5], datetime_series[5:10], datetime_series[10:]] + + result = pieces[0].append(pieces[1:]) + tm.assert_series_equal(result, datetime_series) + + def test_append_duplicates(self): + # GH 13677 + s1 = pd.Series([1, 2, 3]) + s2 = pd.Series([4, 5, 6]) + exp = pd.Series([1, 2, 3, 4, 5, 6], index=[0, 1, 2, 0, 1, 2]) + tm.assert_series_equal(s1.append(s2), exp) + tm.assert_series_equal(pd.concat([s1, s2]), exp) + + # the result must have RangeIndex + exp = pd.Series([1, 2, 3, 4, 5, 6]) + tm.assert_series_equal( + s1.append(s2, ignore_index=True), exp, check_index_type=True + ) + tm.assert_series_equal( + pd.concat([s1, s2], ignore_index=True), exp, check_index_type=True + ) + + msg = "Indexes have overlapping values:" + with pytest.raises(ValueError, match=msg): + s1.append(s2, verify_integrity=True) + with pytest.raises(ValueError, match=msg): + pd.concat([s1, s2], verify_integrity=True) + + def test_append_tuples(self): + # GH 28410 + s = pd.Series([1, 2, 3]) + list_input = [s, s] + tuple_input = (s, s) + + expected = s.append(list_input) + result = s.append(tuple_input) + + tm.assert_series_equal(expected, result) + + +class TestSeriesAppendWithDatetimeIndex: + def test_append(self): + rng = 
date_range("5/8/2012 1:45", periods=10, freq="5T") + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + + result = ts.append(ts) + result_df = df.append(df) + ex_index = DatetimeIndex(np.tile(rng.values, 2)) + tm.assert_index_equal(result.index, ex_index) + tm.assert_index_equal(result_df.index, ex_index) + + appended = rng.append(rng) + tm.assert_index_equal(appended, ex_index) + + appended = rng.append([rng, rng]) + ex_index = DatetimeIndex(np.tile(rng.values, 3)) + tm.assert_index_equal(appended, ex_index) + + # different index names + rng1 = rng.copy() + rng2 = rng.copy() + rng1.name = "foo" + rng2.name = "bar" + assert rng1.append(rng1).name == "foo" + assert rng1.append(rng2).name is None + + def test_append_tz(self): + # see gh-2938 + rng = date_range("5/8/2012 1:45", periods=10, freq="5T", tz="US/Eastern") + rng2 = date_range("5/8/2012 2:35", periods=10, freq="5T", tz="US/Eastern") + rng3 = date_range("5/8/2012 1:45", periods=20, freq="5T", tz="US/Eastern") + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) + + appended = rng.append(rng2) + tm.assert_index_equal(appended, rng3) + + def test_append_tz_explicit_pytz(self): + # see gh-2938 + from pytz import timezone as timezone + + rng = date_range( + "5/8/2012 1:45", periods=10, freq="5T", tz=timezone("US/Eastern") + ) + rng2 = date_range( + "5/8/2012 2:35", periods=10, freq="5T", tz=timezone("US/Eastern") + ) + rng3 = date_range( + "5/8/2012 1:45", periods=20, freq="5T", tz=timezone("US/Eastern") + ) + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) + + appended = rng.append(rng2) + tm.assert_index_equal(appended, rng3) + + def test_append_tz_dateutil(self): + # see gh-2938 + rng = date_range( + "5/8/2012 1:45", periods=10, freq="5T", tz="dateutil/US/Eastern" + ) + rng2 = date_range( + "5/8/2012 2:35", periods=10, freq="5T", tz="dateutil/US/Eastern" + ) + rng3 = date_range( + "5/8/2012 1:45", periods=20, freq="5T", tz="dateutil/US/Eastern" + ) + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + ts2 = Series(np.random.randn(len(rng2)), rng2) + df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) + + result = ts.append(ts2) + result_df = df.append(df2) + tm.assert_index_equal(result.index, rng3) + tm.assert_index_equal(result_df.index, rng3) + + appended = rng.append(rng2) + tm.assert_index_equal(appended, rng3) diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py new file mode 100644 index 0000000000000..62273e2d363fb --- /dev/null +++ b/pandas/tests/series/methods/test_argsort.py @@ -0,0 +1,63 @@ +import numpy as np +import pytest + +from pandas import Series, Timestamp, isna +import pandas._testing as tm + + +class TestSeriesArgsort: + def _check_accum_op(self, name, ser, check_dtype=True): + func = getattr(np, name) + tm.assert_numpy_array_equal( + 
func(ser).values, func(np.array(ser)), check_dtype=check_dtype, + ) + + # with missing values + ts = ser.copy() + ts[::2] = np.NaN + + result = func(ts)[1::2] + expected = func(np.array(ts.dropna())) + + tm.assert_numpy_array_equal(result.values, expected, check_dtype=False) + + def test_argsort(self, datetime_series): + self._check_accum_op("argsort", datetime_series, check_dtype=False) + argsorted = datetime_series.argsort() + assert issubclass(argsorted.dtype.type, np.integer) + + # GH#2967 (introduced bug in 0.11-dev I think) + s = Series([Timestamp("201301{i:02d}".format(i=i)) for i in range(1, 6)]) + assert s.dtype == "datetime64[ns]" + shifted = s.shift(-1) + assert shifted.dtype == "datetime64[ns]" + assert isna(shifted[4]) + + result = s.argsort() + expected = Series(range(5), dtype="int64") + tm.assert_series_equal(result, expected) + + result = shifted.argsort() + expected = Series(list(range(4)) + [-1], dtype="int64") + tm.assert_series_equal(result, expected) + + def test_argsort_stable(self): + s = Series(np.random.randint(0, 100, size=10000)) + mindexer = s.argsort(kind="mergesort") + qindexer = s.argsort() + + mexpected = np.argsort(s.values, kind="mergesort") + qexpected = np.argsort(s.values, kind="quicksort") + + tm.assert_series_equal(mindexer, Series(mexpected), check_dtype=False) + tm.assert_series_equal(qindexer, Series(qexpected), check_dtype=False) + msg = ( + r"ndarray Expected type , " + r"found instead" + ) + with pytest.raises(AssertionError, match=msg): + tm.assert_numpy_array_equal(qindexer, mindexer) + + def test_argsort_preserve_name(self, datetime_series): + result = datetime_series.argsort() + assert result.name == datetime_series.name diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py index 8bc9e9c38d83a..b121efd202744 100644 --- a/pandas/tests/series/methods/test_asof.py +++ b/pandas/tests/series/methods/test_asof.py @@ -2,7 +2,7 @@ import pytest from pandas import Series, Timestamp, date_range, isna, notna, offsets -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesAsof: diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index c2bec2744583a..37764d3b82c2d 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Series, Timestamp, isna, notna -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesClip: diff --git a/pandas/tests/series/methods/test_count.py b/pandas/tests/series/methods/test_count.py index 9cf776c0d9f1a..1ca48eeb7c441 100644 --- a/pandas/tests/series/methods/test_count.py +++ b/pandas/tests/series/methods/test_count.py @@ -2,7 +2,7 @@ import pandas as pd from pandas import Categorical, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesCount: diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index f7dae87018419..1f6033d435323 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -5,7 +5,7 @@ import pandas as pd from pandas import Series, isna -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesCov: diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index ed412e7da3d43..b147a04b11090 100644 --- a/pandas/tests/series/methods/test_describe.py +++ 
b/pandas/tests/series/methods/test_describe.py @@ -1,7 +1,7 @@ import numpy as np from pandas import Series, Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesDescribe: diff --git a/pandas/tests/series/methods/test_diff.py b/pandas/tests/series/methods/test_diff.py index 9cb4ec827a271..033f75e95f11b 100644 --- a/pandas/tests/series/methods/test_diff.py +++ b/pandas/tests/series/methods/test_diff.py @@ -2,7 +2,7 @@ import pytest from pandas import Series, TimedeltaIndex, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesDiff: diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py new file mode 100644 index 0000000000000..2d052505d5ecc --- /dev/null +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -0,0 +1,141 @@ +import numpy as np +import pytest + +from pandas import Categorical, Series +import pandas._testing as tm + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, False, False, True, True, False])), + ("last", Series([False, True, True, False, False, False, False])), + (False, Series([False, True, True, False, True, True, False])), + ], +) +def test_drop_duplicates(any_numpy_dtype, keep, expected): + tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype)) + + if tc.dtype == "bool": + pytest.skip("tested separately in test_drop_duplicates_bool") + + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=keep, inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, True])), + ("last", Series([True, True, False, False])), + (False, Series([True, True, True, True])), + ], +) +def test_drop_duplicates_bool(keep, expected): + tc = Series([True, False, True, False]) + + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=keep, inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + +class TestSeriesDropDuplicates: + @pytest.mark.parametrize( + "dtype", + ["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"], + ) + def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): + cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) + + # Test case 1 + input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) + tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered_fixture)) + if dtype == "datetime64[D]": + # pre-empty flaky xfail, tc1 values are seemingly-random + if not (np.array(tc1) == input1).all(): + pytest.xfail(reason="GH#7996") + + expected = Series([False, False, False, True]) + tm.assert_series_equal(tc1.duplicated(), expected) + tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected]) + sc = tc1.copy() + sc.drop_duplicates(inplace=True) + tm.assert_series_equal(sc, tc1[~expected]) + + expected = Series([False, False, True, False]) + tm.assert_series_equal(tc1.duplicated(keep="last"), expected) + tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected]) + sc = tc1.copy() + sc.drop_duplicates(keep="last", inplace=True) + tm.assert_series_equal(sc, tc1[~expected]) + + expected = Series([False, False, True, True]) + tm.assert_series_equal(tc1.duplicated(keep=False), 
expected) + tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected]) + sc = tc1.copy() + sc.drop_duplicates(keep=False, inplace=True) + tm.assert_series_equal(sc, tc1[~expected]) + + # Test case 2 + input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) + tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered_fixture)) + if dtype == "datetime64[D]": + # pre-empty flaky xfail, tc2 values are seemingly-random + if not (np.array(tc2) == input2).all(): + pytest.xfail(reason="GH#7996") + + expected = Series([False, False, False, False, True, True, False]) + tm.assert_series_equal(tc2.duplicated(), expected) + tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) + sc = tc2.copy() + sc.drop_duplicates(inplace=True) + tm.assert_series_equal(sc, tc2[~expected]) + + expected = Series([False, True, True, False, False, False, False]) + tm.assert_series_equal(tc2.duplicated(keep="last"), expected) + tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected]) + sc = tc2.copy() + sc.drop_duplicates(keep="last", inplace=True) + tm.assert_series_equal(sc, tc2[~expected]) + + expected = Series([False, True, True, False, True, True, False]) + tm.assert_series_equal(tc2.duplicated(keep=False), expected) + tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected]) + sc = tc2.copy() + sc.drop_duplicates(keep=False, inplace=True) + tm.assert_series_equal(sc, tc2[~expected]) + + def test_drop_duplicates_categorical_bool(self, ordered_fixture): + tc = Series( + Categorical( + [True, False, True, False], + categories=[True, False], + ordered=ordered_fixture, + ) + ) + + expected = Series([False, False, True, True]) + tm.assert_series_equal(tc.duplicated(), expected) + tm.assert_series_equal(tc.drop_duplicates(), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + expected = Series([True, True, False, False]) + tm.assert_series_equal(tc.duplicated(keep="last"), expected) + tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep="last", inplace=True) + tm.assert_series_equal(sc, tc[~expected]) + + expected = Series([True, True, True, True]) + tm.assert_series_equal(tc.duplicated(keep=False), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=False, inplace=True) + tm.assert_series_equal(sc, tc[~expected]) diff --git a/pandas/tests/series/methods/test_duplicated.py b/pandas/tests/series/methods/test_duplicated.py new file mode 100644 index 0000000000000..5cc297913e851 --- /dev/null +++ b/pandas/tests/series/methods/test_duplicated.py @@ -0,0 +1,35 @@ +import numpy as np +import pytest + +from pandas import Series +import pandas._testing as tm + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True], name="name")), + ("last", Series([True, True, False, False, False], name="name")), + (False, Series([True, True, True, False, True], name="name")), + ], +) +def test_duplicated_keep(keep, expected): + ser = Series(["a", "b", "b", "c", "a"], name="name") + + result = ser.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "keep, expected", + [ + ("first", Series([False, False, True, False, True])), + ("last", Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])), + ], +) +def test_duplicated_nan_none(keep, expected): + ser 
= Series([np.nan, 3, 3, None, np.nan], dtype=object) + + result = ser.duplicated(keep=keep) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index e79d3c0556cf1..979199e1efc62 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def test_basic(): diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 69b2f896aec52..3836c1d56bf87 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -3,7 +3,7 @@ import pandas as pd from pandas import Series, date_range -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesIsIn: @@ -29,8 +29,8 @@ def test_isin_with_string_scalar(self): # GH#4763 s = Series(["A", "B", "C", "a", "B", "B", "A", "C"]) msg = ( - r"only list-like objects are allowed to be passed to isin\(\)," - r" you passed a \[str\]" + r"only list-like objects are allowed to be passed to isin\(\), " + r"you passed a \[str\]" ) with pytest.raises(TypeError, match=msg): s.isin("a") diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index 423b4ad78a78a..a029965c7394f 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import Series -import pandas.util.testing as tm +import pandas._testing as tm main_dtypes = [ "datetime", diff --git a/pandas/tests/series/methods/test_pct_change.py b/pandas/tests/series/methods/test_pct_change.py new file mode 100644 index 0000000000000..1efb57894f986 --- /dev/null +++ b/pandas/tests/series/methods/test_pct_change.py @@ -0,0 +1,79 @@ +import numpy as np +import pytest + +from pandas import Series, date_range +import pandas._testing as tm + + +class TestSeriesPctChange: + def test_pct_change(self, datetime_series): + rs = datetime_series.pct_change(fill_method=None) + tm.assert_series_equal(rs, datetime_series / datetime_series.shift(1) - 1) + + rs = datetime_series.pct_change(2) + filled = datetime_series.fillna(method="pad") + tm.assert_series_equal(rs, filled / filled.shift(2) - 1) + + rs = datetime_series.pct_change(fill_method="bfill", limit=1) + filled = datetime_series.fillna(method="bfill", limit=1) + tm.assert_series_equal(rs, filled / filled.shift(1) - 1) + + rs = datetime_series.pct_change(freq="5D") + filled = datetime_series.fillna(method="pad") + tm.assert_series_equal( + rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) + ) + + def test_pct_change_with_duplicate_axis(self): + # GH#28664 + common_idx = date_range("2019-11-14", periods=5, freq="D") + result = Series(range(5), common_idx).pct_change(freq="B") + + # the reason that the expected should be like this is documented at PR 28681 + expected = Series([np.NaN, np.inf, np.NaN, np.NaN, 3.0], common_idx) + + tm.assert_series_equal(result, expected) + + def test_pct_change_shift_over_nas(self): + s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) + + chg = s.pct_change() + expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) + tm.assert_series_equal(chg, expected) + + @pytest.mark.parametrize( + "freq, periods, fill_method, limit", + [ + ("5B", 5, None, None), + ("3B", 3, None, None), + ("3B", 3, "bfill", None), + ("7B", 7, "pad", 1), + ("7B", 7, "bfill", 
3), + ("14B", 14, None, None), + ], + ) + def test_pct_change_periods_freq( + self, freq, periods, fill_method, limit, datetime_series + ): + # GH#7292 + rs_freq = datetime_series.pct_change( + freq=freq, fill_method=fill_method, limit=limit + ) + rs_periods = datetime_series.pct_change( + periods, fill_method=fill_method, limit=limit + ) + tm.assert_series_equal(rs_freq, rs_periods) + + empty_ts = Series(index=datetime_series.index, dtype=object) + rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) + rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) + tm.assert_series_equal(rs_freq, rs_periods) + + +@pytest.mark.parametrize("fill_method", ["pad", "ffill", None]) +def test_pct_change_with_duplicated_indices(fill_method): + # GH30463 + s = Series([np.nan, 1, 2, 3, 9, 18], index=["a", "b"] * 3) + result = s.pct_change(fill_method=fill_method) + expected = Series([np.nan, np.nan, 1.0, 0.5, 2.0, 1.0], index=["a", "b"] * 3) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_quantile.py b/pandas/tests/series/methods/test_quantile.py index 4eb275d63e878..79f50afca658f 100644 --- a/pandas/tests/series/methods/test_quantile.py +++ b/pandas/tests/series/methods/test_quantile.py @@ -5,8 +5,8 @@ import pandas as pd from pandas import Index, Series +import pandas._testing as tm from pandas.core.indexes.datetimes import Timestamp -import pandas.util.testing as tm class TestSeriesQuantile: diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 793e8b7da4965..3d4688c8274f9 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ -8,8 +8,8 @@ import pandas.util._test_decorators as td from pandas import NaT, Series, Timestamp, date_range +import pandas._testing as tm from pandas.api.types import CategoricalDtype -import pandas.util.testing as tm class TestSeriesRank: @@ -203,8 +203,7 @@ def test_rank_signature(self): s = Series([0, 1]) s.rank(method="average") msg = ( - "No axis named average for object type" - " " + "No axis named average for object type " ) with pytest.raises(ValueError, match=msg): s.rank("average") diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 4125b5816422a..770ad38b0215e 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesReplace: @@ -120,8 +120,8 @@ def test_replace_with_single_list(self): # make sure things don't get corrupted when fillna call fails s = ser.copy() msg = ( - r"Invalid fill method\. Expecting pad \(ffill\) or backfill" - r" \(bfill\)\. Got crash_cymbal" + r"Invalid fill method\. Expecting pad \(ffill\) or backfill " + r"\(bfill\)\. 
Got crash_cymbal" ) with pytest.raises(ValueError, match=msg): s.replace([1, 2, 3], inplace=True, method="crash_cymbal") diff --git a/pandas/tests/series/methods/test_round.py b/pandas/tests/series/methods/test_round.py index 1776468ef5a83..7f0711a0f30d7 100644 --- a/pandas/tests/series/methods/test_round.py +++ b/pandas/tests/series/methods/test_round.py @@ -2,7 +2,7 @@ import pytest from pandas import Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesRound: diff --git a/pandas/tests/series/methods/test_searchsorted.py b/pandas/tests/series/methods/test_searchsorted.py index 0d6e9635579f0..fd6c6f74a9136 100644 --- a/pandas/tests/series/methods/test_searchsorted.py +++ b/pandas/tests/series/methods/test_searchsorted.py @@ -1,8 +1,8 @@ import numpy as np from pandas import Series, Timestamp, date_range +import pandas._testing as tm from pandas.api.types import is_scalar -import pandas.util.testing as tm class TestSeriesSearchSorted: diff --git a/pandas/tests/series/methods/test_shift.py b/pandas/tests/series/methods/test_shift.py index 2cf847c928862..8256e2f33b936 100644 --- a/pandas/tests/series/methods/test_shift.py +++ b/pandas/tests/series/methods/test_shift.py @@ -13,7 +13,7 @@ date_range, offsets, ) -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BDay diff --git a/pandas/tests/series/methods/test_sort_index.py b/pandas/tests/series/methods/test_sort_index.py index ab15b8c814029..6fa4eeaee34c0 100644 --- a/pandas/tests/series/methods/test_sort_index.py +++ b/pandas/tests/series/methods/test_sort_index.py @@ -4,7 +4,7 @@ import pytest from pandas import IntervalIndex, MultiIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesSortIndex: @@ -135,3 +135,34 @@ def test_sort_index_intervals(self): [3, 2, 1, np.nan], IntervalIndex.from_arrays([3, 2, 1, 0], [4, 3, 2, 1]) ) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_list, sorted_list, ascending, ignore_index, output_index", + [ + ([2, 3, 6, 1], [2, 3, 6, 1], True, True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [2, 3, 6, 1], True, False, [0, 1, 2, 3]), + ([2, 3, 6, 1], [1, 6, 3, 2], False, True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [1, 6, 3, 2], False, False, [3, 2, 1, 0]), + ], + ) + def test_sort_index_ignore_index( + self, inplace, original_list, sorted_list, ascending, ignore_index, output_index + ): + # GH 30114 + ser = Series(original_list) + expected = Series(sorted_list, index=output_index) + kwargs = { + "ascending": ascending, + "ignore_index": ignore_index, + "inplace": inplace, + } + + if inplace: + result_ser = ser.copy() + result_ser.sort_index(**kwargs) + else: + result_ser = ser.sort_index(**kwargs) + + tm.assert_series_equal(result_ser, expected) + tm.assert_series_equal(ser, Series(original_list)) diff --git a/pandas/tests/series/methods/test_sort_values.py b/pandas/tests/series/methods/test_sort_values.py index ec3b8385e79e7..caa2abd61af6a 100644 --- a/pandas/tests/series/methods/test_sort_values.py +++ b/pandas/tests/series/methods/test_sort_values.py @@ -2,7 +2,7 @@ import pytest from pandas import Categorical, DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesSortValues: @@ -77,8 +77,8 @@ def test_sort_values(self, datetime_series): s = df.iloc[:, 0] msg = ( - "This Series is a view of some other array, to sort in-place" - " you must create a copy" + "This Series is a view 
of some other array, to sort in-place " + "you must create a copy" ) with pytest.raises(ValueError, match=msg): s.sort_values(inplace=True) @@ -156,3 +156,28 @@ def test_sort_values_categorical(self): result = df.sort_values(by=["grade", "id"]) expected = df.iloc[[2, 1, 5, 4, 3, 0]] tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("inplace", [True, False]) + @pytest.mark.parametrize( + "original_list, sorted_list, ignore_index, output_index", + [ + ([2, 3, 6, 1], [6, 3, 2, 1], True, [0, 1, 2, 3]), + ([2, 3, 6, 1], [6, 3, 2, 1], False, [2, 1, 0, 3]), + ], + ) + def test_sort_values_ignore_index( + self, inplace, original_list, sorted_list, ignore_index, output_index + ): + # GH 30114 + ser = Series(original_list) + expected = Series(sorted_list, index=output_index) + kwargs = {"ignore_index": ignore_index, "inplace": inplace} + + if inplace: + result_ser = ser.copy() + result_ser.sort_values(ascending=False, **kwargs) + else: + result_ser = ser.sort_values(ascending=False, **kwargs) + + tm.assert_series_equal(result_ser, expected) + tm.assert_series_equal(ser, Series(original_list)) diff --git a/pandas/tests/series/methods/test_to_dict.py b/pandas/tests/series/methods/test_to_dict.py index 0f1359f99e594..2fbf3e8d39cf3 100644 --- a/pandas/tests/series/methods/test_to_dict.py +++ b/pandas/tests/series/methods/test_to_dict.py @@ -3,7 +3,7 @@ import pytest from pandas import Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesToDict: diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py index b2bf5e854fbcc..d4e2890ed8bf0 100644 --- a/pandas/tests/series/methods/test_truncate.py +++ b/pandas/tests/series/methods/test_truncate.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BDay diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index 15d895f44c7b2..fdb35befeb0c2 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -2,7 +2,7 @@ import pandas as pd from pandas import Categorical, CategoricalIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesValueCounts: diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 9e1bae8469138..71f6681e8c955 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -4,23 +4,23 @@ import pytest from pandas import DataFrame, Index, MultiIndex, RangeIndex, Series -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesAlterAxes: def test_setindex(self, string_series): # wrong type msg = ( - r"Index\(\.\.\.\) must be called with a collection of some" - r" kind, None was passed" + r"Index\(\.\.\.\) must be called with a collection of some " + r"kind, None was passed" ) with pytest.raises(TypeError, match=msg): string_series.index = None # wrong length msg = ( - "Length mismatch: Expected axis has 30 elements, new" - " values have 29 elements" + "Length mismatch: Expected axis has 30 elements, " + "new values have 29 elements" ) with pytest.raises(ValueError, match=msg): string_series.index = np.arange(len(string_series) - 1) @@ -83,8 +83,9 @@ def test_rename_axis_supported(self): s = Series(range(5)) s.rename({}, axis=0) s.rename({}, axis="index") - with pytest.raises(ValueError, match="No 
axis named 5"): - s.rename({}, axis=5) + # TODO: clean up shared index validation + # with pytest.raises(ValueError, match="No axis named 5"): + # s.rename({}, axis=5) def test_set_name_attribute(self): s = Series([1, 2, 3]) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 17cf307a04d7f..c29bd3ea0cb7d 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -6,89 +6,11 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import Categorical, DataFrame, MultiIndex, Series, Timestamp, isna -import pandas.util.testing as tm +from pandas import DataFrame, MultiIndex, Series +import pandas._testing as tm class TestSeriesAnalytics: - def test_argsort(self, datetime_series): - self._check_accum_op("argsort", datetime_series, check_dtype=False) - argsorted = datetime_series.argsort() - assert issubclass(argsorted.dtype.type, np.integer) - - # GH 2967 (introduced bug in 0.11-dev I think) - s = Series([Timestamp("201301{i:02d}".format(i=i)) for i in range(1, 6)]) - assert s.dtype == "datetime64[ns]" - shifted = s.shift(-1) - assert shifted.dtype == "datetime64[ns]" - assert isna(shifted[4]) - - result = s.argsort() - expected = Series(range(5), dtype="int64") - tm.assert_series_equal(result, expected) - - result = shifted.argsort() - expected = Series(list(range(4)) + [-1], dtype="int64") - tm.assert_series_equal(result, expected) - - def test_argsort_stable(self): - s = Series(np.random.randint(0, 100, size=10000)) - mindexer = s.argsort(kind="mergesort") - qindexer = s.argsort() - - mexpected = np.argsort(s.values, kind="mergesort") - qexpected = np.argsort(s.values, kind="quicksort") - - tm.assert_series_equal(mindexer, Series(mexpected), check_dtype=False) - tm.assert_series_equal(qindexer, Series(qexpected), check_dtype=False) - msg = ( - r"ndarray Expected type ," - r" found instead" - ) - with pytest.raises(AssertionError, match=msg): - tm.assert_numpy_array_equal(qindexer, mindexer) - - def _check_accum_op(self, name, datetime_series_, check_dtype=True): - func = getattr(np, name) - tm.assert_numpy_array_equal( - func(datetime_series_).values, - func(np.array(datetime_series_)), - check_dtype=check_dtype, - ) - - # with missing values - ts = datetime_series_.copy() - ts[::2] = np.NaN - - result = func(ts)[1::2] - expected = func(np.array(ts.dropna())) - - tm.assert_numpy_array_equal(result.values, expected, check_dtype=False) - - def test_compress(self): - cond = [True, False, True, False, False] - s = Series([1, -1, 5, 8, 7], index=list("abcde"), name="foo") - expected = Series(s.values.compress(cond), index=list("ac"), name="foo") - with tm.assert_produces_warning(FutureWarning): - result = s.compress(cond) - tm.assert_series_equal(result, expected) - - def test_numpy_compress(self): - cond = [True, False, True, False, False] - s = Series([1, -1, 5, 8, 7], index=list("abcde"), name="foo") - expected = Series(s.values.compress(cond), index=list("ac"), name="foo") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - tm.assert_series_equal(np.compress(cond, s), expected) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - msg = "the 'axis' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.compress(cond, s, axis=1) - - msg = "the 'out' parameter is not supported" - with pytest.raises(ValueError, match=msg): - np.compress(cond, s, out=s) - def test_prod_numpy16_bug(self): s = Series([1.0, 1.0, 1.0], 
index=range(3)) result = s.prod() @@ -198,41 +120,7 @@ def test_ptp(self): N = 1000 arr = np.random.randn(N) ser = Series(arr) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - assert np.ptp(ser) == np.ptp(arr) - - # GH11163 - s = Series([3, 5, np.nan, -3, 10]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - assert s.ptp() == 13 - assert pd.isna(s.ptp(skipna=False)) - - mi = pd.MultiIndex.from_product([["a", "b"], [1, 2, 3]]) - s = pd.Series([1, np.nan, 7, 3, 5, np.nan], index=mi) - - expected = pd.Series([6, 2], index=["a", "b"], dtype=np.float64) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - tm.assert_series_equal(s.ptp(level=0), expected) - - expected = pd.Series([np.nan, np.nan], index=["a", "b"]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - tm.assert_series_equal(s.ptp(level=0, skipna=False), expected) - - msg = "No axis named 1 for object type " - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s.ptp(axis=1) - - s = pd.Series(["a", "b", "c", "d", "e"]) - msg = r"unsupported operand type\(s\) for -: 'str' and 'str'" - with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s.ptp() - - msg = r"Series\.ptp does not implement numeric_only\." - with pytest.raises(NotImplementedError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - s.ptp(numeric_only=True) + assert np.ptp(ser) == np.ptp(arr) def test_repeat(self): s = Series(np.random.randn(3), index=["a", "b", "c"]) @@ -272,23 +160,6 @@ def test_is_monotonic(self): assert s.is_monotonic is False assert s.is_monotonic_decreasing is True - def test_apply_categorical(self): - values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) - s = pd.Series(values, name="XX", index=list("abcdefg")) - result = s.apply(lambda x: x.lower()) - - # should be categorical dtype when the number of categories are - # the same - values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True) - exp = pd.Series(values, name="XX", index=list("abcdefg")) - tm.assert_series_equal(result, exp) - tm.assert_categorical_equal(result.values, exp.values) - - result = s.apply(lambda x: "A") - exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg")) - tm.assert_series_equal(result, exp) - assert result.dtype == np.object - def test_unstack(self): index = MultiIndex( @@ -397,100 +268,3 @@ def test_validate_stat_keepdims(self): ) with pytest.raises(ValueError, match=msg): np.sum(s, keepdims=True) - - -class TestCategoricalSeriesAnalytics: - @pytest.mark.parametrize( - "dtype", - ["int_", "uint", "float_", "unicode_", "timedelta64[h]", "datetime64[D]"], - ) - def test_drop_duplicates_categorical_non_bool(self, dtype, ordered_fixture): - cat_array = np.array([1, 2, 3, 4, 5], dtype=np.dtype(dtype)) - - # Test case 1 - input1 = np.array([1, 2, 3, 3], dtype=np.dtype(dtype)) - tc1 = Series(Categorical(input1, categories=cat_array, ordered=ordered_fixture)) - if dtype == "datetime64[D]": - # pre-empty flaky xfail, tc1 values are seemingly-random - if not (np.array(tc1) == input1).all(): - pytest.xfail(reason="GH#7996") - - expected = Series([False, False, False, True]) - tm.assert_series_equal(tc1.duplicated(), expected) - tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected]) - sc = tc1.copy() - sc.drop_duplicates(inplace=True) - 
tm.assert_series_equal(sc, tc1[~expected]) - - expected = Series([False, False, True, False]) - tm.assert_series_equal(tc1.duplicated(keep="last"), expected) - tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected]) - sc = tc1.copy() - sc.drop_duplicates(keep="last", inplace=True) - tm.assert_series_equal(sc, tc1[~expected]) - - expected = Series([False, False, True, True]) - tm.assert_series_equal(tc1.duplicated(keep=False), expected) - tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected]) - sc = tc1.copy() - sc.drop_duplicates(keep=False, inplace=True) - tm.assert_series_equal(sc, tc1[~expected]) - - # Test case 2 - input2 = np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(dtype)) - tc2 = Series(Categorical(input2, categories=cat_array, ordered=ordered_fixture)) - if dtype == "datetime64[D]": - # pre-empty flaky xfail, tc2 values are seemingly-random - if not (np.array(tc2) == input2).all(): - pytest.xfail(reason="GH#7996") - - expected = Series([False, False, False, False, True, True, False]) - tm.assert_series_equal(tc2.duplicated(), expected) - tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) - sc = tc2.copy() - sc.drop_duplicates(inplace=True) - tm.assert_series_equal(sc, tc2[~expected]) - - expected = Series([False, True, True, False, False, False, False]) - tm.assert_series_equal(tc2.duplicated(keep="last"), expected) - tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected]) - sc = tc2.copy() - sc.drop_duplicates(keep="last", inplace=True) - tm.assert_series_equal(sc, tc2[~expected]) - - expected = Series([False, True, True, False, True, True, False]) - tm.assert_series_equal(tc2.duplicated(keep=False), expected) - tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected]) - sc = tc2.copy() - sc.drop_duplicates(keep=False, inplace=True) - tm.assert_series_equal(sc, tc2[~expected]) - - def test_drop_duplicates_categorical_bool(self, ordered_fixture): - tc = Series( - Categorical( - [True, False, True, False], - categories=[True, False], - ordered=ordered_fixture, - ) - ) - - expected = Series([False, False, True, True]) - tm.assert_series_equal(tc.duplicated(), expected) - tm.assert_series_equal(tc.drop_duplicates(), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(inplace=True) - tm.assert_series_equal(sc, tc[~expected]) - - expected = Series([True, True, False, False]) - tm.assert_series_equal(tc.duplicated(keep="last"), expected) - tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep="last", inplace=True) - tm.assert_series_equal(sc, tc[~expected]) - - expected = Series([True, True, True, True]) - tm.assert_series_equal(tc.duplicated(keep=False), expected) - tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=False, inplace=True) - tm.assert_series_equal(sc, tc[~expected]) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index f8cf6b6a54d14..f96d6ddfc357e 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.util._test_decorators import async_mark + import pandas as pd from pandas import ( Categorical, @@ -19,8 +21,8 @@ period_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.arrays import PeriodArray -import pandas.util.testing as tm import pandas.io.formats.printing as printing @@ -112,10 +114,6 @@ def _pickle_roundtrip(self, obj): 
unpickled = pd.read_pickle(path) return unpickled - def test_argsort_preserve_name(self, datetime_series): - result = datetime_series.argsort() - assert result.name == datetime_series.name - def test_sort_index_name(self, datetime_series): result = datetime_series.sort_index(ascending=False) assert result.name == datetime_series.name @@ -130,8 +128,8 @@ def test_constructor_dict(self): expected = Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"]) tm.assert_series_equal(result, expected) - def test_constructor_subclass_dict(self): - data = tm.TestSubDict((x, 10.0 * x) for x in range(10)) + def test_constructor_subclass_dict(self, dict_subclass): + data = dict_subclass((x, 10.0 * x) for x in range(10)) series = Series(data) expected = Series(dict(data.items())) tm.assert_series_equal(series, expected) @@ -314,7 +312,7 @@ def test_iteritems_strings(self, string_series): for idx, val in string_series.iteritems(): assert val == string_series[idx] - # assert is lazy (genrators don't define reverse, lists do) + # assert is lazy (generators don't define reverse, lists do) assert not hasattr(string_series.iteritems(), "reverse") def test_items_datetimes(self, datetime_series): @@ -325,7 +323,7 @@ def test_items_strings(self, string_series): for idx, val in string_series.items(): assert val == string_series[idx] - # assert is lazy (genrators don't define reverse, lists do) + # assert is lazy (generators don't define reverse, lists do) assert not hasattr(string_series.items(), "reverse") def test_raise_on_info(self): @@ -469,30 +467,6 @@ def f(x): s = Series(np.random.randn(10)) tm.assert_almost_equal(s.ravel(order="F"), s.values.ravel(order="F")) - # compress - # GH 6658 - s = Series([0, 1.0, -1], index=list("abc")) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = np.compress(s > 0, s) - tm.assert_series_equal(result, Series([1.0], index=["b"])) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = np.compress(s < -1, s) - # result empty Index(dtype=object) as the same as original - exp = Series([], dtype="float64", index=Index([], dtype="object")) - tm.assert_series_equal(result, exp) - - s = Series([0, 1.0, -1], index=[0.1, 0.2, 0.3]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = np.compress(s > 0, s) - tm.assert_series_equal(result, Series([1.0], index=[0.2])) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = np.compress(s < -1, s) - # result empty Float64Index as the same as original - exp = Series([], dtype="float64", index=Index([], dtype="float64")) - tm.assert_series_equal(result, exp) - def test_str_accessor_updates_on_inplace(self): s = pd.Series(list("abc")) s.drop([0], inplace=True) @@ -519,13 +493,14 @@ def test_empty_method(self): for full_series in [pd.Series([1]), s2]: assert not full_series.empty - def test_tab_complete_warning(self, ip): + @async_mark() + async def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter code = "import pandas as pd; s = pd.Series()" - ip.run_code(code) + await ip.run_code(code) with tm.assert_produces_warning(None): with provisionalcompleter("ignore"): list(ip.Completer.completions("s.", 1)) @@ -537,6 +512,13 @@ def test_integer_series_size(self): s = Series(range(9), dtype="Int64") assert s.size == 9 + def test_attrs(self): + s = pd.Series([0, 1], 
name="abc") + assert s.attrs == {} + s.attrs["version"] = 1 + result = s + 1 + assert result.attrs == {"version": 1} + class TestCategoricalSeries: @pytest.mark.parametrize( diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 8956b8b0b2d20..a4c55a80a9f0f 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -1,4 +1,4 @@ -from collections import Counter, OrderedDict, defaultdict +from collections import Counter, defaultdict from itertools import chain import numpy as np @@ -6,9 +6,9 @@ import pandas as pd from pandas import DataFrame, Index, Series, isna +import pandas._testing as tm from pandas.conftest import _get_cython_table_params from pandas.core.base import SpecificationError -import pandas.util.testing as tm class TestSeriesApply: @@ -162,6 +162,23 @@ def test_apply_dict_depr(self): with pytest.raises(SpecificationError, match=msg): tsdf.A.agg({"foo": ["sum", "mean"]}) + def test_apply_categorical(self): + values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True) + ser = pd.Series(values, name="XX", index=list("abcdefg")) + result = ser.apply(lambda x: x.lower()) + + # should be categorical dtype when the number of categories are + # the same + values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True) + exp = pd.Series(values, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + tm.assert_categorical_equal(result.values, exp.values) + + result = ser.apply(lambda x: "A") + exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg")) + tm.assert_series_equal(result, exp) + assert result.dtype == np.object + @pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) def test_apply_categorical_with_nan_values(self, series): # GH 20714 bug fixed in: GH 24275 @@ -297,18 +314,16 @@ def test_replicate_describe(self, string_series): # this also tests a result set that is all scalars expected = string_series.describe() result = string_series.apply( - OrderedDict( - [ - ("count", "count"), - ("mean", "mean"), - ("std", "std"), - ("min", "min"), - ("25%", lambda x: x.quantile(0.25)), - ("50%", "median"), - ("75%", lambda x: x.quantile(0.75)), - ("max", "max"), - ] - ) + { + "count": "count", + "mean": "mean", + "std": "std", + "min": "min", + "25%": lambda x: x.quantile(0.25), + "50%": "median", + "75%": lambda x: x.quantile(0.75), + "max": "max", + } ) tm.assert_series_equal(result, expected) @@ -333,7 +348,7 @@ def test_non_callable_aggregates(self): # test when mixed w/ callable reducers result = s.agg(["size", "count", "mean"]) - expected = Series(OrderedDict([("size", 3.0), ("count", 2.0), ("mean", 1.5)])) + expected = Series({"size": 3.0, "count": 2.0, "mean": 1.5}) tm.assert_series_equal(result[expected.index], expected) @pytest.mark.parametrize( @@ -612,6 +627,30 @@ class DictWithoutMissing(dict): expected = Series([np.nan, np.nan, "three"]) tm.assert_series_equal(result, expected) + def test_map_abc_mapping(self, non_mapping_dict_subclass): + # https://github.com/pandas-dev/pandas/issues/29733 + # Check collections.abc.Mapping support as mapper for Series.map + s = Series([1, 2, 3]) + not_a_dictionary = non_mapping_dict_subclass({3: "three"}) + result = s.map(not_a_dictionary) + expected = Series([np.nan, np.nan, "three"]) + tm.assert_series_equal(result, expected) + + def test_map_abc_mapping_with_missing(self, non_mapping_dict_subclass): + # https://github.com/pandas-dev/pandas/issues/29733 + # Check 
collections.abc.Mapping support as mapper for Series.map + class NonDictMappingWithMissing(non_mapping_dict_subclass): + def __missing__(self, key): + return "missing" + + s = Series([1, 2, 3]) + not_a_dictionary = NonDictMappingWithMissing({3: "three"}) + result = s.map(not_a_dictionary) + # __missing__ is a dict concept, not a Mapping concept, + # so it should not change the result! + expected = Series([np.nan, np.nan, "three"]) + tm.assert_series_equal(result, expected) + def test_map_box(self): vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")] s = pd.Series(vals) @@ -741,3 +780,10 @@ def test_apply_scaler_on_date_time_index_aware_series(self): series = tm.makeTimeSeries(nper=30).tz_localize("UTC") result = pd.Series(series.index).apply(lambda x: 1) tm.assert_series_equal(result, pd.Series(np.ones(30), dtype="int64")) + + def test_map_float_to_string_precision(self): + # GH 13228 + ser = pd.Series(1 / 3) + result = ser.map(lambda val: str(val)).to_dict() + expected = {0: "0.3333333333333333"} + assert result == expected diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 68d6169fa4f34..f3ffdc373e178 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -3,10 +3,11 @@ import numpy as np import pytest +from pandas._libs.tslibs import IncompatibleFrequency + import pandas as pd from pandas import Series -from pandas.core.indexes.period import IncompatibleFrequency -import pandas.util.testing as tm +import pandas._testing as tm def _permute(obj): @@ -46,6 +47,22 @@ def test_flex_method_equivalence(self, opname, ts): expected = alt(other, series) tm.assert_almost_equal(result, expected) + def test_flex_method_subclass_metadata_preservation(self, all_arithmetic_operators): + # GH 13208 + class MySeries(Series): + _metadata = ["x"] + + @property + def _constructor(self): + return MySeries + + opname = all_arithmetic_operators + op = getattr(Series, opname) + m = MySeries([1, 2, 3], name="test") + m.x = 42 + result = op(m, 1) + assert result.x == 42 + class TestSeriesArithmetic: # Some of these may end up in tests/arithmetic, but are not yet sorted @@ -171,6 +188,14 @@ def test_ser_cmp_result_names(self, names, op): result = op(ser, tdi) assert result.name == names[2] + # interval dtype + if op in [operator.eq, operator.ne]: + # interval dtype comparisons not yet implemented + ii = pd.interval_range(start=0, periods=5, name=names[0]) + ser = Series(ii).rename(names[1]) + result = op(ser, ii) + assert result.name == names[2] + # categorical if op in [operator.eq, operator.ne]: # categorical dtype comparisons raise for inequalities diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index c6f4ce364f328..239353d3955b4 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -4,65 +4,11 @@ import pytest import pandas as pd -from pandas import DataFrame, DatetimeIndex, Series, date_range -import pandas.util.testing as tm +from pandas import DataFrame, Series +import pandas._testing as tm class TestSeriesCombine: - def test_append(self, datetime_series, string_series, object_series): - appendedSeries = string_series.append(object_series) - for idx, value in appendedSeries.items(): - if idx in string_series.index: - assert value == string_series[idx] - elif idx in object_series.index: - assert value == object_series[idx] - else: - raise AssertionError("orphaned index!") - - msg = "Indexes have 
overlapping values:" - with pytest.raises(ValueError, match=msg): - datetime_series.append(datetime_series, verify_integrity=True) - - def test_append_many(self, datetime_series): - pieces = [datetime_series[:5], datetime_series[5:10], datetime_series[10:]] - - result = pieces[0].append(pieces[1:]) - tm.assert_series_equal(result, datetime_series) - - def test_append_duplicates(self): - # GH 13677 - s1 = pd.Series([1, 2, 3]) - s2 = pd.Series([4, 5, 6]) - exp = pd.Series([1, 2, 3, 4, 5, 6], index=[0, 1, 2, 0, 1, 2]) - tm.assert_series_equal(s1.append(s2), exp) - tm.assert_series_equal(pd.concat([s1, s2]), exp) - - # the result must have RangeIndex - exp = pd.Series([1, 2, 3, 4, 5, 6]) - tm.assert_series_equal( - s1.append(s2, ignore_index=True), exp, check_index_type=True - ) - tm.assert_series_equal( - pd.concat([s1, s2], ignore_index=True), exp, check_index_type=True - ) - - msg = "Indexes have overlapping values:" - with pytest.raises(ValueError, match=msg): - s1.append(s2, verify_integrity=True) - with pytest.raises(ValueError, match=msg): - pd.concat([s1, s2], verify_integrity=True) - - def test_append_tuples(self): - # GH 28410 - s = pd.Series([1, 2, 3]) - list_input = [s, s] - tuple_input = (s, s) - - expected = s.append(list_input) - result = s.append(tuple_input) - - tm.assert_series_equal(expected, result) - def test_combine_scalar(self): # GH 21248 # Note - combine() with another Series is tested elsewhere because @@ -319,99 +265,3 @@ def test_combine_first_dt64(self): rs = s0.combine_first(s1) xp = Series([datetime(2010, 1, 1), "2011"]) tm.assert_series_equal(rs, xp) - - -class TestTimeseries: - def test_append_concat(self): - rng = date_range("5/8/2012 1:45", periods=10, freq="5T") - ts = Series(np.random.randn(len(rng)), rng) - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - - result = ts.append(ts) - result_df = df.append(df) - ex_index = DatetimeIndex(np.tile(rng.values, 2)) - tm.assert_index_equal(result.index, ex_index) - tm.assert_index_equal(result_df.index, ex_index) - - appended = rng.append(rng) - tm.assert_index_equal(appended, ex_index) - - appended = rng.append([rng, rng]) - ex_index = DatetimeIndex(np.tile(rng.values, 3)) - tm.assert_index_equal(appended, ex_index) - - # different index names - rng1 = rng.copy() - rng2 = rng.copy() - rng1.name = "foo" - rng2.name = "bar" - assert rng1.append(rng1).name == "foo" - assert rng1.append(rng2).name is None - - def test_append_concat_tz(self): - # see gh-2938 - rng = date_range("5/8/2012 1:45", periods=10, freq="5T", tz="US/Eastern") - rng2 = date_range("5/8/2012 2:35", periods=10, freq="5T", tz="US/Eastern") - rng3 = date_range("5/8/2012 1:45", periods=20, freq="5T", tz="US/Eastern") - ts = Series(np.random.randn(len(rng)), rng) - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - ts2 = Series(np.random.randn(len(rng2)), rng2) - df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) - - result = ts.append(ts2) - result_df = df.append(df2) - tm.assert_index_equal(result.index, rng3) - tm.assert_index_equal(result_df.index, rng3) - - appended = rng.append(rng2) - tm.assert_index_equal(appended, rng3) - - def test_append_concat_tz_explicit_pytz(self): - # see gh-2938 - from pytz import timezone as timezone - - rng = date_range( - "5/8/2012 1:45", periods=10, freq="5T", tz=timezone("US/Eastern") - ) - rng2 = date_range( - "5/8/2012 2:35", periods=10, freq="5T", tz=timezone("US/Eastern") - ) - rng3 = date_range( - "5/8/2012 1:45", periods=20, freq="5T", tz=timezone("US/Eastern") - ) - ts = 
Series(np.random.randn(len(rng)), rng) - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - ts2 = Series(np.random.randn(len(rng2)), rng2) - df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) - - result = ts.append(ts2) - result_df = df.append(df2) - tm.assert_index_equal(result.index, rng3) - tm.assert_index_equal(result_df.index, rng3) - - appended = rng.append(rng2) - tm.assert_index_equal(appended, rng3) - - def test_append_concat_tz_dateutil(self): - # see gh-2938 - rng = date_range( - "5/8/2012 1:45", periods=10, freq="5T", tz="dateutil/US/Eastern" - ) - rng2 = date_range( - "5/8/2012 2:35", periods=10, freq="5T", tz="dateutil/US/Eastern" - ) - rng3 = date_range( - "5/8/2012 1:45", periods=20, freq="5T", tz="dateutil/US/Eastern" - ) - ts = Series(np.random.randn(len(rng)), rng) - df = DataFrame(np.random.randn(len(rng), 4), index=rng) - ts2 = Series(np.random.randn(len(rng2)), rng2) - df2 = DataFrame(np.random.randn(len(rng2), 4), index=rng2) - - result = ts.append(ts2) - result_df = df.append(df2) - tm.assert_index_equal(result.index, rng3) - tm.assert_index_equal(result_df.index, rng3) - - appended = rng.append(rng2) - tm.assert_index_equal(appended, rng3) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index c772038619db0..d760939657d47 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -26,8 +26,8 @@ period_range, timedelta_range, ) -from pandas.core.arrays import period_array -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.core.arrays import IntervalArray, period_array class TestSeriesConstructors: @@ -67,6 +67,14 @@ def test_invalid_dtype(self): with pytest.raises(TypeError, match=msg): Series([], name="time", dtype=dtype) + def test_invalid_compound_dtype(self): + # GH#13296 + c_dtype = np.dtype([("a", "i8"), ("b", "f4")]) + cdt_arr = np.array([(1, 0.4), (256, -13)], dtype=c_dtype) + + with pytest.raises(ValueError, match="Use DataFrame instead"): + Series(cdt_arr, index=["A", "B"]) + def test_scalar_conversion(self): # Pass in scalar is disabled @@ -773,7 +781,7 @@ def test_constructor_dtype_datetime64(self): dts.astype("int64") # invalid casting - msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to" r" \[int32\]" + msg = r"cannot astype a datetimelike from \[datetime64\[ns\]\] to \[int32\]" with pytest.raises(TypeError, match=msg): dts.astype("int32") @@ -959,16 +967,43 @@ def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): expected = Series(pd.Timestamp(arg)).dt.tz_localize("CET") tm.assert_series_equal(result, expected) - def test_construction_interval(self): + def test_constructor_datetime64_bigendian(self): + # GH#30976 + ms = np.datetime64(1, "ms") + arr = np.array([np.datetime64(1, "ms")], dtype=">M8[ms]") + + result = Series(arr) + expected = Series([Timestamp(ms)]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("interval_constructor", [IntervalIndex, IntervalArray]) + def test_construction_interval(self, interval_constructor): # construction from interval & array of intervals - index = IntervalIndex.from_breaks(np.arange(3), closed="right") - result = Series(index) - repr(result) - str(result) - tm.assert_index_equal(Index(result.values), index) + intervals = interval_constructor.from_breaks(np.arange(3), closed="right") + result = Series(intervals) + assert result.dtype == "interval[int64]" + tm.assert_index_equal(Index(result.values), Index(intervals)) - result = 
Series(index.values) - tm.assert_index_equal(Index(result.values), index) + @pytest.mark.parametrize( + "data_constructor", [list, np.array], ids=["list", "ndarray[object]"] + ) + def test_constructor_infer_interval(self, data_constructor): + # GH 23563: consistent closed results in interval dtype + data = [pd.Interval(0, 1), pd.Interval(0, 2), None] + result = pd.Series(data_constructor(data)) + expected = pd.Series(IntervalArray(data)) + assert result.dtype == "interval[float64]" + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "data_constructor", [list, np.array], ids=["list", "ndarray[object]"] + ) + def test_constructor_interval_mixed_closed(self, data_constructor): + # GH 23563: mixed closed results in object dtype (not interval dtype) + data = [pd.Interval(0, 1, closed="both"), pd.Interval(0, 2, closed="neither")] + result = Series(data_constructor(data)) + assert result.dtype == object + assert result.tolist() == data def test_construction_consistency(self): @@ -985,17 +1020,16 @@ def test_construction_consistency(self): result = Series(s.values, dtype=s.dtype) tm.assert_series_equal(result, s) - def test_constructor_infer_period(self): + @pytest.mark.parametrize( + "data_constructor", [list, np.array], ids=["list", "ndarray[object]"] + ) + def test_constructor_infer_period(self, data_constructor): data = [pd.Period("2000", "D"), pd.Period("2001", "D"), None] - result = pd.Series(data) + result = pd.Series(data_constructor(data)) expected = pd.Series(period_array(data)) tm.assert_series_equal(result, expected) assert result.dtype == "Period[D]" - data = np.asarray(data, dtype=object) - tm.assert_series_equal(result, expected) - assert result.dtype == "Period[D]" - def test_constructor_period_incompatible_frequency(self): data = [pd.Period("2000", "D"), pd.Period("2001", "A")] result = pd.Series(data) @@ -1081,6 +1115,14 @@ def create_data(constructor): tm.assert_series_equal(result_datetime, expected) tm.assert_series_equal(result_Timestamp, expected) + def test_constructor_mapping(self, non_mapping_dict_subclass): + # GH 29788 + ndm = non_mapping_dict_subclass({3: "three"}) + result = Series(ndm) + expected = Series(["three"], index=[3]) + + tm.assert_series_equal(result, expected) + def test_constructor_list_of_tuples(self): data = [(1, 1), (2, 2), (2, 3)] s = Series(data) @@ -1198,7 +1240,7 @@ def test_constructor_dtype_timedelta64(self): td.astype("int64") # invalid casting - msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to" r" \[int32\]" + msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to \[int32\]" with pytest.raises(TypeError, match=msg): td.astype("int32") diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 0fac279291c66..885b5bf0476f2 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -10,10 +10,8 @@ import numpy as np import pytest -from pandas.compat.numpy import _np_version_under1p18 - import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm def _check_accum_op(name, series, check_dtype=True): @@ -63,16 +61,18 @@ def test_cummax(self, datetime_series): tm.assert_series_equal(result, expected) - @pytest.mark.xfail( - not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" - ) - def test_cummin_datetime64(self): + @pytest.mark.parametrize("tz", [None, "US/Pacific"]) + def test_cummin_datetime64(self, tz): s = pd.Series( - pd.to_datetime(["NaT", "2000-1-2", "NaT", 
"2000-1-1", "NaT", "2000-1-3"]) + pd.to_datetime( + ["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"] + ).tz_localize(tz) ) expected = pd.Series( - pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-1"]) + pd.to_datetime( + ["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-1"] + ).tz_localize(tz) ) result = s.cummin(skipna=True) tm.assert_series_equal(expected, result) @@ -80,21 +80,23 @@ def test_cummin_datetime64(self): expected = pd.Series( pd.to_datetime( ["NaT", "2000-1-2", "2000-1-2", "2000-1-1", "2000-1-1", "2000-1-1"] - ) + ).tz_localize(tz) ) result = s.cummin(skipna=False) tm.assert_series_equal(expected, result) - @pytest.mark.xfail( - not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" - ) - def test_cummax_datetime64(self): + @pytest.mark.parametrize("tz", [None, "US/Pacific"]) + def test_cummax_datetime64(self, tz): s = pd.Series( - pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"]) + pd.to_datetime( + ["NaT", "2000-1-2", "NaT", "2000-1-1", "NaT", "2000-1-3"] + ).tz_localize(tz) ) expected = pd.Series( - pd.to_datetime(["NaT", "2000-1-2", "NaT", "2000-1-2", "NaT", "2000-1-3"]) + pd.to_datetime( + ["NaT", "2000-1-2", "NaT", "2000-1-2", "NaT", "2000-1-3"] + ).tz_localize(tz) ) result = s.cummax(skipna=True) tm.assert_series_equal(expected, result) @@ -102,14 +104,11 @@ def test_cummax_datetime64(self): expected = pd.Series( pd.to_datetime( ["NaT", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-2", "2000-1-3"] - ) + ).tz_localize(tz) ) result = s.cummax(skipna=False) tm.assert_series_equal(expected, result) - @pytest.mark.xfail( - not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" - ) def test_cummin_timedelta64(self): s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) @@ -125,9 +124,6 @@ def test_cummin_timedelta64(self): result = s.cummin(skipna=False) tm.assert_series_equal(expected, result) - @pytest.mark.xfail( - not _np_version_under1p18, reason="numpy 1.18 changed min/max behavior for NaT" - ) def test_cummax_timedelta64(self): s = pd.Series(pd.to_timedelta(["NaT", "2 min", "NaT", "1 min", "NaT", "3 min"])) diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index aa56131f05570..b8be4ea137e3d 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -24,9 +24,9 @@ period_range, timedelta_range, ) +import pandas._testing as tm from pandas.core.arrays import PeriodArray import pandas.core.common as com -import pandas.util.testing as tm class TestSeriesDatetimeValues: diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index ff4842791b4fd..1fc582156a884 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -20,7 +20,7 @@ Timestamp, date_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesDtypes: @@ -193,8 +193,8 @@ def test_astype_dict_like(self, dtype_class): dt3 = dtype_class({"abc": str, "def": str}) msg = ( - "Only the Series name can be used for the key in Series dtype" - r" mappings\." + "Only the Series name can be used for the key in Series dtype " + r"mappings\." 
) with pytest.raises(KeyError, match=msg): s.astype(dt3) @@ -273,7 +273,7 @@ def test_astype_categorical_to_other(self): expected = s tm.assert_series_equal(s.astype("category"), expected) tm.assert_series_equal(s.astype(CategoricalDtype()), expected) - msg = r"could not convert string to float|" r"invalid literal for float\(\)" + msg = r"could not convert string to float|invalid literal for float\(\)" with pytest.raises(ValueError, match=msg): s.astype("float64") @@ -410,8 +410,8 @@ def test_arg_for_errors_in_astype(self): s = Series([1, 2, 3]) msg = ( - r"Expected value of kwarg 'errors' to be one of \['raise'," - r" 'ignore'\]\. Supplied value is 'False'" + r"Expected value of kwarg 'errors' to be one of \['raise', " + r"'ignore'\]\. Supplied value is 'False'" ) with pytest.raises(ValueError, match=msg): s.astype(np.float64, errors=False) @@ -465,13 +465,6 @@ def test_infer_objects_series(self): assert actual.dtype == "object" tm.assert_series_equal(actual, expected) - def test_is_homogeneous_type(self): - with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): - empty = Series() - assert empty._is_homogeneous_type - assert Series([1, 2])._is_homogeneous_type - assert Series(pd.Categorical([1, 2]))._is_homogeneous_type - @pytest.mark.parametrize( "data", [ diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 666354e70bdd4..3513db6177951 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -2,11 +2,11 @@ import pytest from pandas import Categorical, Series +import pandas._testing as tm from pandas.core.construction import create_series_with_explicit_dtype -import pandas.util.testing as tm -def test_value_counts_nunique(): +def test_nunique(): # basics.rst doc example series = Series(np.random.randn(500)) series[20:500] = np.nan @@ -90,72 +90,3 @@ def __ne__(self, other): s.is_unique captured = capsys.readouterr() assert len(captured.err) == 0 - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, False, False, True, True, False])), - ("last", Series([False, True, True, False, False, False, False])), - (False, Series([False, True, True, False, True, True, False])), - ], -) -def test_drop_duplicates(any_numpy_dtype, keep, expected): - tc = Series([1, 0, 3, 5, 3, 0, 4], dtype=np.dtype(any_numpy_dtype)) - - if tc.dtype == "bool": - pytest.skip("tested separately in test_drop_duplicates_bool") - - tm.assert_series_equal(tc.duplicated(keep=keep), expected) - tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=keep, inplace=True) - tm.assert_series_equal(sc, tc[~expected]) - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, True])), - ("last", Series([True, True, False, False])), - (False, Series([True, True, True, True])), - ], -) -def test_drop_duplicates_bool(keep, expected): - tc = Series([True, False, True, False]) - - tm.assert_series_equal(tc.duplicated(keep=keep), expected) - tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=keep, inplace=True) - tm.assert_series_equal(sc, tc[~expected]) - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, False, True], name="name")), - ("last", Series([True, True, False, False, False], name="name")), - (False, Series([True, True, True, False, True], name="name")), - ], -) -def test_duplicated_keep(keep, 
expected): - s = Series(["a", "b", "b", "c", "a"], name="name") - - result = s.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "keep, expected", - [ - ("first", Series([False, False, True, False, True])), - ("last", Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])), - ], -) -def test_duplicated_nan_none(keep, expected): - s = Series([np.nan, 3, 3, None, np.nan], dtype=object) - - result = s.duplicated(keep=keep) - tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py index efcb500a0b79f..4c817ed2e2d59 100644 --- a/pandas/tests/series/test_internals.py +++ b/pandas/tests/series/test_internals.py @@ -5,8 +5,8 @@ import pandas as pd from pandas import NaT, Series, Timestamp +import pandas._testing as tm from pandas.core.internals.blocks import IntBlock -import pandas.util.testing as tm class TestSeriesInternals: diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index f53081ac53b01..510c11a51ca38 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm from pandas.io.common import get_handle diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 8e1fee4d542e7..6b7d9e00a5228 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -21,25 +21,7 @@ date_range, isna, ) -import pandas.util.testing as tm - - -def _skip_if_no_pchip(): - try: - from scipy.interpolate import pchip_interpolate # noqa - except ImportError: - import pytest - - pytest.skip("scipy.interpolate.pchip missing") - - -def _skip_if_no_akima(): - try: - from scipy.interpolate import Akima1DInterpolator # noqa - except ImportError: - import pytest - - pytest.skip("scipy.interpolate.Akima1DInterpolator missing") +import pandas._testing as tm def _simple_ts(start, end, freq="D"): @@ -293,7 +275,7 @@ def test_datetime64_tz_fillna(self): ["2011-01-01 10:00", pd.NaT, "2011-01-03 10:00", pd.NaT], tz=tz ) s = pd.Series(idx) - assert s.dtype == "datetime64[ns, {0}]".format(tz) + assert s.dtype == f"datetime64[ns, {tz}]" tm.assert_series_equal(pd.isna(s), null_loc) result = s.fillna(pd.Timestamp("2011-01-02 10:00")) @@ -475,9 +457,9 @@ def test_fillna_consistency(self): def test_where_sparse(self): # GH#17198 make sure we dont get an AttributeError for sp_index - ser = pd.Series(pd.SparseArray([1, 2])) + ser = pd.Series(pd.arrays.SparseArray([1, 2])) result = ser.where(ser >= 2, 0) - expected = pd.Series(pd.SparseArray([0, 2])) + expected = pd.Series(pd.arrays.SparseArray([0, 2])) tm.assert_series_equal(result, expected) def test_datetime64tz_fillna_round_issue(self): @@ -520,11 +502,11 @@ def test_fillna_int(self): def test_fillna_raise(self): s = Series(np.random.randint(-100, 100, 50)) - msg = '"value" parameter must be a scalar or dict, but you passed a' ' "list"' + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' with pytest.raises(TypeError, match=msg): s.fillna([1, 2]) - msg = '"value" parameter must be a scalar or dict, but you passed a' ' "tuple"' + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' with pytest.raises(TypeError, match=msg): s.fillna((1, 2)) @@ -611,11 +593,11 @@ def test_fillna_categorical_raise(self): with pytest.raises(ValueError, 
match="fill value must be in categories"): s.fillna({1: "d", 3: "a"}) - msg = '"value" parameter must be a scalar or ' 'dict, but you passed a "list"' + msg = '"value" parameter must be a scalar or dict, but you passed a "list"' with pytest.raises(TypeError, match=msg): s.fillna(["a", "b"]) - msg = '"value" parameter must be a scalar or ' 'dict, but you passed a "tuple"' + msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' with pytest.raises(TypeError, match=msg): s.fillna(("a", "b")) @@ -1099,7 +1081,6 @@ def test_interpolate_time_raises_for_non_timeseries(self): @td.skip_if_no_scipy def test_interpolate_pchip(self): - _skip_if_no_pchip() ser = Series(np.sort(np.random.uniform(size=100))) @@ -1113,7 +1094,6 @@ def test_interpolate_pchip(self): @td.skip_if_no_scipy def test_interpolate_akima(self): - _skip_if_no_akima() ser = Series([10, 11, 12, 13]) @@ -1200,8 +1180,8 @@ def test_interpolate_index_values(self): def test_interpolate_non_ts(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) msg = ( - "time-weighted interpolation only works on Series or DataFrames" - " with a DatetimeIndex" + "time-weighted interpolation only works on Series or DataFrames " + "with a DatetimeIndex" ) with pytest.raises(ValueError, match=msg): s.interpolate(method="time") @@ -1304,7 +1284,7 @@ def test_interpolate_invalid_float_limit(self, nontemporal_method): def test_interp_invalid_method(self, invalid_method): s = Series([1, 3, np.nan, 12, np.nan, 25]) - msg = "method must be one of.* Got '{}' instead".format(invalid_method) + msg = f"method must be one of.* Got '{invalid_method}' instead" with pytest.raises(ValueError, match=msg): s.interpolate(method=invalid_method) @@ -1344,8 +1324,8 @@ def test_interp_limit_bad_direction(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) msg = ( - r"Invalid limit_direction: expecting one of \['forward'," - r" 'backward', 'both'\], got 'abc'" + r"Invalid limit_direction: expecting one of \['forward', " + r"'backward', 'both'\], got 'abc'" ) with pytest.raises(ValueError, match=msg): s.interpolate(method="linear", limit=2, limit_direction="abc") @@ -1367,6 +1347,7 @@ def test_interp_limit_area(self): [np.nan, np.nan, 3.0, 4.0, np.nan, np.nan, 7.0, np.nan, np.nan] ) result = s.interpolate(method="linear", limit_area="inside", limit=1) + tm.assert_series_equal(result, expected) expected = Series([np.nan, np.nan, 3.0, 4.0, np.nan, 6.0, 7.0, np.nan, np.nan]) result = s.interpolate( @@ -1382,6 +1363,7 @@ def test_interp_limit_area(self): [np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan] ) result = s.interpolate(method="linear", limit_area="outside", limit=1) + tm.assert_series_equal(result, expected) expected = Series([np.nan, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, 7.0, np.nan]) result = s.interpolate( @@ -1391,8 +1373,9 @@ def test_interp_limit_area(self): expected = Series([3.0, 3.0, 3.0, np.nan, np.nan, np.nan, 7.0, np.nan, np.nan]) result = s.interpolate( - method="linear", limit_area="outside", direction="backward" + method="linear", limit_area="outside", limit_direction="backward" ) + tm.assert_series_equal(result, expected) # raises an error even if limit type is wrong. 
msg = r"Invalid limit_area: expecting one of \['inside', 'outside'\], got abc" @@ -1619,7 +1602,7 @@ def test_interp_non_timedelta_index(self, interp_methods_ind, ind): method, kwargs = interp_methods_ind if method == "pchip": - _skip_if_no_pchip() + pytest.importorskip("scipy") if method == "linear": result = df[0].interpolate(**kwargs) @@ -1628,9 +1611,9 @@ def test_interp_non_timedelta_index(self, interp_methods_ind, ind): else: expected_error = ( "Index column must be numeric or datetime type when " - "using {method} method other than linear. " + f"using {method} method other than linear. " "Try setting a numeric or datetime index column before " - "interpolating.".format(method=method) + "interpolating." ) with pytest.raises(ValueError, match=expected_error): df[0].interpolate(method=method, **kwargs) @@ -1647,7 +1630,7 @@ def test_interpolate_timedelta_index(self, interp_methods_ind): method, kwargs = interp_methods_ind if method == "pchip": - _skip_if_no_pchip() + pytest.importorskip("scipy") if method in {"linear", "pchip"}: result = df[0].interpolate(method=method, **kwargs) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index 06fe64d69fb6b..bdd9f92d92d3f 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -6,9 +6,9 @@ import pandas as pd from pandas import Categorical, DataFrame, Index, Series, bdate_range, date_range, isna +import pandas._testing as tm from pandas.core import ops import pandas.core.nanops as nanops -import pandas.util.testing as tm class TestSeriesLogicalOps: diff --git a/pandas/tests/series/test_period.py b/pandas/tests/series/test_period.py index 4aeb211170d8f..03fee389542e3 100644 --- a/pandas/tests/series/test_period.py +++ b/pandas/tests/series/test_period.py @@ -3,8 +3,8 @@ import pandas as pd from pandas import DataFrame, Period, Series, period_range +import pandas._testing as tm from pandas.core.arrays import PeriodArray -import pandas.util.testing as tm class TestSeriesPeriod: diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 008ae50e4cde5..64a8c4569406e 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -15,7 +15,7 @@ period_range, timedelta_range, ) -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesRepr: diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index 5e2d23a70e5be..73247bbf8b3d6 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -1,4 +1,4 @@ -import pandas.util.testing as tm +import pandas._testing as tm class TestSeriesSubclassing: diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index b9bd7744d3f9c..a2d14f27d7b7a 100644 --- a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -21,7 +21,7 @@ timedelta_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.offsets import BDay, BMonthEnd @@ -76,69 +76,6 @@ def test_asfreq_datetimeindex_empty_series(self): result = Series([3], index=index.copy()).asfreq("H") tm.assert_index_equal(expected.index, result.index) - def test_pct_change(self, datetime_series): - rs = datetime_series.pct_change(fill_method=None) - tm.assert_series_equal(rs, datetime_series / datetime_series.shift(1) - 1) - - rs = datetime_series.pct_change(2) - filled = datetime_series.fillna(method="pad") - 
tm.assert_series_equal(rs, filled / filled.shift(2) - 1) - - rs = datetime_series.pct_change(fill_method="bfill", limit=1) - filled = datetime_series.fillna(method="bfill", limit=1) - tm.assert_series_equal(rs, filled / filled.shift(1) - 1) - - rs = datetime_series.pct_change(freq="5D") - filled = datetime_series.fillna(method="pad") - tm.assert_series_equal( - rs, (filled / filled.shift(freq="5D") - 1).reindex_like(filled) - ) - - def test_pct_change_with_duplicate_axis(self): - # GH 28664 - common_idx = date_range("2019-11-14", periods=5, freq="D") - result = Series(range(5), common_idx).pct_change(freq="B") - - # the reason that the expected should be like this is documented at PR 28681 - expected = Series([np.NaN, np.inf, np.NaN, np.NaN, 3.0], common_idx) - - tm.assert_series_equal(result, expected) - - def test_pct_change_shift_over_nas(self): - s = Series([1.0, 1.5, np.nan, 2.5, 3.0]) - - chg = s.pct_change() - expected = Series([np.nan, 0.5, 0.0, 2.5 / 1.5 - 1, 0.2]) - tm.assert_series_equal(chg, expected) - - @pytest.mark.parametrize( - "freq, periods, fill_method, limit", - [ - ("5B", 5, None, None), - ("3B", 3, None, None), - ("3B", 3, "bfill", None), - ("7B", 7, "pad", 1), - ("7B", 7, "bfill", 3), - ("14B", 14, None, None), - ], - ) - def test_pct_change_periods_freq( - self, freq, periods, fill_method, limit, datetime_series - ): - # GH 7292 - rs_freq = datetime_series.pct_change( - freq=freq, fill_method=fill_method, limit=limit - ) - rs_periods = datetime_series.pct_change( - periods, fill_method=fill_method, limit=limit - ) - tm.assert_series_equal(rs_freq, rs_periods) - - empty_ts = Series(index=datetime_series.index, dtype=object) - rs_freq = empty_ts.pct_change(freq=freq, fill_method=fill_method, limit=limit) - rs_periods = empty_ts.pct_change(periods, fill_method=fill_method, limit=limit) - tm.assert_series_equal(rs_freq, rs_periods) - def test_autocorr(self, datetime_series): # Just run the function corr1 = datetime_series.autocorr() @@ -200,7 +137,9 @@ def test_first_last_valid(self, datetime_series): assert ts.last_valid_index().freq == ts.index.freq def test_mpl_compat_hack(self, datetime_series): - result = datetime_series[:, np.newaxis] + with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + # GH#30588 multi-dimensional indexing deprecated + result = datetime_series[:, np.newaxis] expected = datetime_series.values[:, np.newaxis] tm.assert_almost_equal(result, expected) @@ -564,10 +503,7 @@ def test_between_time_raises(self): def test_between_time_types(self): # GH11818 rng = date_range("1/1/2000", "1/5/2000", freq="5min") - msg = ( - r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\]" - " to a time" - ) + msg = r"Cannot convert arg \[datetime\.datetime\(2010, 1, 2, 1, 0\)\] to a time" with pytest.raises(ValueError, match=msg): rng.indexer_between_time(datetime(2010, 1, 2, 1), datetime(2010, 1, 2, 5)) @@ -794,14 +730,12 @@ def test_asarray_tz_naive(self): # This shouldn't produce a warning. 
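For context, a minimal sketch of the assert_produces_warning helper that the hunks above add and remove (tm is assumed to be pandas._testing, as imported elsewhere in this patch, and the deprecated lookup mirrors GH#30588 in the pandas version this patch targets):

import numpy as np
import pandas as pd
import pandas._testing as tm

ser = pd.Series(pd.date_range("2000", periods=2))

# Assert that a statement emits the expected warning class...
with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False):
    ser[:, np.newaxis]  # multi-dimensional indexing on Series is deprecated

# ...or assert that no warning is emitted at all.
with tm.assert_produces_warning(None):
    np.asarray(ser)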
ser = pd.Series(pd.date_range("2000", periods=2)) expected = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") - with tm.assert_produces_warning(None): - result = np.asarray(ser) + result = np.asarray(ser) tm.assert_numpy_array_equal(result, expected) # optionally, object - with tm.assert_produces_warning(None): - result = np.asarray(ser, dtype=object) + result = np.asarray(ser, dtype=object) expected = np.array([pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-02")]) tm.assert_numpy_array_equal(result, expected) @@ -810,15 +744,12 @@ def test_asarray_tz_aware(self): tz = "US/Central" ser = pd.Series(pd.date_range("2000", periods=2, tz=tz)) expected = np.array(["2000-01-01T06", "2000-01-02T06"], dtype="M8[ns]") - # We warn by default and return an ndarray[M8[ns]] - with tm.assert_produces_warning(FutureWarning): - result = np.asarray(ser) + result = np.asarray(ser, dtype="datetime64[ns]") tm.assert_numpy_array_equal(result, expected) # Old behavior with no warning - with tm.assert_produces_warning(None): - result = np.asarray(ser, dtype="M8[ns]") + result = np.asarray(ser, dtype="M8[ns]") tm.assert_numpy_array_equal(result, expected) @@ -826,7 +757,6 @@ def test_asarray_tz_aware(self): expected = np.array( [pd.Timestamp("2000-01-01", tz=tz), pd.Timestamp("2000-01-02", tz=tz)] ) - with tm.assert_produces_warning(None): - result = np.asarray(ser, dtype=object) + result = np.asarray(ser, dtype=object) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/series/test_timezones.py b/pandas/tests/series/test_timezones.py index 5e255e7cd5dcd..a363f927d10a9 100644 --- a/pandas/tests/series/test_timezones.py +++ b/pandas/tests/series/test_timezones.py @@ -11,8 +11,8 @@ from pandas._libs.tslibs import conversion, timezones from pandas import DatetimeIndex, Index, NaT, Series, Timestamp +import pandas._testing as tm from pandas.core.indexes.datetimes import date_range -import pandas.util.testing as tm class TestSeriesTimezones: diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 120eaeaf785b0..ece7f1f21ab23 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -5,7 +5,8 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm +from pandas.arrays import SparseArray UNARY_UFUNCS = [np.positive, np.floor, np.exp] BINARY_UFUNCS = [np.add, np.logaddexp] # dunder op @@ -33,7 +34,7 @@ def test_unary_ufunc(ufunc, sparse): array = np.random.randint(0, 10, 10, dtype="int64") array[::2] = 0 if sparse: - array = pd.SparseArray(array, dtype=pd.SparseDtype("int64", 0)) + array = SparseArray(array, dtype=pd.SparseDtype("int64", 0)) index = list(string.ascii_letters[:10]) name = "name" @@ -51,8 +52,8 @@ def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc): # Test that ufunc(Series(a), array) == Series(ufunc(a, b)) a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. 
series = pd.Series(a1, name=name) @@ -79,8 +80,8 @@ def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc): # * ufunc(Index, Series) dispatches to Series (returns a Series) a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. series = pd.Series(a1, name=name) @@ -110,8 +111,8 @@ def test_binary_ufunc_with_series( # with alignment between the indices a1, a2 = arrays_for_binary_ufunc if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) name = "name" # op(Series, array) preserves the name. series = pd.Series(a1, name=name) @@ -149,7 +150,7 @@ def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc): # * ufunc(Series, scalar) == ufunc(scalar, Series) array, _ = arrays_for_binary_ufunc if sparse: - array = pd.SparseArray(array) + array = SparseArray(array) other = 2 series = pd.Series(array, name="name") @@ -183,8 +184,8 @@ def test_multiple_ouput_binary_ufuncs(ufunc, sparse, shuffle, arrays_for_binary_ a2[a2 == 0] = 1 if sparse: - a1 = pd.SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) - a2 = pd.SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) + a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0)) + a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0)) s1 = pd.Series(a1) s2 = pd.Series(a2) @@ -209,7 +210,7 @@ def test_multiple_ouput_ufunc(sparse, arrays_for_binary_ufunc): array, _ = arrays_for_binary_ufunc if sparse: - array = pd.SparseArray(array) + array = SparseArray(array) series = pd.Series(array, name="name") result = np.modf(series) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 82f647c9385b2..6c7f8c9b0475e 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -30,11 +30,11 @@ Timestamp, compat, ) +import pandas._testing as tm from pandas.conftest import BYTES_DTYPES, STRING_DTYPES import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray import pandas.core.common as com -import pandas.util.testing as tm class TestFactorize: @@ -653,8 +653,8 @@ class TestIsin: def test_invalid(self): msg = ( - r"only list-like objects are allowed to be passed to isin\(\)," - r" you passed a \[int\]" + r"only list-like objects are allowed to be passed to isin\(\), " + r"you passed a \[int\]" ) with pytest.raises(TypeError, match=msg): algos.isin(1, 1) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index a8a0fcea7182c..186c735a0bff9 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -60,10 +60,11 @@ def test_random_state(): assert com.random_state() is np.random # Error for floats or strings - with pytest.raises(ValueError): + msg = "random_state must be an integer, a numpy RandomState, or None" + with pytest.raises(ValueError, match=msg): com.random_state("test") - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): com.random_state(5.5) @@ -93,15 +94,17 @@ def test_dict_compat(): def test_standardize_mapping(): # No uninitialized defaultdicts - with pytest.raises(TypeError): + msg = r"to_dict\(\) only accepts initialized defaultdicts" + with 
pytest.raises(TypeError, match=msg): com.standardize_mapping(collections.defaultdict) # No non-mapping subtypes, instance - with pytest.raises(TypeError): + msg = "unsupported type: " + with pytest.raises(TypeError, match=msg): com.standardize_mapping([]) # No non-mapping subtypes, class - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): com.standardize_mapping(list) fill = {"bad": "data"} diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index ea128c8c3a422..02898988ca8aa 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -8,8 +8,8 @@ import numpy as np # noqa import pytest -from pandas import DataFrame, Series -import pandas.util.testing as tm +from pandas import DataFrame +import pandas._testing as tm def import_module(name): @@ -55,6 +55,10 @@ def test_oo_optimizable(): @tm.network # Cython import warning @pytest.mark.filterwarnings("ignore:can't:ImportWarning") +@pytest.mark.filterwarnings( + # patsy needs to update their imports + "ignore:Using or importing the ABCs from 'collections:DeprecationWarning" +) def test_statsmodels(): statsmodels = import_module("statsmodels") # noqa @@ -110,26 +114,6 @@ def test_geopandas(): assert geopandas.read_file(fp) is not None -def test_geopandas_coordinate_indexer(): - # this test is included to have coverage of one case in the indexing.py - # code that is only kept for compatibility with geopandas, see - # https://github.com/pandas-dev/pandas/issues/27258 - # We should be able to remove this after some time when its usage is - # removed in geopandas - from pandas.core.indexing import _NDFrameIndexer - - class _CoordinateIndexer(_NDFrameIndexer): - def _getitem_tuple(self, tup): - obj = self.obj - xs, ys = tup - return obj[xs][ys] - - Series._create_indexer("cx", _CoordinateIndexer) - s = Series(range(5)) - res = s.cx[:, :] - tm.assert_series_equal(s, res) - - # Cython import warning @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") @pytest.mark.filterwarnings("ignore:RangeIndex.* is deprecated:DeprecationWarning") @@ -152,7 +136,12 @@ def test_missing_required_dependency(): # https://github.com/MacPython/pandas-wheels/pull/50 call = ["python", "-sSE", "-c", "import pandas"] - with pytest.raises(subprocess.CalledProcessError) as exc: + msg = ( + r"Command '\['python', '-sSE', '-c', 'import pandas'\]' " + "returned non-zero exit status 1." 
+ ) + + with pytest.raises(subprocess.CalledProcessError, match=msg) as exc: subprocess.check_output(call, stderr=subprocess.STDOUT) output = exc.value.stdout.decode() diff --git a/pandas/tests/test_errors.py b/pandas/tests/test_errors.py index fa2142444ed92..939ea8a64d94d 100644 --- a/pandas/tests/test_errors.py +++ b/pandas/tests/test_errors.py @@ -22,12 +22,15 @@ def test_exception_importable(exc): from pandas import errors - e = getattr(errors, exc) - assert e is not None + err = getattr(errors, exc) + assert err is not None # check that we can raise on them - with pytest.raises(e): - raise e() + + msg = "^$" + + with pytest.raises(err, match=msg): + raise err() def test_catch_oob(): diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 9808c3d78b436..fadab5d821470 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -5,9 +5,9 @@ from numpy.random import randn import pytest +import pandas._testing as tm from pandas.core.api import DataFrame from pandas.core.computation import expressions as expr -import pandas.util.testing as tm _frame = DataFrame(randn(10000, 4), columns=list("ABCD"), dtype="float64") _frame2 = DataFrame(randn(100, 4), columns=list("ABCD"), dtype="float64") diff --git a/pandas/tests/test_join.py b/pandas/tests/test_join.py index 8940a82b33777..129dc275c4d5a 100644 --- a/pandas/tests/test_join.py +++ b/pandas/tests/test_join.py @@ -4,7 +4,7 @@ from pandas._libs import join as _join from pandas import Categorical, DataFrame, Index, merge -import pandas.util.testing as tm +import pandas._testing as tm class TestIndexer: diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 77841f0bb9f0d..d914cf873de24 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -4,7 +4,7 @@ from pandas._libs import lib, writers as libwriters from pandas import Index -import pandas.util.testing as tm +import pandas._testing as tm class TestMisc: @@ -22,7 +22,8 @@ def test_max_len_string_array(self): assert libwriters.max_len_string_array(arr) == 3 # raises - with pytest.raises(TypeError): + msg = "No matching signature found" + with pytest.raises(TypeError, match=msg): libwriters.max_len_string_array(arr.astype("U")) def test_fast_unique_multiple_list_gen_sort(self): @@ -100,9 +101,11 @@ def test_maybe_indices_to_slice_right_edge(self): assert not isinstance(maybe_slice, slice) tm.assert_numpy_array_equal(maybe_slice, indices) - with pytest.raises(IndexError): + msg = "index 100 is out of bounds for axis (0|1) with size 100" + + with pytest.raises(IndexError, match=msg): target[indices] - with pytest.raises(IndexError): + with pytest.raises(IndexError, match=msg): target[maybe_slice] indices = np.array([100, 99, 98, 97], dtype=np.int64) @@ -111,9 +114,9 @@ def test_maybe_indices_to_slice_right_edge(self): assert not isinstance(maybe_slice, slice) tm.assert_numpy_array_equal(maybe_slice, indices) - with pytest.raises(IndexError): + with pytest.raises(IndexError, match=msg): target[indices] - with pytest.raises(IndexError): + with pytest.raises(IndexError, match=msg): target[maybe_slice] for case in [[99, 97, 99, 96], [99, 99, 98, 97], [98, 98, 97, 96]]: diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 39c122addd8b1..5382ad84bcca2 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -12,7 +12,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, isna -import pandas.util.testing as tm 
+import pandas._testing as tm AGG_FUNCTIONS = [ "sum", @@ -1359,6 +1359,30 @@ def test_mixed_depth_drop(self): ) tm.assert_frame_equal(expected, result) + def test_drop_multiindex_other_level_nan(self): + # GH 12754 + df = ( + DataFrame( + { + "A": ["one", "one", "two", "two"], + "B": [np.nan, 0.0, 1.0, 2.0], + "C": ["a", "b", "c", "c"], + "D": [1, 2, 3, 4], + } + ) + .set_index(["A", "B", "C"]) + .sort_index() + ) + result = df.drop("c", level="C") + expected = DataFrame( + [2, 1], + columns=["D"], + index=pd.MultiIndex.from_tuples( + [("one", 0.0, "b"), ("one", np.nan, "a")], names=["A", "B", "C"] + ), + ) + tm.assert_frame_equal(result, expected) + def test_drop_nonunique(self): df = DataFrame( [ @@ -2286,6 +2310,14 @@ def test_sort_index_and_reconstruction_doc_example(self): tm.assert_frame_equal(result, expected) + def test_sort_index_non_existent_label_multiindex(self): + # GH 12261 + df = DataFrame(0, columns=[], index=pd.MultiIndex.from_product([[], []])) + df.loc["b", "2"] = 1 + df.loc["a", "3"] = 1 + result = df.sort_index().index.is_monotonic + assert result is True + def test_sort_index_reorder_on_ops(self): # 15687 df = DataFrame( diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index c207c803510ca..2c5d028ebe42e 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -11,9 +11,9 @@ import pandas as pd from pandas import Series, isna +import pandas._testing as tm from pandas.core.arrays import DatetimeArray import pandas.core.nanops as nanops -import pandas.util.testing as tm use_bn = nanops._USE_BOTTLENECK has_c16 = hasattr(np, "complex128") @@ -598,6 +598,14 @@ def test_nancorr_spearman(self): targ1 = spearmanr(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0] self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="spearman") + @td.skip_if_no_scipy + def test_invalid_method(self): + targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] + targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] + msg = "Unkown method 'foo', expected one of 'kendall', 'spearman'" + with pytest.raises(ValueError, match=msg): + self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="foo") + def test_nancov(self): targ0 = np.cov(self.arr_float_2d, self.arr_float1_2d)[0, 1] targ1 = np.cov(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py index cd154ed5fe570..ce527214e55e7 100644 --- a/pandas/tests/test_optional_dependency.py +++ b/pandas/tests/test_optional_dependency.py @@ -5,7 +5,7 @@ from pandas.compat._optional import VERSIONS, import_optional_dependency -import pandas.util.testing as tm +import pandas._testing as tm def test_import_optional(): diff --git a/pandas/tests/test_register_accessor.py b/pandas/tests/test_register_accessor.py index 6b40ff8b3fa1e..08a5581886522 100644 --- a/pandas/tests/test_register_accessor.py +++ b/pandas/tests/test_register_accessor.py @@ -3,7 +3,7 @@ import pytest import pandas as pd -import pandas.util.testing as tm +import pandas._testing as tm @contextlib.contextmanager diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 90cd9cc3e006d..98297474243e4 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -6,6 +6,7 @@ import pytest from pandas import DataFrame, MultiIndex, Series, array, concat, merge +import pandas._testing as tm from pandas.core.algorithms import safe_sort import pandas.core.common as com from 
pandas.core.sorting import ( @@ -15,7 +16,6 @@ lexsort_indexer, nargsort, ) -import pandas.util.testing as tm class TestSorting: diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index ae7ab6addc3fb..62d26dacde67b 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -8,8 +8,8 @@ from pandas._libs import lib from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna +import pandas._testing as tm import pandas.core.strings as strings -import pandas.util.testing as tm def assert_series_or_index_equal(left, right): @@ -3392,8 +3392,8 @@ def test_encode_decode_errors(self): encodeBase = Series(["a", "b", "a\x9d"]) msg = ( - r"'charmap' codec can't encode character '\\x9d' in position 1:" - " character maps to " + r"'charmap' codec can't encode character '\\x9d' in position 1: " + "character maps to " ) with pytest.raises(UnicodeEncodeError, match=msg): encodeBase.str.encode("cp1252") @@ -3406,8 +3406,8 @@ def test_encode_decode_errors(self): decodeBase = Series([b"a", b"b", b"a\x9d"]) msg = ( - "'charmap' codec can't decode byte 0x9d in position 1:" - " character maps to " + "'charmap' codec can't decode byte 0x9d in position 1: " + "character maps to " ) with pytest.raises(UnicodeDecodeError, match=msg): decodeBase.str.decode("cp1252") @@ -3521,7 +3521,7 @@ def test_string_array(any_string_method): if isinstance(expected, Series): if expected.dtype == "object" and lib.is_string_array( - expected.values, skipna=True + expected.dropna().values, ): assert result.dtype == "string" result = result.astype(object) @@ -3573,3 +3573,18 @@ def test_string_array_boolean_array(method, expected): result = getattr(s.str, method)() expected = Series(expected, dtype="boolean") tm.assert_series_equal(result, expected) + + +def test_string_array_extract(): + # https://github.com/pandas-dev/pandas/issues/30969 + # Only expand=False & multiple groups was failing + a = Series(["a1", "b2", "cc"], dtype="string") + b = Series(["a1", "b2", "cc"], dtype="object") + pat = r"(\w)(\d)" + + result = a.str.extract(pat, expand=False) + expected = b.str.extract(pat, expand=False) + assert all(result.dtypes == "string") + + result = result.astype(object) + tm.assert_equal(result, expected) diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index d2a9e1dc94bb5..1d2ab9358c01c 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -6,8 +6,8 @@ from pandas._libs.tslib import iNaT +import pandas._testing as tm import pandas.core.algorithms as algos -import pandas.util.testing as tm @pytest.fixture(params=[True, False]) @@ -423,16 +423,21 @@ class TestExtensionTake: def test_bounds_check_large(self): arr = np.array([1, 2]) - with pytest.raises(IndexError): + + msg = "indices are out-of-bounds" + with pytest.raises(IndexError, match=msg): algos.take(arr, [2, 3], allow_fill=True) - with pytest.raises(IndexError): + msg = "index 2 is out of bounds for( axis 0 with)? 
size 2" + with pytest.raises(IndexError, match=msg): algos.take(arr, [2, 3], allow_fill=False) def test_bounds_check_small(self): arr = np.array([1, 2, 3], dtype=np.int64) indexer = [0, -1, -2] - with pytest.raises(ValueError): + + msg = r"'indices' contains values less than allowed \(-2 < -1\)" + with pytest.raises(ValueError, match=msg): algos.take(arr, indexer, allow_fill=True) result = algos.take(arr, indexer) @@ -446,7 +451,11 @@ def test_take_empty(self, allow_fill): result = algos.take(arr, [], allow_fill=allow_fill) tm.assert_numpy_array_equal(arr, result) - with pytest.raises(IndexError): + msg = ( + r"cannot do a non-empty take from an empty axes.|" + "indices are out-of-bounds" + ) + with pytest.raises(IndexError, match=msg): algos.take(arr, [0], allow_fill=allow_fill) def test_take_na_empty(self): diff --git a/pandas/tests/tools/test_numeric.py b/pandas/tests/tools/test_numeric.py index 082277796e602..2fd39d5a7b703 100644 --- a/pandas/tests/tools/test_numeric.py +++ b/pandas/tests/tools/test_numeric.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, Series, to_numeric -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[None, "ignore", "raise", "coerce"]) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index 50844aabb2c88..c4660417599a8 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -8,8 +8,8 @@ from pandas.compat import is_platform_windows from pandas import DatetimeIndex, Index, Series, Timestamp, date_range, period_range +import pandas._testing as tm from pandas.core.tools.datetimes import to_datetime -import pandas.util.testing as tm import pandas.tseries.frequencies as frequencies import pandas.tseries.offsets as offsets @@ -468,7 +468,7 @@ def test_series_datetime_index(freq): @pytest.mark.parametrize( "offset_func", [ - frequencies.get_offset, + frequencies._get_offset, lambda freq: date_range("2011-01-01", periods=5, freq=freq), ], ) @@ -528,8 +528,8 @@ def test_legacy_offset_warnings(offset_func, freq): def test_ms_vs_capital_ms(): - left = frequencies.get_offset("ms") - right = frequencies.get_offset("MS") + left = frequencies._get_offset("ms") + right = frequencies._get_offset("MS") assert left == offsets.Milli() assert right == offsets.MonthBegin() diff --git a/pandas/tests/tseries/holiday/test_calendar.py b/pandas/tests/tseries/holiday/test_calendar.py index c122f92ed228c..5b4a7c74b1af1 100644 --- a/pandas/tests/tseries/holiday/test_calendar.py +++ b/pandas/tests/tseries/holiday/test_calendar.py @@ -3,7 +3,7 @@ import pytest from pandas import DatetimeIndex, offsets, to_datetime -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.holiday import ( AbstractHolidayCalendar, diff --git a/pandas/tests/tseries/holiday/test_holiday.py b/pandas/tests/tseries/holiday/test_holiday.py index 7748b965f8962..a2c146dbd65e8 100644 --- a/pandas/tests/tseries/holiday/test_holiday.py +++ b/pandas/tests/tseries/holiday/test_holiday.py @@ -3,7 +3,7 @@ import pytest from pytz import utc -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries.holiday import ( MO, diff --git a/pandas/tests/tseries/offsets/common.py b/pandas/tests/tseries/offsets/common.py index a097636bbf0b4..71953fd095882 100644 --- a/pandas/tests/tseries/offsets/common.py +++ b/pandas/tests/tseries/offsets/common.py @@ -18,8 +18,8 @@ def 
assert_offset_equal(offset, base, expected): ) -def assert_onOffset(offset, date, expected): - actual = offset.onOffset(date) +def assert_is_on_offset(offset, date, expected): + actual = offset.is_on_offset(date) assert actual == expected, ( f"\nExpected: {expected}\nActual: {actual}\nFor Offset: {offset})" f"\nAt Date: {date}" diff --git a/pandas/tests/tseries/offsets/test_fiscal.py b/pandas/tests/tseries/offsets/test_fiscal.py index c97e0b8493f9c..5686119593e18 100644 --- a/pandas/tests/tseries/offsets/test_fiscal.py +++ b/pandas/tests/tseries/offsets/test_fiscal.py @@ -9,11 +9,12 @@ from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG from pandas import Timestamp +import pandas._testing as tm from pandas.tseries.frequencies import get_offset from pandas.tseries.offsets import FY5253, FY5253Quarter -from .common import assert_offset_equal, assert_onOffset +from .common import assert_is_on_offset, assert_offset_equal from .test_offsets import Base, WeekDay @@ -50,9 +51,11 @@ def test_get_offset_name(): def test_get_offset(): with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset("gibberish") + with tm.assert_produces_warning(FutureWarning): + get_offset("gibberish") with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset("QS-JAN-B") + with tm.assert_produces_warning(FutureWarning): + get_offset("QS-JAN-B") pairs = [ ("RE-N-DEC-MON", makeFY5253NearestEndMonth(weekday=0, startingMonth=12)), @@ -78,7 +81,8 @@ def test_get_offset(): ] for name, expected in pairs: - offset = get_offset(name) + with tm.assert_produces_warning(FutureWarning): + offset = get_offset(name) assert offset == expected, ( f"Expected {repr(name)} to yield {repr(expected)} " f"(actual: {repr(offset)})" @@ -128,9 +132,9 @@ class TestFY5253LastOfMonth(Base): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) def test_apply(self): offset_lom_aug_sat = makeFY5253LastOfMonth(startingMonth=8, weekday=WeekDay.SAT) @@ -253,9 +257,9 @@ def test_get_year_end(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) def test_apply(self): date_seq_nem_8_sat = [ @@ -329,16 +333,16 @@ def test_apply(self): class TestFY5253LastOfMonthQuarter(Base): - def test_isAnchored(self): + def test_is_anchored(self): assert makeFY5253LastOfMonthQuarter( startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).isAnchored() + ).is_anchored() assert makeFY5253LastOfMonthQuarter( weekday=WeekDay.SAT, startingMonth=3, qtr_with_extra_week=4 - ).isAnchored() + ).is_anchored() assert not makeFY5253LastOfMonthQuarter( 2, startingMonth=1, weekday=WeekDay.SAT, qtr_with_extra_week=4 - ).isAnchored() + ).is_anchored() def test_equality(self): assert makeFY5253LastOfMonthQuarter( @@ -491,9 +495,9 @@ def test_offset(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) def test_year_has_extra_week(self): # End of long Q1 @@ -596,9 +600,9 @@ class TestFY5253NearestEndMonthQuarter(Base): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, 
case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) def test_offset(self): offset = makeFY5253NearestEndMonthQuarter( @@ -652,7 +656,7 @@ def test_fy5253_last_onoffset(): # GH#18877 dates on the year-end but not normalized to midnight offset = FY5253(n=-5, startingMonth=5, variation="last", weekday=0) ts = Timestamp("1984-05-28 06:29:43.955911354+0200", tz="Europe/San_Marino") - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -661,7 +665,7 @@ def test_fy5253_nearest_onoffset(): # GH#18877 dates on the year-end but not normalized to midnight offset = FY5253(n=3, startingMonth=7, variation="nearest", weekday=2) ts = Timestamp("2032-07-28 00:12:59.035729419+0000", tz="Africa/Dakar") - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -672,7 +676,7 @@ def test_fy5253qtr_onoffset_nearest(): offset = FY5253Quarter( n=3, qtr_with_extra_week=1, startingMonth=2, variation="nearest", weekday=0 ) - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -684,5 +688,5 @@ def test_fy5253qtr_onoffset_last(): ) ts = Timestamp("2011-01-26 19:03:40.331096129+0200", tz="Africa/Windhoek") slow = (ts + offset) - offset == ts - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) assert fast == slow diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 6f628bf86829a..2f00a58fe80be 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -22,12 +22,12 @@ from pandas.compat.numpy import np_datetime64_compat from pandas.errors import PerformanceWarning -from pandas.core.indexes.datetimes import DatetimeIndex, _to_M8, date_range +import pandas._testing as tm +from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.series import Series -import pandas.util.testing as tm from pandas.io.pickle import read_pickle -from pandas.tseries.frequencies import _offset_map, get_offset +from pandas.tseries.frequencies import _get_offset, _offset_map from pandas.tseries.holiday import USFederalHolidayCalendar import pandas.tseries.offsets as offsets from pandas.tseries.offsets import ( @@ -67,7 +67,7 @@ YearEnd, ) -from .common import assert_offset_equal, assert_onOffset +from .common import assert_is_on_offset, assert_offset_equal class WeekDay: @@ -81,17 +81,6 @@ class WeekDay: SUN = 6 -#### -# Misc function tests -#### - - -def test_to_M8(): - valb = datetime(2007, 10, 1) - valu = _to_M8(valb) - assert isinstance(valu, np.datetime64) - - ##### # DateOffset Tests ##### @@ -337,7 +326,7 @@ def test_offset_freqstr(self, offset_types): freqstr = offset.freqstr if freqstr not in ("", "", "LWOM-SAT"): - code = get_offset(freqstr) + code = _get_offset(freqstr) assert offset.rule_code == code def _check_offsetfunc_works(self, offset, funcname, dt, expected, normalize=False): @@ -557,24 +546,24 @@ def test_rollback(self, offset_types): offset_types, "rollback", dt, expected, normalize=True ) - def test_onOffset(self, offset_types): + def test_is_on_offset(self, offset_types): dt = self.expecteds[offset_types.__name__] offset_s = self._get_offset(offset_types) - assert offset_s.onOffset(dt) + assert offset_s.is_on_offset(dt) - # when normalize=True, onOffset checks time is 00:00:00 + # when 
normalize=True, is_on_offset checks time is 00:00:00 if issubclass(offset_types, Tick): # normalize=True disallowed for Tick subclasses GH#21427 return offset_n = self._get_offset(offset_types, normalize=True) - assert not offset_n.onOffset(dt) + assert not offset_n.is_on_offset(dt) if offset_types in (BusinessHour, CustomBusinessHour): # In default BusinessHour (9:00-17:00), normalized time # cannot be in business hour range return date = datetime(dt.year, dt.month, dt.day) - assert offset_n.onOffset(date) + assert offset_n.is_on_offset(date) def test_add(self, offset_types, tz_naive_fixture): tz = tz_naive_fixture @@ -666,6 +655,27 @@ def test_pickle_v0_15_2(self, datapath): # tm.assert_dict_equal(offsets, read_pickle(pickle_path)) + def test_onOffset_deprecated(self, offset_types): + # GH#30340 use idiomatic naming + off = self._get_offset(offset_types) + + ts = Timestamp.now() + with tm.assert_produces_warning(FutureWarning): + result = off.onOffset(ts) + + expected = off.is_on_offset(ts) + assert result == expected + + def test_isAnchored_deprecated(self, offset_types): + # GH#30340 use idiomatic naming + off = self._get_offset(offset_types) + + with tm.assert_produces_warning(FutureWarning): + result = off.isAnchored() + + expected = off.is_anchored() + assert result == expected + class TestDateOffset(Base): def setup_method(self, method): @@ -689,8 +699,8 @@ def test_constructor(self): assert (self.d + DateOffset(2)) == datetime(2008, 1, 4) - assert not DateOffset(2).isAnchored() - assert DateOffset(1).isAnchored() + assert not DateOffset(2).is_anchored() + assert DateOffset(1).is_anchored() d = datetime(2008, 1, 31) assert (d + DateOffset(months=1)) == datetime(2008, 2, 29) @@ -778,14 +788,14 @@ def test_roll_date_object(self): result = offset.rollforward(dt) assert result == datetime(2012, 9, 15) - def test_onOffset(self): + def test_is_on_offset(self): tests = [ (BDay(), datetime(2008, 1, 1), True), (BDay(), datetime(2008, 1, 5), False), ] for offset, d, expected in tests: - assert_onOffset(offset, d, expected) + assert_is_on_offset(offset, d, expected) apply_cases: _ApplyCases = [] apply_cases.append( @@ -1273,10 +1283,10 @@ def test_normalize(self, case): ) @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, cases = case for dt, expected in cases.items(): - assert offset.onOffset(dt) == expected + assert offset.is_on_offset(dt) == expected opening_time_cases = [] # opening time should be affected by sign of n, not by n's value and @@ -2472,7 +2482,7 @@ def test_normalize(self, norm_cases): for dt, expected in cases.items(): assert offset.apply(dt) == expected - def test_onOffset(self): + def test_is_on_offset(self): tests = [] tests.append( @@ -2491,7 +2501,7 @@ def test_onOffset(self): for offset, cases in tests: for dt, expected in cases.items(): - assert offset.onOffset(dt) == expected + assert offset.is_on_offset(dt) == expected apply_cases = [] apply_cases.append( @@ -2671,9 +2681,9 @@ def test_roll_date_object(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, d, expected = case - assert_onOffset(offset, d, expected) + assert_is_on_offset(offset, d, expected) apply_cases: _ApplyCases = [] apply_cases.append( @@ -2771,8 +2781,8 @@ def test_apply_large_n(self): def test_apply_corner(self): msg = ( - "Only know how to combine trading day with datetime, datetime64" - " or timedelta" + "Only know how to combine 
trading day " + "with datetime, datetime64 or timedelta" ) with pytest.raises(ApplyTypeError, match=msg): CDay().apply(BMonthEnd()) @@ -2918,9 +2928,9 @@ def test_roll_date_object(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, d, expected = case - assert_onOffset(offset, d, expected) + assert_is_on_offset(offset, d, expected) apply_cases: _ApplyCases = [] apply_cases.append( @@ -3067,9 +3077,9 @@ def test_roll_date_object(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) apply_cases: _ApplyCases = [] apply_cases.append( @@ -3184,11 +3194,11 @@ def test_corner(self): with pytest.raises(ValueError, match="Day must be"): Week(weekday=-1) - def test_isAnchored(self): - assert Week(weekday=0).isAnchored() - assert not Week().isAnchored() - assert not Week(2, weekday=2).isAnchored() - assert not Week(2).isAnchored() + def test_is_anchored(self): + assert Week(weekday=0).is_anchored() + assert not Week().is_anchored() + assert not Week(2, weekday=2).is_anchored() + assert not Week(2).is_anchored() offset_cases = [] # not business week @@ -3252,7 +3262,7 @@ def test_offset(self, case): assert_offset_equal(offset, base, expected) @pytest.mark.parametrize("weekday", range(7)) - def test_onOffset(self, weekday): + def test_is_on_offset(self, weekday): offset = Week(weekday=weekday) for day in range(1, 8): @@ -3262,7 +3272,7 @@ def test_onOffset(self, weekday): expected = True else: expected = False - assert_onOffset(offset, date, expected) + assert_is_on_offset(offset, date, expected) class TestWeekOfMonth(Base): @@ -3359,10 +3369,10 @@ def test_offset(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): week, weekday, dt, expected = case offset = WeekOfMonth(week=week, weekday=weekday) - assert offset.onOffset(dt) == expected + assert offset.is_on_offset(dt) == expected class TestLastWeekOfMonth(Base): @@ -3436,10 +3446,10 @@ def test_offset(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): weekday, dt, expected = case offset = LastWeekOfMonth(weekday=weekday) - assert offset.onOffset(dt) == expected + assert offset.is_on_offset(dt) == expected class TestSemiMonthEnd(Base): @@ -3646,9 +3656,9 @@ def test_apply_index(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): dt, expected = case - assert_onOffset(SemiMonthEnd(), dt, expected) + assert_is_on_offset(SemiMonthEnd(), dt, expected) @pytest.mark.parametrize("klass", [Series, DatetimeIndex]) def test_vectorized_offset_addition(self, klass): @@ -3910,9 +3920,9 @@ def test_apply_index(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): dt, expected = case - assert_onOffset(SemiMonthBegin(), dt, expected) + assert_is_on_offset(SemiMonthBegin(), dt, expected) @pytest.mark.parametrize("klass", [Series, DatetimeIndex]) def test_vectorized_offset_addition(self, klass): @@ -3995,9 +4005,9 @@ def test_get_offset_name(self): def test_get_offset(): with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset("gibberish") + _get_offset("gibberish") with 
pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset("QS-JAN-B") + _get_offset("QS-JAN-B") pairs = [ ("B", BDay()), @@ -4012,7 +4022,7 @@ def test_get_offset(): ] for name, expected in pairs: - offset = get_offset(name) + offset = _get_offset(name) assert offset == expected, ( f"Expected {repr(name)} to yield {repr(expected)} " f"(actual: {repr(offset)})" @@ -4023,7 +4033,7 @@ def test_get_offset_legacy(): pairs = [("w@Sat", Week(weekday=5))] for name, expected in pairs: with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - get_offset(name) + _get_offset(name) class TestOffsetAliases: @@ -4039,17 +4049,17 @@ def test_alias_equality(self): def test_rule_code(self): lst = ["M", "MS", "BM", "BMS", "D", "B", "H", "T", "S", "L", "U"] for k in lst: - assert k == get_offset(k).rule_code + assert k == _get_offset(k).rule_code # should be cached - this is kind of an internals test... assert k in _offset_map - assert k == (get_offset(k) * 3).rule_code + assert k == (_get_offset(k) * 3).rule_code suffix_lst = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"] base = "W" for v in suffix_lst: alias = "-".join([base, v]) - assert alias == get_offset(alias).rule_code - assert alias == (get_offset(alias) * 5).rule_code + assert alias == _get_offset(alias).rule_code + assert alias == (_get_offset(alias) * 5).rule_code suffix_lst = [ "JAN", @@ -4069,8 +4079,8 @@ def test_rule_code(self): for base in base_lst: for v in suffix_lst: alias = "-".join([base, v]) - assert alias == get_offset(alias).rule_code - assert alias == (get_offset(alias) * 5).rule_code + assert alias == _get_offset(alias).rule_code + assert alias == (_get_offset(alias) * 5).rule_code lst = ["M", "D", "B", "H", "T", "S", "L", "U"] for k in lst: @@ -4123,7 +4133,7 @@ def test_str_for_named_is_name(self): names += ["WOM-" + week + day for week in ("1", "2", "3", "4") for day in days] _offset_map.clear() for name in names: - offset = get_offset(name) + offset = _get_offset(name) assert offset.freqstr == name @@ -4348,34 +4358,34 @@ def test_tick_normalize_raises(tick_classes): def test_weeks_onoffset(): # GH#18510 Week with weekday = None, normalize = False should always - # be onOffset + # be is_on_offset offset = Week(n=2, weekday=None) ts = Timestamp("1862-01-13 09:03:34.873477378+0210", tz="Africa/Lusaka") - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow # negative n offset = Week(n=2, weekday=None) ts = Timestamp("1856-10-24 16:18:36.556360110-0717", tz="Pacific/Easter") - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow def test_weekofmonth_onoffset(): # GH#18864 - # Make sure that nanoseconds don't trip up onOffset (and with it apply) + # Make sure that nanoseconds don't trip up is_on_offset (and with it apply) offset = WeekOfMonth(n=2, week=2, weekday=0) ts = Timestamp("1916-05-15 01:14:49.583410462+0422", tz="Asia/Qyzylorda") - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow # negative n offset = WeekOfMonth(n=-3, week=1, weekday=0) ts = Timestamp("1980-12-08 03:38:52.878321185+0500", tz="Asia/Oral") - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) slow = (ts + offset) - offset == ts assert fast == slow @@ -4385,14 +4395,14 @@ def test_last_week_of_month_on_offset(): offset = LastWeekOfMonth(n=4, weekday=6) ts = Timestamp("1917-05-27 20:55:27.084284178+0200", tz="Europe/Warsaw") slow = (ts + offset) 
- offset == ts - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) assert fast == slow # negative n offset = LastWeekOfMonth(n=-4, weekday=5) ts = Timestamp("2005-08-27 05:01:42.799392561-0500", tz="America/Rainy_River") slow = (ts + offset) - offset == ts - fast = offset.onOffset(ts) + fast = offset.is_on_offset(ts) assert fast == slow diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 351f0f9ad3b5b..716d3ff3faf1c 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -90,11 +90,11 @@ @given(gen_random_datetime, gen_yqm_offset) def test_on_offset_implementations(dt, offset): assume(not offset.normalize) - # check that the class-specific implementations of onOffset match + # check that the class-specific implementations of is_on_offset match # the general case definition: # (dt + offset) - offset == dt compare = (dt + offset) - offset - assert offset.onOffset(dt) == (compare == dt) + assert offset.is_on_offset(dt) == (compare == dt) @pytest.mark.xfail( diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index 98a3631c8e63a..297e5c3178379 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -8,7 +8,7 @@ import pytest from pandas import Timedelta, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm from pandas.tseries import offsets from pandas.tseries.offsets import Hour, Micro, Milli, Minute, Nano, Second @@ -284,7 +284,7 @@ def test_tick_equalities(cls): @pytest.mark.parametrize("cls", tick_classes) def test_tick_offset(cls): - assert not cls().isAnchored() + assert not cls().is_anchored() @pytest.mark.parametrize("cls", tick_classes) diff --git a/pandas/tests/tseries/offsets/test_yqm_offsets.py b/pandas/tests/tseries/offsets/test_yqm_offsets.py index 12a524d82fcf5..79a0e0f2c25eb 100644 --- a/pandas/tests/tseries/offsets/test_yqm_offsets.py +++ b/pandas/tests/tseries/offsets/test_yqm_offsets.py @@ -23,7 +23,7 @@ YearEnd, ) -from .common import assert_offset_equal, assert_onOffset +from .common import assert_is_on_offset, assert_offset_equal from .test_offsets import Base # -------------------------------------------------------------------- @@ -85,7 +85,7 @@ def test_on_offset(offset): if not (m == 11 and d == 31) ] for date in dates: - res = offset.onOffset(date) + res = offset.is_on_offset(date) slow_version = date == (date + offset) - offset assert res == slow_version @@ -247,9 +247,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestBMonthBegin(Base): @@ -335,9 +335,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestBMonthEnd(Base): @@ -424,9 +424,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) # 
-------------------------------------------------------------------- @@ -442,10 +442,10 @@ def test_repr(self): expected = "" assert repr(QuarterBegin(startingMonth=1)) == expected - def test_isAnchored(self): - assert QuarterBegin(startingMonth=1).isAnchored() - assert QuarterBegin().isAnchored() - assert not QuarterBegin(2, startingMonth=1).isAnchored() + def test_is_anchored(self): + assert QuarterBegin(startingMonth=1).is_anchored() + assert QuarterBegin().is_anchored() + assert not QuarterBegin(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -552,10 +552,10 @@ def test_repr(self): expected = "" assert repr(QuarterEnd(startingMonth=1)) == expected - def test_isAnchored(self): - assert QuarterEnd(startingMonth=1).isAnchored() - assert QuarterEnd().isAnchored() - assert not QuarterEnd(2, startingMonth=1).isAnchored() + def test_is_anchored(self): + assert QuarterEnd(startingMonth=1).is_anchored() + assert QuarterEnd().is_anchored() + assert not QuarterEnd(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -683,9 +683,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestBQuarterBegin(Base): @@ -699,10 +699,10 @@ def test_repr(self): expected = "" assert repr(BQuarterBegin(startingMonth=1)) == expected - def test_isAnchored(self): - assert BQuarterBegin(startingMonth=1).isAnchored() - assert BQuarterBegin().isAnchored() - assert not BQuarterBegin(2, startingMonth=1).isAnchored() + def test_is_anchored(self): + assert BQuarterBegin(startingMonth=1).is_anchored() + assert BQuarterBegin().is_anchored() + assert not BQuarterBegin(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -824,10 +824,10 @@ def test_repr(self): expected = "" assert repr(BQuarterEnd(startingMonth=1)) == expected - def test_isAnchored(self): - assert BQuarterEnd(startingMonth=1).isAnchored() - assert BQuarterEnd().isAnchored() - assert not BQuarterEnd(2, startingMonth=1).isAnchored() + def test_is_anchored(self): + assert BQuarterEnd(startingMonth=1).is_anchored() + assert BQuarterEnd().is_anchored() + assert not BQuarterEnd(2, startingMonth=1).is_anchored() def test_offset_corner_case(self): # corner @@ -951,9 +951,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) # -------------------------------------------------------------------- @@ -1109,9 +1109,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestYearEnd(Base): @@ -1186,9 +1186,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestYearEndDiffMonth(Base): @@ -1258,9 +1258,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def 
test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestBYearBegin(Base): @@ -1404,9 +1404,9 @@ def test_offset(self, case): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) class TestBYearEndLagged(Base): @@ -1459,6 +1459,6 @@ def test_roll(self): ] @pytest.mark.parametrize("case", on_offset_cases) - def test_onOffset(self, case): + def test_is_on_offset(self, case): offset, dt, expected = case - assert_onOffset(offset, dt, expected) + assert_is_on_offset(offset, dt, expected) diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 5cf2165993cd7..a40fcd725d604 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -9,7 +9,7 @@ from pandas.compat.numpy import np_array_datetime64_compat from pandas import Timestamp -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index 6c30e2b6c7a1c..96c2d6bbd8106 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -8,7 +8,7 @@ from pandas._libs.tslibs import conversion, timezones, tzconversion from pandas import Timestamp, date_range -import pandas.util.testing as tm +import pandas._testing as tm def _compare_utc_to_local(tz_didx): @@ -72,6 +72,15 @@ def test_length_zero_copy(dtype, copy): assert result.base is (None if copy else arr) +def test_ensure_datetime64ns_bigendian(): + # GH#29684 + arr = np.array([np.datetime64(1, "ms")], dtype=">M8[ms]") + result = conversion.ensure_datetime64ns(arr) + + expected = np.array([np.datetime64(1, "ms")], dtype="M8[ns]") + tm.assert_numpy_array_equal(result, expected) + + class SubDatetime(datetime): pass diff --git a/pandas/tests/tslibs/test_fields.py b/pandas/tests/tslibs/test_fields.py index cd729956a027c..943f4207df543 100644 --- a/pandas/tests/tslibs/test_fields.py +++ b/pandas/tests/tslibs/test_fields.py @@ -2,7 +2,7 @@ from pandas._libs.tslibs import fields -import pandas.util.testing as tm +import pandas._testing as tm def test_fields_readonly(): diff --git a/pandas/tests/tslibs/test_parse_iso8601.py b/pandas/tests/tslibs/test_parse_iso8601.py index a6e7aee46b485..a58f227c20c7f 100644 --- a/pandas/tests/tslibs/test_parse_iso8601.py +++ b/pandas/tests/tslibs/test_parse_iso8601.py @@ -59,9 +59,7 @@ def test_parsers_iso8601_invalid(date_str): def test_parsers_iso8601_invalid_offset_invalid(): date_str = "2001-01-01 12-34-56" - msg = "Timezone hours offset out of range " 'in datetime string "{s}"'.format( - s=date_str - ) + msg = f'Timezone hours offset out of range in datetime string "{date_str}"' with pytest.raises(ValueError, match=msg): tslib._test_parse_iso8601(date_str) diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py index 0bc30347b3fa9..c452d5b12ce01 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -11,14 +11,13 @@ from pandas._libs.tslibs.parsing import parse_time_string import pandas.util._test_decorators as td -import pandas.util.testing as tm +import pandas._testing as tm def test_parse_time_string(): - (date, parsed, reso) = 
parse_time_string("4Q1984") - (date_lower, parsed_lower, reso_lower) = parse_time_string("4q1984") + (parsed, reso) = parse_time_string("4Q1984") + (parsed_lower, reso_lower) = parse_time_string("4q1984") - assert date == date_lower assert reso == reso_lower assert parsed == parsed_lower @@ -34,10 +33,9 @@ def test_parse_time_string_invalid_type(): ) def test_parse_time_quarter_with_dash(dashed, normal): # see gh-9688 - (date_dash, parsed_dash, reso_dash) = parse_time_string(dashed) - (date, parsed, reso) = parse_time_string(normal) + (parsed_dash, reso_dash) = parse_time_string(dashed) + (parsed, reso) = parse_time_string(normal) - assert date_dash == date assert parsed_dash == parsed assert reso_dash == reso @@ -106,7 +104,7 @@ def test_parsers_quarterly_with_freq_error(date_str, kwargs, msg): ], ) def test_parsers_quarterly_with_freq(date_str, freq, expected): - result, _, _ = parsing.parse_time_string(date_str, freq=freq) + result, _ = parsing.parse_time_string(date_str, freq=freq) assert result == expected @@ -131,7 +129,7 @@ def test_parsers_quarter_invalid(date_str): [("201101", datetime(2011, 1, 1, 0, 0)), ("200005", datetime(2000, 5, 1, 0, 0))], ) def test_parsers_month_freq(date_str, expected): - result, _, _ = parsing.parse_time_string(date_str, freq="M") + result, _ = parsing.parse_time_string(date_str, freq="M") assert result == expected @@ -223,5 +221,5 @@ def test_parse_time_string_check_instance_type_raise_exception(): parse_time_string((1, 2, 3)) result = parse_time_string("2019") - expected = (datetime(2019, 1, 1), datetime(2019, 1, 1), "year") + expected = (datetime(2019, 1, 1), "year") assert result == expected diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index f430e2893ca33..b8048891e4876 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -2,7 +2,7 @@ import pytest from pandas import DataFrame, Index, Series, Timestamp -import pandas.util.testing as tm +import pandas._testing as tm def _assert_almost_equal_both(a, b, **kwargs): @@ -39,9 +39,7 @@ def _assert_not_almost_equal(a, b, **kwargs): """ try: tm.assert_almost_equal(a, b, **kwargs) - msg = ( - "{a} and {b} were approximately equal when they shouldn't have been" - ).format(a=a, b=b) + msg = f"{a} and {b} were approximately equal when they shouldn't have been" pytest.fail(msg=msg) except AssertionError: pass @@ -248,13 +246,12 @@ def test_assert_almost_equal_value_mismatch(): [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")], ) def test_assert_almost_equal_class_mismatch(a, b, klass1, klass2): - msg = """numpy array are different + + msg = f"""numpy array are different numpy array classes are different \\[left\\]: {klass1} -\\[right\\]: {klass2}""".format( - klass1=klass1, klass2=klass2 - ) +\\[right\\]: {klass2}""" with pytest.raises(AssertionError, match=msg): tm.assert_almost_equal(a, b) diff --git a/pandas/tests/util/test_assert_categorical_equal.py b/pandas/tests/util/test_assert_categorical_equal.py index 44400498ddc64..8957e7a172666 100644 --- a/pandas/tests/util/test_assert_categorical_equal.py +++ b/pandas/tests/util/test_assert_categorical_equal.py @@ -1,7 +1,7 @@ import pytest from pandas import Categorical -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( @@ -77,13 +77,11 @@ def test_categorical_equal_ordered_mismatch(): @pytest.mark.parametrize("obj", ["index", "foo", "pandas"]) def 
test_categorical_equal_object_override(obj): data = [1, 2, 3, 4] - msg = """{obj} are different + msg = f"""{obj} are different Attribute "ordered" are different \\[left\\]: False -\\[right\\]: True""".format( - obj=obj - ) +\\[right\\]: True""" c1 = Categorical(data, ordered=False) c2 = Categorical(data, ordered=True) diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py index cecf9273004d7..0547323b882f6 100644 --- a/pandas/tests/util/test_assert_extension_array_equal.py +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -1,8 +1,8 @@ import numpy as np import pytest +import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray -import pandas.util.testing as tm @pytest.mark.parametrize( @@ -96,7 +96,7 @@ def test_assert_extension_array_equal_non_extension_array(side): numpy_array = np.arange(5) extension_array = SparseArray(numpy_array) - msg = "{side} is not an ExtensionArray".format(side=side) + msg = f"{side} is not an ExtensionArray" args = ( (numpy_array, extension_array) if side == "left" diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index b46a8460a28b2..23c845f2b2795 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -1,7 +1,7 @@ import pytest from pandas import DataFrame -import pandas.util.testing as tm +import pandas._testing as tm @pytest.fixture(params=[True, False]) @@ -80,7 +80,7 @@ def test_frame_equal_row_order_mismatch(check_like, obj_fixture): df2 = DataFrame({"A": [3, 2, 1], "B": [6, 5, 4]}, index=["c", "b", "a"]) if not check_like: # Do not ignore row-column orderings. - msg = "{obj}.index are different".format(obj=obj_fixture) + msg = f"{obj_fixture}.index are different" with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(df1, df2, check_like=check_like, obj=obj_fixture) else: @@ -95,7 +95,7 @@ def test_frame_equal_row_order_mismatch(check_like, obj_fixture): ], ) def test_frame_equal_shape_mismatch(df1, df2, obj_fixture): - msg = "{obj} are different".format(obj=obj_fixture) + msg = f"{obj_fixture} are different" with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(df1, df2, obj=obj_fixture) @@ -149,13 +149,11 @@ def test_empty_dtypes(check_dtype): def test_frame_equal_index_mismatch(obj_fixture): - msg = """{obj}\\.index are different + msg = f"""{obj_fixture}\\.index are different -{obj}\\.index values are different \\(33\\.33333 %\\) +{obj_fixture}\\.index values are different \\(33\\.33333 %\\) \\[left\\]: Index\\(\\['a', 'b', 'c'\\], dtype='object'\\) -\\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='object'\\)""".format( - obj=obj_fixture - ) +\\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='object'\\)""" df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]) df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "d"]) @@ -165,13 +163,11 @@ def test_frame_equal_index_mismatch(obj_fixture): def test_frame_equal_columns_mismatch(obj_fixture): - msg = """{obj}\\.columns are different + msg = f"""{obj_fixture}\\.columns are different -{obj}\\.columns values are different \\(50\\.0 %\\) +{obj_fixture}\\.columns values are different \\(50\\.0 %\\) \\[left\\]: Index\\(\\['A', 'B'\\], dtype='object'\\) -\\[right\\]: Index\\(\\['A', 'b'\\], dtype='object'\\)""".format( - obj=obj_fixture - ) +\\[right\\]: Index\\(\\['A', 'b'\\], dtype='object'\\)""" df1 = DataFrame({"A": [1, 2, 3], "B": [4, 
5, 6]}, index=["a", "b", "c"]) df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"]) @@ -181,13 +177,12 @@ def test_frame_equal_columns_mismatch(obj_fixture): def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): - msg = """{obj}\\.iloc\\[:, 1\\] are different + obj = obj_fixture + msg = f"""{obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) are different -{obj}\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) +{obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) values are different \\(33\\.33333 %\\) \\[left\\]: \\[4, 5, 6\\] -\\[right\\]: \\[4, 5, 7\\]""".format( - obj=obj_fixture - ) +\\[right\\]: \\[4, 5, 7\\]""" df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 7]}) @@ -202,18 +197,18 @@ def test_frame_equal_block_mismatch(by_blocks_fixture, obj_fixture): ( DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "e̊"]}), - """{obj}\\.iloc\\[:, 1\\] are different + """{obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) are different -{obj}\\.iloc\\[:, 1\\] values are different \\(33\\.33333 %\\) +{obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) values are different \\(33\\.33333 %\\) \\[left\\]: \\[é, è, ë\\] \\[right\\]: \\[é, è, e̊\\]""", ), ( DataFrame({"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]}), DataFrame({"A": ["a", "a", "a"], "E": ["e", "e", "e"]}), - """{obj}\\.iloc\\[:, 0\\] are different + """{obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) are different -{obj}\\.iloc\\[:, 0\\] values are different \\(100\\.0 %\\) +{obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) values are different \\(100\\.0 %\\) \\[left\\]: \\[á, à, ä\\] \\[right\\]: \\[a, a, a\\]""", ), diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index 8c3f242f0c96b..bbbeebcec2569 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -2,7 +2,7 @@ import pytest from pandas import Categorical, Index, MultiIndex, NaT -import pandas.util.testing as tm +import pandas._testing as tm def test_index_equal_levels_mismatch(): @@ -135,11 +135,6 @@ def test_index_equal_level_values_mismatch(check_exact, check_less_precise): [(None, "x"), ("x", "x"), (np.nan, np.nan), (NaT, NaT), (np.nan, NaT)], ) def test_index_equal_names(name1, name2): - msg = """Index are different - -Attribute "names" are different -\\[left\\]: \\[{name1}\\] -\\[right\\]: \\[{name2}\\]""" idx1 = Index([1, 2, 3], name=name1) idx2 = Index([1, 2, 3], name=name2) @@ -149,7 +144,11 @@ def test_index_equal_names(name1, name2): else: name1 = "'x'" if name1 == "x" else name1 name2 = "'x'" if name2 == "x" else name2 - msg = msg.format(name1=name1, name2=name2) + msg = f"""Index are different + +Attribute "names" are different +\\[left\\]: \\[{name1}\\] +\\[right\\]: \\[{name2}\\]""" with pytest.raises(AssertionError, match=msg): tm.assert_index_equal(idx1, idx2) diff --git a/pandas/tests/util/test_assert_interval_array_equal.py b/pandas/tests/util/test_assert_interval_array_equal.py index b264b484a04ab..96f2973a1528c 100644 --- a/pandas/tests/util/test_assert_interval_array_equal.py +++ b/pandas/tests/util/test_assert_interval_array_equal.py @@ -1,7 +1,7 @@ import pytest from pandas import interval_range -import pandas.util.testing as tm +import pandas._testing as tm @pytest.mark.parametrize( diff --git a/pandas/tests/util/test_assert_numpy_array_equal.py b/pandas/tests/util/test_assert_numpy_array_equal.py index 53bcedf3a16f1..c8ae9ebdd8651 100644 
--- a/pandas/tests/util/test_assert_numpy_array_equal.py +++ b/pandas/tests/util/test_assert_numpy_array_equal.py @@ -2,7 +2,7 @@ import pytest from pandas import Timestamp -import pandas.util.testing as tm +import pandas._testing as tm def test_assert_numpy_array_equal_shape_mismatch(): @@ -28,13 +28,11 @@ def test_assert_numpy_array_equal_bad_type(): [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")], ) def test_assert_numpy_array_equal_class_mismatch(a, b, klass1, klass2): - msg = """numpy array are different + msg = f"""numpy array are different numpy array classes are different \\[left\\]: {klass1} -\\[right\\]: {klass2}""".format( - klass1=klass1, klass2=klass2 - ) +\\[right\\]: {klass2}""" with pytest.raises(AssertionError, match=msg): tm.assert_numpy_array_equal(a, b) diff --git a/pandas/tests/util/test_assert_produces_warning.py b/pandas/tests/util/test_assert_produces_warning.py index c681817896903..87765c909938d 100644 --- a/pandas/tests/util/test_assert_produces_warning.py +++ b/pandas/tests/util/test_assert_produces_warning.py @@ -2,7 +2,7 @@ import pytest -import pandas.util.testing as tm +import pandas._testing as tm def f(): diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 0a6047c4662ba..eaf0824f52927 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -1,7 +1,7 @@ import pytest from pandas import Categorical, DataFrame, Series -import pandas.util.testing as tm +import pandas._testing as tm def _assert_series_equal_both(a, b, **kwargs): diff --git a/pandas/tests/util/test_deprecate.py b/pandas/tests/util/test_deprecate.py index 8fbc8037ed7c5..ee4f7e3f34f2e 100644 --- a/pandas/tests/util/test_deprecate.py +++ b/pandas/tests/util/test_deprecate.py @@ -4,7 +4,7 @@ from pandas.util._decorators import deprecate -import pandas.util.testing as tm +import pandas._testing as tm def new_func(): diff --git a/pandas/tests/util/test_deprecate_kwarg.py b/pandas/tests/util/test_deprecate_kwarg.py index c17c48197ccf7..b165e9fba0e4f 100644 --- a/pandas/tests/util/test_deprecate_kwarg.py +++ b/pandas/tests/util/test_deprecate_kwarg.py @@ -2,7 +2,7 @@ from pandas.util._decorators import deprecate_kwarg -import pandas.util.testing as tm +import pandas._testing as tm @deprecate_kwarg("old", "new") diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index ebbdbd6c29842..c856585f20138 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -5,9 +5,9 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series +import pandas._testing as tm from pandas.core.util.hashing import _hash_scalar, hash_tuple, hash_tuples from pandas.util import hash_array, hash_pandas_object -import pandas.util.testing as tm @pytest.fixture( @@ -353,3 +353,31 @@ def test_hash_collisions(): result = hash_array(np.asarray(hashes, dtype=object), "utf8") tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0)) + + +def test_hash_with_tuple(): + # GH#28969 array containing a tuple raises on call to arr.astype(str) + # apparently a numpy bug github.com/numpy/numpy/issues/9441 + + df = pd.DataFrame({"data": [tuple("1"), tuple("2")]}) + result = hash_pandas_object(df) + expected = pd.Series([10345501319357378243, 8331063931016360761], dtype=np.uint64) + tm.assert_series_equal(result, expected) + + df2 = pd.DataFrame({"data": [tuple([1]), tuple([2])]}) + result = 
hash_pandas_object(df2) + expected = pd.Series([9408946347443669104, 3278256261030523334], dtype=np.uint64) + tm.assert_series_equal(result, expected) + + # require that the elements of such tuples are themselves hashable + + df3 = pd.DataFrame({"data": [tuple([1, []]), tuple([2, {}])]}) + with pytest.raises(TypeError, match="unhashable type: 'list'"): + hash_pandas_object(df3) + + +def test_hash_object_none_key(): + # https://github.com/pandas-dev/pandas/issues/30887 + result = pd.util.hash_pandas_object(pd.Series(["a", "b"]), hash_key=None) + expected = pd.Series([4578374827886788867, 17338122309987883691], dtype="uint64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 60124c8e943ad..6a19adef728e4 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -4,7 +4,7 @@ import pandas.compat as compat -import pandas.util.testing as tm +import pandas._testing as tm def test_rands(): diff --git a/pandas/tests/util/test_validate_args.py b/pandas/tests/util/test_validate_args.py index 1f1365d62c64e..746d859b3322e 100644 --- a/pandas/tests/util/test_validate_args.py +++ b/pandas/tests/util/test_validate_args.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import pytest from pandas.util._validators import validate_args @@ -22,10 +20,8 @@ def test_bad_arg_length_max_value_single(): max_length = len(compat_args) + min_fname_arg_count actual_length = len(args) + min_fname_arg_count msg = ( - r"{fname}\(\) takes at most {max_length} " - r"argument \({actual_length} given\)".format( - fname=_fname, max_length=max_length, actual_length=actual_length - ) + fr"{_fname}\(\) takes at most {max_length} " + fr"argument \({actual_length} given\)" ) with pytest.raises(TypeError, match=msg): @@ -40,10 +36,8 @@ def test_bad_arg_length_max_value_multiple(): max_length = len(compat_args) + min_fname_arg_count actual_length = len(args) + min_fname_arg_count msg = ( - r"{fname}\(\) takes at most {max_length} " - r"arguments \({actual_length} given\)".format( - fname=_fname, max_length=max_length, actual_length=actual_length - ) + fr"{_fname}\(\) takes at most {max_length} " + fr"arguments \({actual_length} given\)" ) with pytest.raises(TypeError, match=msg): @@ -54,15 +48,11 @@ def test_bad_arg_length_max_value_multiple(): def test_not_all_defaults(i): bad_arg = "foo" msg = ( - "the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + f"the '{bad_arg}' parameter is not supported " + fr"in the pandas implementation of {_fname}\(\)" ) - compat_args = OrderedDict() - compat_args["foo"] = 2 - compat_args["bar"] = -1 - compat_args["baz"] = 3 - + compat_args = {"foo": 2, "bar": -1, "baz": 3} arg_vals = (1, -1, 3) with pytest.raises(ValueError, match=msg): @@ -73,8 +63,5 @@ def test_validation(): # No exceptions should be raised. 
validate_args(_fname, (None,), 2, dict(out=None)) - compat_args = OrderedDict() - compat_args["axis"] = 1 - compat_args["out"] = None - + compat_args = {"axis": 1, "out": None} validate_args(_fname, (1, None), 2, compat_args) diff --git a/pandas/tests/util/test_validate_args_and_kwargs.py b/pandas/tests/util/test_validate_args_and_kwargs.py index 396056466bb81..941ba86c61319 100644 --- a/pandas/tests/util/test_validate_args_and_kwargs.py +++ b/pandas/tests/util/test_validate_args_and_kwargs.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import pytest from pandas.util._validators import validate_args_and_kwargs @@ -17,10 +15,8 @@ def test_invalid_total_length_max_length_one(): actual_length = len(kwargs) + len(args) + min_fname_arg_count msg = ( - r"{fname}\(\) takes at most {max_length} " - r"argument \({actual_length} given\)".format( - fname=_fname, max_length=max_length, actual_length=actual_length - ) + fr"{_fname}\(\) takes at most {max_length} " + fr"argument \({actual_length} given\)" ) with pytest.raises(TypeError, match=msg): @@ -37,10 +33,8 @@ def test_invalid_total_length_max_length_multiple(): actual_length = len(kwargs) + len(args) + min_fname_arg_count msg = ( - r"{fname}\(\) takes at most {max_length} " - r"arguments \({actual_length} given\)".format( - fname=_fname, max_length=max_length, actual_length=actual_length - ) + fr"{_fname}\(\) takes at most {max_length} " + fr"arguments \({actual_length} given\)" ) with pytest.raises(TypeError, match=msg): @@ -52,13 +46,11 @@ def test_missing_args_or_kwargs(args, kwargs): bad_arg = "bar" min_fname_arg_count = 2 - compat_args = OrderedDict() - compat_args["foo"] = -5 - compat_args[bad_arg] = 1 + compat_args = {"foo": -5, bad_arg: 1} msg = ( - r"the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + fr"the '{bad_arg}' parameter is not supported " + fr"in the pandas implementation of {_fname}\(\)" ) with pytest.raises(ValueError, match=msg): @@ -68,17 +60,11 @@ def test_missing_args_or_kwargs(args, kwargs): def test_duplicate_argument(): min_fname_arg_count = 2 - compat_args = OrderedDict() - compat_args["foo"] = None - compat_args["bar"] = None - compat_args["baz"] = None - + compat_args = {"foo": None, "bar": None, "baz": None} kwargs = {"foo": None, "bar": None} args = (None,) # duplicate value for "foo" - msg = r"{fname}\(\) got multiple values for keyword " r"argument '{arg}'".format( - fname=_fname, arg="foo" - ) + msg = fr"{_fname}\(\) got multiple values for keyword argument 'foo'" with pytest.raises(TypeError, match=msg): validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args) @@ -86,10 +72,7 @@ def test_duplicate_argument(): def test_validation(): # No exceptions should be raised. 
- compat_args = OrderedDict() - compat_args["foo"] = 1 - compat_args["bar"] = None - compat_args["baz"] = -2 + compat_args = {"foo": 1, "bar": None, "baz": -2} kwargs = {"baz": -2} args = (1, None) diff --git a/pandas/tests/util/test_validate_kwargs.py b/pandas/tests/util/test_validate_kwargs.py index b6241def4e5d6..8fe2a3712bf49 100644 --- a/pandas/tests/util/test_validate_kwargs.py +++ b/pandas/tests/util/test_validate_kwargs.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import pytest from pandas.util._validators import validate_bool_kwarg, validate_kwargs @@ -11,14 +9,10 @@ def test_bad_kwarg(): good_arg = "f" bad_arg = good_arg + "o" - compat_args = OrderedDict() - compat_args[good_arg] = "foo" - compat_args[bad_arg + "o"] = "bar" + compat_args = {good_arg: "foo", bad_arg + "o": "bar"} kwargs = {good_arg: "foo", bad_arg: "bar"} - msg = r"{fname}\(\) got an unexpected " r"keyword argument '{arg}'".format( - fname=_fname, arg=bad_arg - ) + msg = fr"{_fname}\(\) got an unexpected keyword argument '{bad_arg}'" with pytest.raises(TypeError, match=msg): validate_kwargs(_fname, kwargs, compat_args) @@ -28,14 +22,11 @@ def test_bad_kwarg(): def test_not_all_none(i): bad_arg = "foo" msg = ( - r"the '{arg}' parameter is not supported " - r"in the pandas implementation of {func}\(\)".format(arg=bad_arg, func=_fname) + fr"the '{bad_arg}' parameter is not supported " + fr"in the pandas implementation of {_fname}\(\)" ) - compat_args = OrderedDict() - compat_args["foo"] = 1 - compat_args["bar"] = "s" - compat_args["baz"] = None + compat_args = {"foo": 1, "bar": "s", "baz": None} kwarg_keys = ("foo", "bar", "baz") kwarg_vals = (2, "s", None) @@ -48,10 +39,7 @@ def test_not_all_none(i): def test_validation(): # No exceptions should be raised. - compat_args = OrderedDict() - compat_args["f"] = None - compat_args["b"] = 1 - compat_args["ba"] = "s" + compat_args = {"f": None, "b": 1, "ba": "s"} kwargs = dict(f=None, b=1) validate_kwargs(_fname, kwargs, compat_args) @@ -61,8 +49,8 @@ def test_validation(): @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0]) def test_validate_bool_kwarg_fail(name, value): msg = ( - f'For argument "{name}" expected type bool,' - f" received type {type(value).__name__}" + f'For argument "{name}" expected type bool, ' + f"received type {type(value).__name__}" ) with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/window/common.py b/pandas/tests/window/common.py index 1dfc0f34b2b8d..6aeada3152dbb 100644 --- a/pandas/tests/window/common.py +++ b/pandas/tests/window/common.py @@ -3,7 +3,8 @@ import numpy as np from numpy.random import randn -from pandas import DataFrame, Series, bdate_range +from pandas import DataFrame, Series, bdate_range, notna +import pandas._testing as tm N, K = 100, 10 @@ -21,3 +22,365 @@ def _create_data(self): self.rng = bdate_range(datetime(2009, 1, 1), periods=N) self.series = Series(arr.copy(), index=self.rng) self.frame = DataFrame(randn(N, K), index=self.rng, columns=np.arange(K)) + + +# create the data only once as we are not setting it +def _create_consistency_data(): + def create_series(): + return [ + Series(dtype=object), + Series([np.nan]), + Series([np.nan, np.nan]), + Series([3.0]), + Series([np.nan, 3.0]), + Series([3.0, np.nan]), + Series([1.0, 3.0]), + Series([2.0, 2.0]), + Series([3.0, 1.0]), + Series( + [5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 5.0, np.nan, np.nan] + ), + Series( + [ + np.nan, + 5.0, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + 
Series( + [ + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + np.nan, + 5.0, + 5.0, + np.nan, + np.nan, + ] + ), + Series( + [ + np.nan, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + np.nan, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 3.0, + np.nan, + 3.0, + 4.0, + 5.0, + 6.0, + np.nan, + np.nan, + 7.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series( + [ + 2.0, + 5.0, + np.nan, + 2.0, + 4.0, + 0.0, + 9.0, + np.nan, + np.nan, + 3.0, + 12.0, + 13.0, + 14.0, + 15.0, + ] + ), + Series(range(10)), + Series(range(20, 0, -2)), + ] + + def create_dataframes(): + return [ + DataFrame(), + DataFrame(columns=["a"]), + DataFrame(columns=["a", "a"]), + DataFrame(columns=["a", "b"]), + DataFrame(np.arange(10).reshape((5, 2))), + DataFrame(np.arange(25).reshape((5, 5))), + DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]), + ] + [DataFrame(s) for s in create_series()] + + def is_constant(x): + values = x.values.ravel() + return len(set(values[notna(values)])) == 1 + + def no_nans(x): + return x.notna().all().all() + + # data is a tuple(object, is_constant, no_nans) + data = create_series() + create_dataframes() + + return [(x, is_constant(x), no_nans(x)) for x in data] + + +_consistency_data = _create_consistency_data() + + +class ConsistencyBase(Base): + base_functions = [ + (lambda v: Series(v).count(), None, "count"), + (lambda v: Series(v).max(), None, "max"), + (lambda v: Series(v).min(), None, "min"), + (lambda v: Series(v).sum(), None, "sum"), + (lambda v: Series(v).mean(), None, "mean"), + (lambda v: Series(v).std(), 1, "std"), + (lambda v: Series(v).cov(Series(v)), None, "cov"), + (lambda v: Series(v).corr(Series(v)), None, "corr"), + (lambda v: Series(v).var(), 1, "var"), + # restore once GH 8086 is fixed + # lambda v: Series(v).skew(), 3, 'skew'), + # (lambda v: Series(v).kurt(), 4, 'kurt'), + # restore once GH 8084 is fixed + # lambda v: Series(v).quantile(0.3), None, 'quantile'), + (lambda v: Series(v).median(), None, "median"), + (np.nanmax, 1, "max"), + (np.nanmin, 1, "min"), + (np.nansum, 1, "sum"), + (np.nanmean, 1, "mean"), + (lambda v: np.nanstd(v, ddof=1), 1, "std"), + (lambda v: np.nanvar(v, ddof=1), 1, "var"), + (np.nanmedian, 1, "median"), + ] + no_nan_functions = [ + (np.max, None, "max"), + (np.min, None, "min"), + (np.sum, None, "sum"), + (np.mean, None, "mean"), + (lambda v: np.std(v, ddof=1), 1, "std"), + (lambda v: np.var(v, ddof=1), 1, "var"), + (np.median, None, "median"), + ] + + def _create_data(self): + super()._create_data() + self.data = _consistency_data + + def _test_moments_consistency_mock_mean(self, mean, mock_mean): + for (x, is_constant, no_nans) in self.data: + mean_x = mean(x) + # check that correlation of a series with itself is either 1 or NaN + + if mock_mean: + # check that mean equals mock_mean + expected = mock_mean(x) + tm.assert_equal(mean_x, expected.astype("float64")) + + def _test_moments_consistency_is_constant(self, min_periods, count, mean, corr): + for (x, is_constant, no_nans) in self.data: + count_x = count(x) + mean_x = mean(x) + # check that correlation of a series with itself is either 1 or NaN + corr_x_x = corr(x, x) + + if is_constant: + exp = x.max() if isinstance(x, Series) else x.max().max() + + # check mean of constant series + expected = x * np.nan + expected[count_x >= max(min_periods, 1)] = exp + tm.assert_equal(mean_x, 
expected) + + # check correlation of constant series with itself is NaN + expected[:] = np.nan + tm.assert_equal(corr_x_x, expected) + + def _test_moments_consistency_var_debiasing_factors( + self, var_biased=None, var_unbiased=None, var_debiasing_factors=None + ): + for (x, is_constant, no_nans) in self.data: + if var_unbiased and var_biased and var_debiasing_factors: + # check variance debiasing factors + var_unbiased_x = var_unbiased(x) + var_biased_x = var_biased(x) + var_debiasing_factors_x = var_debiasing_factors(x) + tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) + + def _test_moments_consistency( + self, + min_periods, + count, + mean, + corr, + var_unbiased=None, + std_unbiased=None, + cov_unbiased=None, + var_biased=None, + std_biased=None, + cov_biased=None, + ): + + for (x, is_constant, no_nans) in self.data: + count_x = count(x) + mean_x = mean(x) + + for (std, var, cov) in [ + (std_biased, var_biased, cov_biased), + (std_unbiased, var_unbiased, cov_unbiased), + ]: + + # check that var(x), std(x), and cov(x) are all >= 0 + var_x = var(x) + std_x = std(x) + assert not (var_x < 0).any().any() + assert not (std_x < 0).any().any() + if cov: + cov_x_x = cov(x, x) + assert not (cov_x_x < 0).any().any() + + # check that var(x) == cov(x, x) + tm.assert_equal(var_x, cov_x_x) + + # check that var(x) == std(x)^2 + tm.assert_equal(var_x, std_x * std_x) + + if var is var_biased: + # check that biased var(x) == mean(x^2) - mean(x)^2 + mean_x2 = mean(x * x) + tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) + + if is_constant: + # check that variance of constant series is identically 0 + assert not (var_x > 0).any().any() + expected = x * np.nan + expected[count_x >= max(min_periods, 1)] = 0.0 + if var is var_unbiased: + expected[count_x < 2] = np.nan + tm.assert_equal(var_x, expected) + + if isinstance(x, Series): + for (y, is_constant, no_nans) in self.data: + if not x.isna().equals(y.isna()): + # can only easily test two Series with similar + # structure + continue + + # check that cor(x, y) is symmetric + corr_x_y = corr(x, y) + corr_y_x = corr(y, x) + tm.assert_equal(corr_x_y, corr_y_x) + + if cov: + # check that cov(x, y) is symmetric + cov_x_y = cov(x, y) + cov_y_x = cov(y, x) + tm.assert_equal(cov_x_y, cov_y_x) + + # check that cov(x, y) == (var(x+y) - var(x) - + # var(y)) / 2 + var_x_plus_y = var(x + y) + var_y = var(y) + tm.assert_equal( + cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y) + ) + + # check that corr(x, y) == cov(x, y) / (std(x) * + # std(y)) + std_y = std(y) + tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) + + if cov is cov_biased: + # check that biased cov(x, y) == mean(x*y) - + # mean(x)*mean(y) + mean_y = mean(y) + mean_x_times_y = mean(x * y) + tm.assert_equal( + cov_x_y, mean_x_times_y - (mean_x * mean_y) + ) + + def _check_pairwise_moment(self, dispatch, name, **kwargs): + def get_result(obj, obj2=None): + return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) + + result = get_result(self.frame) + result = result.loc[(slice(None), 1), 5] + result.index = result.index.droplevel(1) + expected = get_result(self.frame[1], self.frame[5]) + tm.assert_series_equal(result, expected, check_names=False) + + +def ew_func(A, B, com, name, **kwargs): + return getattr(A.ewm(com, **kwargs), name)(B) + + +def check_binary_ew(name, A, B): + + result = ew_func(A=A, B=B, com=20, name=name, min_periods=5) + assert np.isnan(result.values[:14]).all() + assert not np.isnan(result.values[14:]).any() + + +def check_binary_ew_min_periods(name, 
min_periods, A, B): + # GH 7898 + result = ew_func(A, B, 20, name=name, min_periods=min_periods) + # binary functions (ewmcov, ewmcorr) with bias=False require at + # least two values + assert np.isnan(result.values[:11]).all() + assert not np.isnan(result.values[11:]).any() + + # check series of length 0 + empty = Series([], dtype=np.float64) + result = ew_func(empty, empty, 50, name=name, min_periods=min_periods) + tm.assert_series_equal(result, empty) + + # check series of length 1 + result = ew_func( + Series([1.0]), Series([1.0]), 50, name=name, min_periods=min_periods + ) + tm.assert_series_equal(result, Series([np.NaN])) diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 7ea4be25ca2a6..fb46ca51ace58 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -1,5 +1,7 @@ import pytest +import pandas.util._test_decorators as td + @pytest.fixture(params=[True, False]) def raw(request): @@ -47,3 +49,41 @@ def center(request): @pytest.fixture(params=[None, 1]) def min_periods(request): return request.param + + +@pytest.fixture(params=[True, False]) +def parallel(request): + """parallel keyword argument for numba.jit""" + return request.param + + +@pytest.fixture(params=[True, False]) +def nogil(request): + """nogil keyword argument for numba.jit""" + return request.param + + +@pytest.fixture(params=[True, False]) +def nopython(request): + """nopython keyword argument for numba.jit""" + return request.param + + +@pytest.fixture( + params=[pytest.param("numba", marks=td.skip_if_no("numba", "0.46.0")), "cython"] +) +def engine(request): + """engine keyword argument for rolling.apply""" + return request.param + + +@pytest.fixture( + params=[ + pytest.param(("numba", True), marks=td.skip_if_no("numba", "0.46.0")), + ("cython", True), + ("cython", False), + ] +) +def engine_and_raw(request): + """engine and raw keyword arguments for rolling.apply""" + return request.param diff --git a/pandas/tests/window/moments/conftest.py b/pandas/tests/window/moments/conftest.py new file mode 100644 index 0000000000000..2002f4d0bff43 --- /dev/null +++ b/pandas/tests/window/moments/conftest.py @@ -0,0 +1,20 @@ +import numpy as np +from numpy.random import randn +import pytest + +from pandas import Series + + +@pytest.fixture +def binary_ew_data(): + A = Series(randn(50), index=np.arange(50)) + B = A[2:] + randn(48) + + A[:10] = np.NaN + B[-10:] = np.NaN + return A, B + + +@pytest.fixture(params=[0, 1, 2]) +def min_periods(request): + return request.param diff --git a/pandas/tests/window/moments/test_moments_ewm.py b/pandas/tests/window/moments/test_moments_ewm.py new file mode 100644 index 0000000000000..599761259e041 --- /dev/null +++ b/pandas/tests/window/moments/test_moments_ewm.py @@ -0,0 +1,439 @@ +import numpy as np +from numpy.random import randn +import pytest + +import pandas as pd +from pandas import DataFrame, Series, concat +import pandas._testing as tm +from pandas.tests.window.common import ( + Base, + ConsistencyBase, + check_binary_ew, + check_binary_ew_min_periods, + ew_func, +) + + +@pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning") +class TestMoments(Base): + def setup_method(self, method): + self._create_data() + + def test_ewma(self): + self._check_ew(name="mean") + + vals = pd.Series(np.zeros(1000)) + vals[5] = 1 + result = vals.ewm(span=100, adjust=False).mean().sum() + assert np.abs(result - 1) < 1e-2 + + @pytest.mark.parametrize("adjust", [True, False]) + @pytest.mark.parametrize("ignore_na", [True, 
False]) + def test_ewma_cases(self, adjust, ignore_na): + # try adjust/ignore_na args matrix + + s = Series([1.0, 2.0, 4.0, 8.0]) + + if adjust: + expected = Series([1.0, 1.6, 2.736842, 4.923077]) + else: + expected = Series([1.0, 1.333333, 2.222222, 4.148148]) + + result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean() + tm.assert_series_equal(result, expected) + + def test_ewma_nan_handling(self): + s = Series([1.0] + [np.nan] * 5 + [1.0]) + result = s.ewm(com=5).mean() + tm.assert_series_equal(result, Series([1.0] * len(s))) + + s = Series([np.nan] * 2 + [1.0] + [np.nan] * 2 + [1.0]) + result = s.ewm(com=5).mean() + tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4)) + + # GH 7603 + s0 = Series([np.nan, 1.0, 101.0]) + s1 = Series([1.0, np.nan, 101.0]) + s2 = Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]) + s3 = Series([1.0, np.nan, 101.0, 50.0]) + com = 2.0 + alpha = 1.0 / (1.0 + com) + + def simple_wma(s, w): + return (s.multiply(w).cumsum() / w.cumsum()).fillna(method="ffill") + + for (s, adjust, ignore_na, w) in [ + (s0, True, False, [np.nan, (1.0 - alpha), 1.0]), + (s0, True, True, [np.nan, (1.0 - alpha), 1.0]), + (s0, False, False, [np.nan, (1.0 - alpha), alpha]), + (s0, False, True, [np.nan, (1.0 - alpha), alpha]), + (s1, True, False, [(1.0 - alpha) ** 2, np.nan, 1.0]), + (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), + (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), + (s1, False, True, [(1.0 - alpha), np.nan, alpha]), + ( + s2, + True, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan], + ), + (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), + ( + s2, + False, + False, + [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, alpha, np.nan], + ), + (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]), + (s3, True, False, [(1.0 - alpha) ** 3, np.nan, (1.0 - alpha), 1.0]), + (s3, True, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha), 1.0]), + ( + s3, + False, + False, + [ + (1.0 - alpha) ** 3, + np.nan, + (1.0 - alpha) * alpha, + alpha * ((1.0 - alpha) ** 2 + alpha), + ], + ), + ( + s3, + False, + True, + [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha], + ), + ]: + expected = simple_wma(s, Series(w)) + result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() + + tm.assert_series_equal(result, expected) + if ignore_na is False: + # check that ignore_na defaults to False + result = s.ewm(com=com, adjust=adjust).mean() + tm.assert_series_equal(result, expected) + + def test_ewmvar(self): + self._check_ew(name="var") + + def test_ewmvol(self): + self._check_ew(name="vol") + + def test_ewma_span_com_args(self): + A = self.series.ewm(com=9.5).mean() + B = self.series.ewm(span=20).mean() + tm.assert_almost_equal(A, B) + + with pytest.raises(ValueError): + self.series.ewm(com=9.5, span=20) + with pytest.raises(ValueError): + self.series.ewm().mean() + + def test_ewma_halflife_arg(self): + A = self.series.ewm(com=13.932726172912965).mean() + B = self.series.ewm(halflife=10.0).mean() + tm.assert_almost_equal(A, B) + + with pytest.raises(ValueError): + self.series.ewm(span=20, halflife=50) + with pytest.raises(ValueError): + self.series.ewm(com=9.5, halflife=50) + with pytest.raises(ValueError): + self.series.ewm(com=9.5, span=20, halflife=50) + with pytest.raises(ValueError): + self.series.ewm() + + def test_ewm_alpha(self): + # GH 10789 + s = Series(self.arr) + a = s.ewm(alpha=0.61722699889169674).mean() + b = s.ewm(com=0.62014947789973052).mean() + c = 
s.ewm(span=2.240298955799461).mean() + d = s.ewm(halflife=0.721792864318).mean() + tm.assert_series_equal(a, b) + tm.assert_series_equal(a, c) + tm.assert_series_equal(a, d) + + def test_ewm_alpha_arg(self): + # GH 10789 + s = self.series + with pytest.raises(ValueError): + s.ewm() + with pytest.raises(ValueError): + s.ewm(com=10.0, alpha=0.5) + with pytest.raises(ValueError): + s.ewm(span=10.0, alpha=0.5) + with pytest.raises(ValueError): + s.ewm(halflife=10.0, alpha=0.5) + + def test_ewm_domain_checks(self): + # GH 12492 + s = Series(self.arr) + msg = "comass must satisfy: comass >= 0" + with pytest.raises(ValueError, match=msg): + s.ewm(com=-0.1) + s.ewm(com=0.0) + s.ewm(com=0.1) + + msg = "span must satisfy: span >= 1" + with pytest.raises(ValueError, match=msg): + s.ewm(span=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(span=0.0) + with pytest.raises(ValueError, match=msg): + s.ewm(span=0.9) + s.ewm(span=1.0) + s.ewm(span=1.1) + + msg = "halflife must satisfy: halflife > 0" + with pytest.raises(ValueError, match=msg): + s.ewm(halflife=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(halflife=0.0) + s.ewm(halflife=0.1) + + msg = "alpha must satisfy: 0 < alpha <= 1" + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=-0.1) + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=0.0) + s.ewm(alpha=0.1) + s.ewm(alpha=1.0) + with pytest.raises(ValueError, match=msg): + s.ewm(alpha=1.1) + + @pytest.mark.parametrize("method", ["mean", "vol", "var"]) + def test_ew_empty_series(self, method): + vals = pd.Series([], dtype=np.float64) + + ewm = vals.ewm(3) + result = getattr(ewm, method)() + tm.assert_almost_equal(result, vals) + + def _check_ew(self, name=None, preserve_nan=False): + series_result = getattr(self.series.ewm(com=10), name)() + assert isinstance(series_result, Series) + + frame_result = getattr(self.frame.ewm(com=10), name)() + assert type(frame_result) == DataFrame + + result = getattr(self.series.ewm(com=10), name)() + if preserve_nan: + assert result[self._nan_locs].isna().all() + + @pytest.mark.parametrize("min_periods", [0, 1]) + @pytest.mark.parametrize("name", ["mean", "var", "vol"]) + def test_ew_min_periods(self, min_periods, name): + # excluding NaNs correctly + arr = randn(50) + arr[:10] = np.NaN + arr[-10:] = np.NaN + s = Series(arr) + + # check min_periods + # GH 7898 + result = getattr(s.ewm(com=50, min_periods=2), name)() + assert result[:11].isna().all() + assert not result[11:].isna().any() + + result = getattr(s.ewm(com=50, min_periods=min_periods), name)() + if name == "mean": + assert result[:10].isna().all() + assert not result[10:].isna().any() + else: + # ewm.std, ewm.vol, ewm.var (with bias=False) require at least + # two values + assert result[:11].isna().all() + assert not result[11:].isna().any() + + # check series of length 0 + result = getattr( + Series(dtype=object).ewm(com=50, min_periods=min_periods), name + )() + tm.assert_series_equal(result, Series(dtype="float64")) + + # check series of length 1 + result = getattr(Series([1.0]).ewm(50, min_periods=min_periods), name)() + if name == "mean": + tm.assert_series_equal(result, Series([1.0])) + else: + # ewm.std, ewm.vol, ewm.var with bias=False require at least + # two values + tm.assert_series_equal(result, Series([np.NaN])) + + # pass in ints + result2 = getattr(Series(np.arange(50)).ewm(span=10), name)() + assert result2.dtype == np.float_ + + +class TestEwmMomentsConsistency(ConsistencyBase): + def setup_method(self, method): + self._create_data() + + def 
test_ewmcov_pairwise(self): + self._check_pairwise_moment("ewm", "cov", span=10, min_periods=5) + + @pytest.mark.parametrize("name", ["cov", "corr"]) + def test_ewm_corr_cov(self, name, min_periods, binary_ew_data): + A, B = binary_ew_data + + check_binary_ew(name="corr", A=A, B=B) + check_binary_ew_min_periods("corr", min_periods, A, B) + + def test_ewmcorr_pairwise(self): + self._check_pairwise_moment("ewm", "corr", span=10, min_periods=5) + + @pytest.mark.parametrize("name", ["cov", "corr"]) + def test_different_input_array_raise_exception(self, name, binary_ew_data): + + A, _ = binary_ew_data + msg = "Input arrays must be of the same type!" + # exception raised is Exception + with pytest.raises(Exception, match=msg): + ew_func(A, randn(50), 20, name=name, min_periods=5) + + @pytest.mark.slow + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) + @pytest.mark.parametrize("adjust", [True, False]) + @pytest.mark.parametrize("ignore_na", [True, False]) + def test_ewm_consistency(self, min_periods, adjust, ignore_na): + def _weights(s, com, adjust, ignore_na): + if isinstance(s, DataFrame): + if not len(s.columns): + return DataFrame(index=s.index, columns=s.columns) + w = concat( + [ + _weights( + s.iloc[:, i], com=com, adjust=adjust, ignore_na=ignore_na + ) + for i, _ in enumerate(s.columns) + ], + axis=1, + ) + w.index = s.index + w.columns = s.columns + return w + + w = Series(np.nan, index=s.index) + alpha = 1.0 / (1.0 + com) + if ignore_na: + w[s.notna()] = _weights( + s[s.notna()], com=com, adjust=adjust, ignore_na=False + ) + elif adjust: + for i in range(len(s)): + if s.iat[i] == s.iat[i]: + w.iat[i] = pow(1.0 / (1.0 - alpha), i) + else: + sum_wts = 0.0 + prev_i = -1 + for i in range(len(s)): + if s.iat[i] == s.iat[i]: + if prev_i == -1: + w.iat[i] = 1.0 + else: + w.iat[i] = alpha * sum_wts / pow(1.0 - alpha, i - prev_i) + sum_wts += w.iat[i] + prev_i = i + return w + + def _variance_debiasing_factors(s, com, adjust, ignore_na): + weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) + cum_sum = weights.cumsum().fillna(method="ffill") + cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill") + numerator = cum_sum * cum_sum + denominator = numerator - cum_sum_sq + denominator[denominator <= 0.0] = np.nan + return numerator / denominator + + def _ewma(s, com, min_periods, adjust, ignore_na): + weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) + result = ( + s.multiply(weights) + .cumsum() + .divide(weights.cumsum()) + .fillna(method="ffill") + ) + result[ + s.expanding().count() < (max(min_periods, 1) if min_periods else 1) + ] = np.nan + return result + + com = 3.0 + self._test_moments_consistency_mock_mean( + mean=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean(), + mock_mean=lambda x: _ewma( + x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ), + ) + + self._test_moments_consistency_is_constant( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean(), + corr=lambda x, y: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).corr(y), + ) + + self._test_moments_consistency_var_debiasing_factors( + var_unbiased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=False) + ), + var_biased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, 
ignore_na=ignore_na + ).var(bias=True) + ), + var_debiasing_factors=lambda x: ( + _variance_debiasing_factors( + x, com=com, adjust=adjust, ignore_na=ignore_na + ) + ), + ) + # test consistency between different ewm* moments + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).mean(), + corr=lambda x, y: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).corr(y), + var_unbiased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=False) + ), + std_unbiased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=False) + ), + cov_unbiased=lambda x, y: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=False) + ), + var_biased=lambda x: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).var(bias=True) + ), + std_biased=lambda x: x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).std(bias=True), + cov_biased=lambda x, y: ( + x.ewm( + com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na + ).cov(y, bias=True) + ), + ) diff --git a/pandas/tests/window/moments/test_moments_expanding.py b/pandas/tests/window/moments/test_moments_expanding.py new file mode 100644 index 0000000000000..322082187f531 --- /dev/null +++ b/pandas/tests/window/moments/test_moments_expanding.py @@ -0,0 +1,420 @@ +import warnings + +import numpy as np +from numpy.random import randn +import pytest + +from pandas import DataFrame, Index, MultiIndex, Series, isna, notna +import pandas._testing as tm +from pandas.tests.window.common import ConsistencyBase + + +class TestExpandingMomentsConsistency(ConsistencyBase): + def setup_method(self, method): + self._create_data() + + def test_expanding_apply_args_kwargs(self, engine_and_raw): + def mean_w_arg(x, const): + return np.mean(x) + const + + engine, raw = engine_and_raw + + df = DataFrame(np.random.rand(20, 3)) + + expected = df.expanding().apply(np.mean, engine=engine, raw=raw) + 20.0 + + result = df.expanding().apply(mean_w_arg, engine=engine, raw=raw, args=(20,)) + tm.assert_frame_equal(result, expected) + + result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) + tm.assert_frame_equal(result, expected) + + def test_expanding_corr(self): + A = self.series.dropna() + B = (A + randn(len(A)))[:-5] + + result = A.expanding().corr(B) + + rolling_result = A.rolling(window=len(A), min_periods=1).corr(B) + + tm.assert_almost_equal(rolling_result, result) + + def test_expanding_count(self): + result = self.series.expanding().count() + tm.assert_almost_equal( + result, self.series.rolling(window=len(self.series)).count() + ) + + def test_expanding_quantile(self): + result = self.series.expanding().quantile(0.5) + + rolling_result = self.series.rolling( + window=len(self.series), min_periods=1 + ).quantile(0.5) + + tm.assert_almost_equal(result, rolling_result) + + def test_expanding_cov(self): + A = self.series + B = (A + randn(len(A)))[:-5] + + result = A.expanding().cov(B) + + rolling_result = A.rolling(window=len(A), min_periods=1).cov(B) + + tm.assert_almost_equal(rolling_result, result) + + def test_expanding_cov_pairwise(self): + result = self.frame.expanding().corr() + + rolling_result = self.frame.rolling( + window=len(self.frame), min_periods=1 + ).corr() 
+ + tm.assert_frame_equal(result, rolling_result) + + def test_expanding_corr_pairwise(self): + result = self.frame.expanding().corr() + + rolling_result = self.frame.rolling( + window=len(self.frame), min_periods=1 + ).corr() + tm.assert_frame_equal(result, rolling_result) + + def test_expanding_cov_diff_index(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = s1.expanding().cov(s2) + expected = Series([None, None, 2.0]) + tm.assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = s1.expanding().cov(s2a) + tm.assert_series_equal(result, expected) + + s1 = Series([7, 8, 10], index=[0, 1, 3]) + s2 = Series([7, 9, 10], index=[0, 2, 3]) + result = s1.expanding().cov(s2) + expected = Series([None, None, None, 4.5]) + tm.assert_series_equal(result, expected) + + def test_expanding_corr_diff_index(self): + # GH 7512 + s1 = Series([1, 2, 3], index=[0, 1, 2]) + s2 = Series([1, 3], index=[0, 2]) + result = s1.expanding().corr(s2) + expected = Series([None, None, 1.0]) + tm.assert_series_equal(result, expected) + + s2a = Series([1, None, 3], index=[0, 1, 2]) + result = s1.expanding().corr(s2a) + tm.assert_series_equal(result, expected) + + s1 = Series([7, 8, 10], index=[0, 1, 3]) + s2 = Series([7, 9, 10], index=[0, 2, 3]) + result = s1.expanding().corr(s2) + expected = Series([None, None, None, 1.0]) + tm.assert_series_equal(result, expected) + + def test_expanding_cov_pairwise_diff_length(self): + # GH 7512 + df1 = DataFrame([[1, 5], [3, 2], [3, 9]], columns=Index(["A", "B"], name="foo")) + df1a = DataFrame( + [[1, 5], [3, 9]], index=[0, 2], columns=Index(["A", "B"], name="foo") + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], columns=Index(["X", "Y"], name="foo") + ) + df2a = DataFrame( + [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") + ) + # TODO: xref gh-15826 + # .loc is not preserving the names + result1 = df1.expanding().cov(df2, pairwise=True).loc[2] + result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] + result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] + expected = DataFrame( + [[-3.0, -6.0], [-5.0, -10.0]], + columns=Index(["A", "B"], name="foo"), + index=Index(["X", "Y"], name="foo"), + ) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, expected) + + def test_expanding_corr_pairwise_diff_length(self): + # GH 7512 + df1 = DataFrame( + [[1, 2], [3, 2], [3, 4]], + columns=["A", "B"], + index=Index(range(3), name="bar"), + ) + df1a = DataFrame( + [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"] + ) + df2 = DataFrame( + [[5, 6], [None, None], [2, 1]], + columns=["X", "Y"], + index=Index(range(3), name="bar"), + ) + df2a = DataFrame( + [[5, 6], [2, 1]], index=Index([0, 2], name="bar"), columns=["X", "Y"] + ) + result1 = df1.expanding().corr(df2, pairwise=True).loc[2] + result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] + result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] + result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] + expected = DataFrame( + [[-1.0, -1.0], [-1.0, -1.0]], columns=["A", "B"], index=Index(["X", "Y"]) + ) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + tm.assert_frame_equal(result3, expected) + tm.assert_frame_equal(result4, expected) + + 
@pytest.mark.parametrize("has_min_periods", [True, False]) + @pytest.mark.parametrize( + "func,static_comp", + [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)], + ids=["sum", "mean", "max", "min"], + ) + def test_expanding_func(self, func, static_comp, has_min_periods): + def expanding_func(x, min_periods=1, center=False, axis=0): + exp = x.expanding(min_periods=min_periods, center=center, axis=axis) + return getattr(exp, func)() + + self._check_expanding(expanding_func, static_comp, preserve_nan=False) + self._check_expanding_has_min_periods( + expanding_func, static_comp, has_min_periods + ) + + @pytest.mark.parametrize("has_min_periods", [True, False]) + def test_expanding_apply(self, engine_and_raw, has_min_periods): + + engine, raw = engine_and_raw + + def expanding_mean(x, min_periods=1): + + exp = x.expanding(min_periods=min_periods) + result = exp.apply(lambda x: x.mean(), raw=raw, engine=engine) + return result + + # TODO(jreback), needed to add preserve_nan=False + # here to make this pass + self._check_expanding(expanding_mean, np.mean, preserve_nan=False) + self._check_expanding_has_min_periods(expanding_mean, np.mean, has_min_periods) + + def test_expanding_apply_empty_series(self, engine_and_raw): + engine, raw = engine_and_raw + ser = Series([], dtype=np.float64) + tm.assert_series_equal( + ser, ser.expanding().apply(lambda x: x.mean(), raw=raw, engine=engine) + ) + + def test_expanding_apply_min_periods_0(self, engine_and_raw): + # GH 8080 + engine, raw = engine_and_raw + s = Series([None, None, None]) + result = s.expanding(min_periods=0).apply( + lambda x: len(x), raw=raw, engine=engine + ) + expected = Series([1.0, 2.0, 3.0]) + tm.assert_series_equal(result, expected) + + def _check_expanding(self, func, static_comp, preserve_nan=True): + + series_result = func(self.series) + assert isinstance(series_result, Series) + frame_result = func(self.frame) + assert isinstance(frame_result, DataFrame) + + result = func(self.series) + tm.assert_almost_equal(result[10], static_comp(self.series[:11])) + + if preserve_nan: + assert result.iloc[self._nan_locs].isna().all() + + def _check_expanding_has_min_periods(self, func, static_comp, has_min_periods): + ser = Series(randn(50)) + + if has_min_periods: + result = func(ser, min_periods=30) + assert result[:29].isna().all() + tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) + + # min_periods is working correctly + result = func(ser, min_periods=15) + assert isna(result.iloc[13]) + assert notna(result.iloc[14]) + + ser2 = Series(randn(20)) + result = func(ser2, min_periods=5) + assert isna(result[3]) + assert notna(result[4]) + + # min_periods=0 + result0 = func(ser, min_periods=0) + result1 = func(ser, min_periods=1) + tm.assert_almost_equal(result0, result1) + else: + result = func(ser) + tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) + + @pytest.mark.parametrize( + "f", + [ + lambda x: x.expanding().count(), + lambda x: x.expanding(min_periods=5).cov(x, pairwise=False), + lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), + lambda x: x.expanding(min_periods=5).max(), + lambda x: x.expanding(min_periods=5).min(), + lambda x: x.expanding(min_periods=5).sum(), + lambda x: x.expanding(min_periods=5).mean(), + lambda x: x.expanding(min_periods=5).std(), + lambda x: x.expanding(min_periods=5).var(), + lambda x: x.expanding(min_periods=5).skew(), + lambda x: x.expanding(min_periods=5).kurt(), + lambda x: x.expanding(min_periods=5).quantile(0.5), + lambda x: 
x.expanding(min_periods=5).median(), + lambda x: x.expanding(min_periods=5).apply(sum, raw=False), + lambda x: x.expanding(min_periods=5).apply(sum, raw=True), + ], + ) + def test_moment_functions_zero_length(self, f): + # GH 8056 + s = Series(dtype=np.float64) + s_expected = s + df1 = DataFrame() + df1_expected = df1 + df2 = DataFrame(columns=["a"]) + df2["a"] = df2["a"].astype("float64") + df2_expected = df2 + + s_result = f(s) + tm.assert_series_equal(s_result, s_expected) + + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + + @pytest.mark.parametrize( + "f", + [ + lambda x: (x.expanding(min_periods=5).cov(x, pairwise=True)), + lambda x: (x.expanding(min_periods=5).corr(x, pairwise=True)), + ], + ) + def test_moment_functions_zero_length_pairwise(self, f): + + df1 = DataFrame() + df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) + df2["a"] = df2["a"].astype("float64") + + df1_expected = DataFrame( + index=MultiIndex.from_product([df1.index, df1.columns]), columns=Index([]) + ) + df2_expected = DataFrame( + index=MultiIndex.from_product( + [df2.index, df2.columns], names=["bar", "foo"] + ), + columns=Index(["a"], name="foo"), + dtype="float64", + ) + + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + + @pytest.mark.slow + @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) + def test_expanding_consistency(self, min_periods): + + # suppress warnings about empty slices, as we are deliberately testing + # with empty/0-length Series/DataFrames + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + category=RuntimeWarning, + ) + + # test consistency between different expanding_* moments + self._test_moments_consistency_mock_mean( + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + mock_mean=lambda x: x.expanding(min_periods=min_periods).sum() + / x.expanding().count(), + ) + + self._test_moments_consistency_is_constant( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), + ) + + self._test_moments_consistency_var_debiasing_factors( + var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), + var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), + var_debiasing_factors=lambda x: ( + x.expanding().count() + / (x.expanding().count() - 1.0).replace(0.0, np.nan) + ), + ) + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: x.expanding().count(), + mean=lambda x: x.expanding(min_periods=min_periods).mean(), + corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), + var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), + std_unbiased=lambda x: x.expanding(min_periods=min_periods).std(), + cov_unbiased=lambda x, y: x.expanding(min_periods=min_periods).cov(y), + var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), + std_biased=lambda x: x.expanding(min_periods=min_periods).std(ddof=0), + cov_biased=lambda x, y: x.expanding(min_periods=min_periods).cov( + y, ddof=0 + ), + ) + + # test consistency between expanding_xyz() and either (a) + # expanding_apply of Series.xyz(), or (b) expanding_apply of + # np.nanxyz() + for (x, is_constant, no_nans) in self.data: + functions = 
self.base_functions + + # GH 8269 + if no_nans: + functions = self.base_functions + self.no_nan_functions + for (f, require_min_periods, name) in functions: + expanding_f = getattr(x.expanding(min_periods=min_periods), name) + + if ( + require_min_periods + and (min_periods is not None) + and (min_periods < require_min_periods) + ): + continue + + if name == "count": + expanding_f_result = expanding_f() + expanding_apply_f_result = x.expanding(min_periods=0).apply( + func=f, raw=True + ) + else: + if name in ["cov", "corr"]: + expanding_f_result = expanding_f(pairwise=False) + else: + expanding_f_result = expanding_f() + expanding_apply_f_result = x.expanding( + min_periods=min_periods + ).apply(func=f, raw=True) + + # GH 9422 + if name in ["sum", "prod"]: + tm.assert_equal(expanding_f_result, expanding_apply_f_result) diff --git a/pandas/tests/window/test_moments.py b/pandas/tests/window/moments/test_moments_rolling.py similarity index 54% rename from pandas/tests/window/test_moments.py rename to pandas/tests/window/moments/test_moments_rolling.py index 2c65c9e2ac82c..9acb4ffcb40b8 100644 --- a/pandas/tests/window/test_moments.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -9,10 +9,10 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Index, Series, concat, isna, notna +from pandas import DataFrame, Index, Series, isna, notna +import pandas._testing as tm from pandas.core.window.common import _flex_binary_moment -from pandas.tests.window.common import Base -import pandas.util.testing as tm +from pandas.tests.window.common import Base, ConsistencyBase import pandas.tseries.offsets as offsets @@ -674,57 +674,6 @@ def f(x): self._check_moment_func(np.mean, name="apply", func=f, raw=raw) - expected = Series([], dtype="float64") - result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw) - tm.assert_series_equal(result, expected) - - # gh-8080 - s = Series([None, None, None]) - result = s.rolling(2, min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1.0, 2.0, 2.0]) - tm.assert_series_equal(result, expected) - - result = s.rolling(2, min_periods=0).apply(len, raw=raw) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize("bad_raw", [None, 1, 0]) - def test_rolling_apply_invalid_raw(self, bad_raw): - with pytest.raises(ValueError, match="raw parameter must be `True` or `False`"): - Series(range(3)).rolling(1).apply(len, raw=bad_raw) - - def test_rolling_apply_out_of_bounds(self, raw): - # gh-1850 - vals = pd.Series([1, 2, 3, 4]) - - result = vals.rolling(10).apply(np.sum, raw=raw) - assert result.isna().all() - - result = vals.rolling(10, min_periods=1).apply(np.sum, raw=raw) - expected = pd.Series([1, 3, 6, 10], dtype=float) - tm.assert_almost_equal(result, expected) - - @pytest.mark.parametrize("window", [2, "2s"]) - def test_rolling_apply_with_pandas_objects(self, window): - # 5071 - df = pd.DataFrame( - {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, - index=pd.date_range("20130101", periods=5, freq="s"), - ) - - # we have an equal spaced timeseries index - # so simulate removing the first period - def f(x): - if x.index[0] == df.index[0]: - return np.nan - return x.iloc[-1] - - result = df.rolling(window).apply(f, raw=False) - expected = df.iloc[2:].reindex_like(df) - tm.assert_frame_equal(result, expected) - - with pytest.raises(AttributeError): - df.rolling(window).apply(f, raw=True) - def test_rolling_std(self, raw): self._check_moment_func(lambda x: np.std(x, 
ddof=1), name="std", raw=raw) self._check_moment_func( @@ -966,400 +915,6 @@ def get_result(obj, window, min_periods=None, center=False): tm.assert_series_equal(series_xp, series_rs) tm.assert_frame_equal(frame_xp, frame_rs) - def test_ewma(self): - self._check_ew(name="mean") - - vals = pd.Series(np.zeros(1000)) - vals[5] = 1 - result = vals.ewm(span=100, adjust=False).mean().sum() - assert np.abs(result - 1) < 1e-2 - - @pytest.mark.parametrize("adjust", [True, False]) - @pytest.mark.parametrize("ignore_na", [True, False]) - def test_ewma_cases(self, adjust, ignore_na): - # try adjust/ignore_na args matrix - - s = Series([1.0, 2.0, 4.0, 8.0]) - - if adjust: - expected = Series([1.0, 1.6, 2.736842, 4.923077]) - else: - expected = Series([1.0, 1.333333, 2.222222, 4.148148]) - - result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean() - tm.assert_series_equal(result, expected) - - def test_ewma_nan_handling(self): - s = Series([1.0] + [np.nan] * 5 + [1.0]) - result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([1.0] * len(s))) - - s = Series([np.nan] * 2 + [1.0] + [np.nan] * 2 + [1.0]) - result = s.ewm(com=5).mean() - tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4)) - - # GH 7603 - s0 = Series([np.nan, 1.0, 101.0]) - s1 = Series([1.0, np.nan, 101.0]) - s2 = Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]) - s3 = Series([1.0, np.nan, 101.0, 50.0]) - com = 2.0 - alpha = 1.0 / (1.0 + com) - - def simple_wma(s, w): - return (s.multiply(w).cumsum() / w.cumsum()).fillna(method="ffill") - - for (s, adjust, ignore_na, w) in [ - (s0, True, False, [np.nan, (1.0 - alpha), 1.0]), - (s0, True, True, [np.nan, (1.0 - alpha), 1.0]), - (s0, False, False, [np.nan, (1.0 - alpha), alpha]), - (s0, False, True, [np.nan, (1.0 - alpha), alpha]), - (s1, True, False, [(1.0 - alpha) ** 2, np.nan, 1.0]), - (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), - (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), - (s1, False, True, [(1.0 - alpha), np.nan, alpha]), - ( - s2, - True, - False, - [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan], - ), - (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), - ( - s2, - False, - False, - [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, alpha, np.nan], - ), - (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]), - (s3, True, False, [(1.0 - alpha) ** 3, np.nan, (1.0 - alpha), 1.0]), - (s3, True, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha), 1.0]), - ( - s3, - False, - False, - [ - (1.0 - alpha) ** 3, - np.nan, - (1.0 - alpha) * alpha, - alpha * ((1.0 - alpha) ** 2 + alpha), - ], - ), - ( - s3, - False, - True, - [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha], - ), - ]: - expected = simple_wma(s, Series(w)) - result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() - - tm.assert_series_equal(result, expected) - if ignore_na is False: - # check that ignore_na defaults to False - result = s.ewm(com=com, adjust=adjust).mean() - tm.assert_series_equal(result, expected) - - def test_ewmvar(self): - self._check_ew(name="var") - - def test_ewmvol(self): - self._check_ew(name="vol") - - def test_ewma_span_com_args(self): - A = self.series.ewm(com=9.5).mean() - B = self.series.ewm(span=20).mean() - tm.assert_almost_equal(A, B) - - with pytest.raises(ValueError): - self.series.ewm(com=9.5, span=20) - with pytest.raises(ValueError): - self.series.ewm().mean() - - def test_ewma_halflife_arg(self): - A = self.series.ewm(com=13.932726172912965).mean() - B = 
self.series.ewm(halflife=10.0).mean() - tm.assert_almost_equal(A, B) - - with pytest.raises(ValueError): - self.series.ewm(span=20, halflife=50) - with pytest.raises(ValueError): - self.series.ewm(com=9.5, halflife=50) - with pytest.raises(ValueError): - self.series.ewm(com=9.5, span=20, halflife=50) - with pytest.raises(ValueError): - self.series.ewm() - - def test_ewm_alpha(self): - # GH 10789 - s = Series(self.arr) - a = s.ewm(alpha=0.61722699889169674).mean() - b = s.ewm(com=0.62014947789973052).mean() - c = s.ewm(span=2.240298955799461).mean() - d = s.ewm(halflife=0.721792864318).mean() - tm.assert_series_equal(a, b) - tm.assert_series_equal(a, c) - tm.assert_series_equal(a, d) - - def test_ewm_alpha_arg(self): - # GH 10789 - s = self.series - with pytest.raises(ValueError): - s.ewm() - with pytest.raises(ValueError): - s.ewm(com=10.0, alpha=0.5) - with pytest.raises(ValueError): - s.ewm(span=10.0, alpha=0.5) - with pytest.raises(ValueError): - s.ewm(halflife=10.0, alpha=0.5) - - def test_ewm_domain_checks(self): - # GH 12492 - s = Series(self.arr) - msg = "comass must satisfy: comass >= 0" - with pytest.raises(ValueError, match=msg): - s.ewm(com=-0.1) - s.ewm(com=0.0) - s.ewm(com=0.1) - - msg = "span must satisfy: span >= 1" - with pytest.raises(ValueError, match=msg): - s.ewm(span=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(span=0.0) - with pytest.raises(ValueError, match=msg): - s.ewm(span=0.9) - s.ewm(span=1.0) - s.ewm(span=1.1) - - msg = "halflife must satisfy: halflife > 0" - with pytest.raises(ValueError, match=msg): - s.ewm(halflife=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(halflife=0.0) - s.ewm(halflife=0.1) - - msg = "alpha must satisfy: 0 < alpha <= 1" - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=-0.1) - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=0.0) - s.ewm(alpha=0.1) - s.ewm(alpha=1.0) - with pytest.raises(ValueError, match=msg): - s.ewm(alpha=1.1) - - @pytest.mark.parametrize("method", ["mean", "vol", "var"]) - def test_ew_empty_series(self, method): - vals = pd.Series([], dtype=np.float64) - - ewm = vals.ewm(3) - result = getattr(ewm, method)() - tm.assert_almost_equal(result, vals) - - def _check_ew(self, name=None, preserve_nan=False): - series_result = getattr(self.series.ewm(com=10), name)() - assert isinstance(series_result, Series) - - frame_result = getattr(self.frame.ewm(com=10), name)() - assert type(frame_result) == DataFrame - - result = getattr(self.series.ewm(com=10), name)() - if preserve_nan: - assert result[self._nan_locs].isna().all() - - # excluding NaNs correctly - arr = randn(50) - arr[:10] = np.NaN - arr[-10:] = np.NaN - s = Series(arr) - - # check min_periods - # GH 7898 - result = getattr(s.ewm(com=50, min_periods=2), name)() - assert result[:11].isna().all() - assert not result[11:].isna().any() - - for min_periods in (0, 1): - result = getattr(s.ewm(com=50, min_periods=min_periods), name)() - if name == "mean": - assert result[:10].isna().all() - assert not result[10:].isna().any() - else: - # ewm.std, ewm.vol, ewm.var (with bias=False) require at least - # two values - assert result[:11].isna().all() - assert not result[11:].isna().any() - - # check series of length 0 - result = getattr( - Series(dtype=object).ewm(com=50, min_periods=min_periods), name - )() - tm.assert_series_equal(result, Series(dtype="float64")) - - # check series of length 1 - result = getattr(Series([1.0]).ewm(50, min_periods=min_periods), name)() - if name == "mean": - tm.assert_series_equal(result, 
Series([1.0])) - else: - # ewm.std, ewm.vol, ewm.var with bias=False require at least - # two values - tm.assert_series_equal(result, Series([np.NaN])) - - # pass in ints - result2 = getattr(Series(np.arange(50)).ewm(span=10), name)() - assert result2.dtype == np.float_ - - -# create the data only once as we are not setting it -def _create_consistency_data(): - def create_series(): - return [ - Series(dtype=object), - Series([np.nan]), - Series([np.nan, np.nan]), - Series([3.0]), - Series([np.nan, 3.0]), - Series([3.0, np.nan]), - Series([1.0, 3.0]), - Series([2.0, 2.0]), - Series([3.0, 1.0]), - Series( - [5.0, 5.0, 5.0, 5.0, np.nan, np.nan, np.nan, 5.0, 5.0, np.nan, np.nan] - ), - Series( - [ - np.nan, - 5.0, - 5.0, - 5.0, - np.nan, - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - ] - ), - Series( - [ - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - np.nan, - 5.0, - 5.0, - np.nan, - np.nan, - ] - ), - Series( - [ - np.nan, - 3.0, - np.nan, - 3.0, - 4.0, - 5.0, - 6.0, - np.nan, - np.nan, - 7.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - np.nan, - 5.0, - np.nan, - 2.0, - 4.0, - 0.0, - 9.0, - np.nan, - np.nan, - 3.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - 2.0, - 3.0, - np.nan, - 3.0, - 4.0, - 5.0, - 6.0, - np.nan, - np.nan, - 7.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series( - [ - 2.0, - 5.0, - np.nan, - 2.0, - 4.0, - 0.0, - 9.0, - np.nan, - np.nan, - 3.0, - 12.0, - 13.0, - 14.0, - 15.0, - ] - ), - Series(range(10)), - Series(range(20, 0, -2)), - ] - - def create_dataframes(): - return [ - DataFrame(), - DataFrame(columns=["a"]), - DataFrame(columns=["a", "a"]), - DataFrame(columns=["a", "b"]), - DataFrame(np.arange(10).reshape((5, 2))), - DataFrame(np.arange(25).reshape((5, 5))), - DataFrame(np.arange(25).reshape((5, 5)), columns=["a", "b", 99, "d", "d"]), - ] + [DataFrame(s) for s in create_series()] - - def is_constant(x): - values = x.values.ravel() - return len(set(values[notna(values)])) == 1 - - def no_nans(x): - return x.notna().all().all() - - # data is a tuple(object, is_constant, no_nans) - data = create_series() + create_dataframes() - - return [(x, is_constant(x), no_nans(x)) for x in data] - - -_consistency_data = _create_consistency_data() - def _rolling_consistency_cases(): for window in [1, 2, 3, 10, 20]: @@ -1370,363 +925,10 @@ def _rolling_consistency_cases(): yield window, min_periods, center -class TestMomentsConsistency(Base): - base_functions = [ - (lambda v: Series(v).count(), None, "count"), - (lambda v: Series(v).max(), None, "max"), - (lambda v: Series(v).min(), None, "min"), - (lambda v: Series(v).sum(), None, "sum"), - (lambda v: Series(v).mean(), None, "mean"), - (lambda v: Series(v).std(), 1, "std"), - (lambda v: Series(v).cov(Series(v)), None, "cov"), - (lambda v: Series(v).corr(Series(v)), None, "corr"), - (lambda v: Series(v).var(), 1, "var"), - # restore once GH 8086 is fixed - # lambda v: Series(v).skew(), 3, 'skew'), - # (lambda v: Series(v).kurt(), 4, 'kurt'), - # restore once GH 8084 is fixed - # lambda v: Series(v).quantile(0.3), None, 'quantile'), - (lambda v: Series(v).median(), None, "median"), - (np.nanmax, 1, "max"), - (np.nanmin, 1, "min"), - (np.nansum, 1, "sum"), - (np.nanmean, 1, "mean"), - (lambda v: np.nanstd(v, ddof=1), 1, "std"), - (lambda v: np.nanvar(v, ddof=1), 1, "var"), - (np.nanmedian, 1, "median"), - ] - no_nan_functions = [ - (np.max, None, "max"), - (np.min, None, "min"), - (np.sum, None, "sum"), - (np.mean, None, "mean"), - (lambda v: np.std(v, ddof=1), 1, "std"), - (lambda v: 
np.var(v, ddof=1), 1, "var"), - (np.median, None, "median"), - ] - - def _create_data(self): - super()._create_data() - self.data = _consistency_data - +class TestRollingMomentsConsistency(ConsistencyBase): def setup_method(self, method): self._create_data() - def _test_moments_consistency( - self, - min_periods, - count, - mean, - mock_mean, - corr, - var_unbiased=None, - std_unbiased=None, - cov_unbiased=None, - var_biased=None, - std_biased=None, - cov_biased=None, - var_debiasing_factors=None, - ): - def _non_null_values(x): - values = x.values.ravel() - return set(values[notna(values)].tolist()) - - for (x, is_constant, no_nans) in self.data: - count_x = count(x) - mean_x = mean(x) - - if mock_mean: - # check that mean equals mock_mean - expected = mock_mean(x) - tm.assert_equal(mean_x, expected.astype("float64")) - - # check that correlation of a series with itself is either 1 or NaN - corr_x_x = corr(x, x) - - # assert _non_null_values(corr_x_x).issubset(set([1.])) - # restore once rolling_cov(x, x) is identically equal to var(x) - - if is_constant: - exp = x.max() if isinstance(x, Series) else x.max().max() - - # check mean of constant series - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = exp - tm.assert_equal(mean_x, expected) - - # check correlation of constant series with itself is NaN - expected[:] = np.nan - tm.assert_equal(corr_x_x, expected) - - if var_unbiased and var_biased and var_debiasing_factors: - # check variance debiasing factors - var_unbiased_x = var_unbiased(x) - var_biased_x = var_biased(x) - var_debiasing_factors_x = var_debiasing_factors(x) - tm.assert_equal(var_unbiased_x, var_biased_x * var_debiasing_factors_x) - - for (std, var, cov) in [ - (std_biased, var_biased, cov_biased), - (std_unbiased, var_unbiased, cov_unbiased), - ]: - - # check that var(x), std(x), and cov(x) are all >= 0 - var_x = var(x) - std_x = std(x) - assert not (var_x < 0).any().any() - assert not (std_x < 0).any().any() - if cov: - cov_x_x = cov(x, x) - assert not (cov_x_x < 0).any().any() - - # check that var(x) == cov(x, x) - tm.assert_equal(var_x, cov_x_x) - - # check that var(x) == std(x)^2 - tm.assert_equal(var_x, std_x * std_x) - - if var is var_biased: - # check that biased var(x) == mean(x^2) - mean(x)^2 - mean_x2 = mean(x * x) - tm.assert_equal(var_x, mean_x2 - (mean_x * mean_x)) - - if is_constant: - # check that variance of constant series is identically 0 - assert not (var_x > 0).any().any() - expected = x * np.nan - expected[count_x >= max(min_periods, 1)] = 0.0 - if var is var_unbiased: - expected[count_x < 2] = np.nan - tm.assert_equal(var_x, expected) - - if isinstance(x, Series): - for (y, is_constant, no_nans) in self.data: - if not x.isna().equals(y.isna()): - # can only easily test two Series with similar - # structure - continue - - # check that cor(x, y) is symmetric - corr_x_y = corr(x, y) - corr_y_x = corr(y, x) - tm.assert_equal(corr_x_y, corr_y_x) - - if cov: - # check that cov(x, y) is symmetric - cov_x_y = cov(x, y) - cov_y_x = cov(y, x) - tm.assert_equal(cov_x_y, cov_y_x) - - # check that cov(x, y) == (var(x+y) - var(x) - - # var(y)) / 2 - var_x_plus_y = var(x + y) - var_y = var(y) - tm.assert_equal( - cov_x_y, 0.5 * (var_x_plus_y - var_x - var_y) - ) - - # check that corr(x, y) == cov(x, y) / (std(x) * - # std(y)) - std_y = std(y) - tm.assert_equal(corr_x_y, cov_x_y / (std_x * std_y)) - - if cov is cov_biased: - # check that biased cov(x, y) == mean(x*y) - - # mean(x)*mean(y) - mean_y = mean(y) - mean_x_times_y = mean(x * y) - 
tm.assert_equal( - cov_x_y, mean_x_times_y - (mean_x * mean_y) - ) - - @pytest.mark.slow - @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) - @pytest.mark.parametrize("adjust", [True, False]) - @pytest.mark.parametrize("ignore_na", [True, False]) - def test_ewm_consistency(self, min_periods, adjust, ignore_na): - def _weights(s, com, adjust, ignore_na): - if isinstance(s, DataFrame): - if not len(s.columns): - return DataFrame(index=s.index, columns=s.columns) - w = concat( - [ - _weights( - s.iloc[:, i], com=com, adjust=adjust, ignore_na=ignore_na - ) - for i, _ in enumerate(s.columns) - ], - axis=1, - ) - w.index = s.index - w.columns = s.columns - return w - - w = Series(np.nan, index=s.index) - alpha = 1.0 / (1.0 + com) - if ignore_na: - w[s.notna()] = _weights( - s[s.notna()], com=com, adjust=adjust, ignore_na=False - ) - elif adjust: - for i in range(len(s)): - if s.iat[i] == s.iat[i]: - w.iat[i] = pow(1.0 / (1.0 - alpha), i) - else: - sum_wts = 0.0 - prev_i = -1 - for i in range(len(s)): - if s.iat[i] == s.iat[i]: - if prev_i == -1: - w.iat[i] = 1.0 - else: - w.iat[i] = alpha * sum_wts / pow(1.0 - alpha, i - prev_i) - sum_wts += w.iat[i] - prev_i = i - return w - - def _variance_debiasing_factors(s, com, adjust, ignore_na): - weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - cum_sum = weights.cumsum().fillna(method="ffill") - cum_sum_sq = (weights * weights).cumsum().fillna(method="ffill") - numerator = cum_sum * cum_sum - denominator = numerator - cum_sum_sq - denominator[denominator <= 0.0] = np.nan - return numerator / denominator - - def _ewma(s, com, min_periods, adjust, ignore_na): - weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na) - result = ( - s.multiply(weights) - .cumsum() - .divide(weights.cumsum()) - .fillna(method="ffill") - ) - result[ - s.expanding().count() < (max(min_periods, 1) if min_periods else 1) - ] = np.nan - return result - - com = 3.0 - # test consistency between different ewm* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: x.expanding().count(), - mean=lambda x: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).mean(), - mock_mean=lambda x: _ewma( - x, com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ), - corr=lambda x, y: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).corr(y), - var_unbiased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=False) - ), - std_unbiased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=False) - ), - cov_unbiased=lambda x, y: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(y, bias=False) - ), - var_biased=lambda x: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).var(bias=True) - ), - std_biased=lambda x: x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).std(bias=True), - cov_biased=lambda x, y: ( - x.ewm( - com=com, min_periods=min_periods, adjust=adjust, ignore_na=ignore_na - ).cov(y, bias=True) - ), - var_debiasing_factors=lambda x: ( - _variance_debiasing_factors( - x, com=com, adjust=adjust, ignore_na=ignore_na - ) - ), - ) - - @pytest.mark.slow - @pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4]) - def test_expanding_consistency(self, min_periods): - - # suppress warnings about empty slices, as we are deliberately testing - # 
with empty/0-length Series/DataFrames - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message=".*(empty slice|0 for slice).*", - category=RuntimeWarning, - ) - - # test consistency between different expanding_* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: x.expanding().count(), - mean=lambda x: x.expanding(min_periods=min_periods).mean(), - mock_mean=lambda x: x.expanding(min_periods=min_periods).sum() - / x.expanding().count(), - corr=lambda x, y: x.expanding(min_periods=min_periods).corr(y), - var_unbiased=lambda x: x.expanding(min_periods=min_periods).var(), - std_unbiased=lambda x: x.expanding(min_periods=min_periods).std(), - cov_unbiased=lambda x, y: x.expanding(min_periods=min_periods).cov(y), - var_biased=lambda x: x.expanding(min_periods=min_periods).var(ddof=0), - std_biased=lambda x: x.expanding(min_periods=min_periods).std(ddof=0), - cov_biased=lambda x, y: x.expanding(min_periods=min_periods).cov( - y, ddof=0 - ), - var_debiasing_factors=lambda x: ( - x.expanding().count() - / (x.expanding().count() - 1.0).replace(0.0, np.nan) - ), - ) - - # test consistency between expanding_xyz() and either (a) - # expanding_apply of Series.xyz(), or (b) expanding_apply of - # np.nanxyz() - for (x, is_constant, no_nans) in self.data: - functions = self.base_functions - - # GH 8269 - if no_nans: - functions = self.base_functions + self.no_nan_functions - for (f, require_min_periods, name) in functions: - expanding_f = getattr(x.expanding(min_periods=min_periods), name) - - if ( - require_min_periods - and (min_periods is not None) - and (min_periods < require_min_periods) - ): - continue - - if name == "count": - expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding(min_periods=0).apply( - func=f, raw=True - ) - else: - if name in ["cov", "corr"]: - expanding_f_result = expanding_f(pairwise=False) - else: - expanding_f_result = expanding_f() - expanding_apply_f_result = x.expanding( - min_periods=min_periods - ).apply(func=f, raw=True) - - # GH 9422 - if name in ["sum", "prod"]: - tm.assert_equal(expanding_f_result, expanding_apply_f_result) - @pytest.mark.slow @pytest.mark.parametrize( "window,min_periods,center", list(_rolling_consistency_cases()) @@ -1743,9 +945,7 @@ def test_rolling_consistency(self, window, min_periods, center): ) # test consistency between different rolling_* moments - self._test_moments_consistency( - min_periods=min_periods, - count=lambda x: (x.rolling(window=window, center=center).count()), + self._test_moments_consistency_mock_mean( mean=lambda x: ( x.rolling( window=window, min_periods=min_periods, center=center @@ -1760,6 +960,53 @@ def test_rolling_consistency(self, window, min_periods, center): ).count() ) ), + ) + + self._test_moments_consistency_is_constant( + min_periods=min_periods, + count=lambda x: (x.rolling(window=window, center=center).count()), + mean=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + ), + corr=lambda x, y: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).corr(y) + ), + ) + + self._test_moments_consistency_var_debiasing_factors( + var_unbiased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).var() + ), + var_biased=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).var(ddof=0) + ), + var_debiasing_factors=lambda x: ( + x.rolling(window=window, center=center) + .count() + .divide( + (x.rolling(window=window, 
center=center).count() - 1.0).replace( + 0.0, np.nan + ) + ) + ), + ) + + self._test_moments_consistency( + min_periods=min_periods, + count=lambda x: (x.rolling(window=window, center=center).count()), + mean=lambda x: ( + x.rolling( + window=window, min_periods=min_periods, center=center + ).mean() + ), corr=lambda x, y: ( x.rolling( window=window, min_periods=min_periods, center=center @@ -1795,15 +1042,6 @@ def test_rolling_consistency(self, window, min_periods, center): window=window, min_periods=min_periods, center=center ).cov(y, ddof=0) ), - var_debiasing_factors=lambda x: ( - x.rolling(window=window, center=center) - .count() - .divide( - (x.rolling(window=window, center=center).count() - 1.0).replace( - 0.0, np.nan - ) - ) - ), ) # test consistency between rolling_xyz() and either (a) @@ -1886,22 +1124,12 @@ def test_rolling_corr_with_zero_variance(self, window): assert s.rolling(window=window).corr(other=other).isna().all() - def _check_pairwise_moment(self, dispatch, name, **kwargs): - def get_result(obj, obj2=None): - return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) - - result = get_result(self.frame) - result = result.loc[(slice(None), 1), 5] - result.index = result.index.droplevel(1) - expected = get_result(self.frame[1], self.frame[5]) - tm.assert_series_equal(result, expected, check_names=False) - def test_flex_binary_moment(self): # GH3155 # don't blow the stack msg = ( - "arguments to moment function must be of type" - " np.ndarray/Series/DataFrame" + "arguments to moment function must be of type " + "np.ndarray/Series/DataFrame" ) with pytest.raises(TypeError, match=msg): _flex_binary_moment(5, 6, None) @@ -1956,156 +1184,6 @@ def test_flex_binary_frame(self, method): ) tm.assert_frame_equal(res3, exp) - def test_ewmcov(self): - self._check_binary_ew("cov") - - def test_ewmcov_pairwise(self): - self._check_pairwise_moment("ewm", "cov", span=10, min_periods=5) - - def test_ewmcorr(self): - self._check_binary_ew("corr") - - def test_ewmcorr_pairwise(self): - self._check_pairwise_moment("ewm", "corr", span=10, min_periods=5) - - def _check_binary_ew(self, name): - def func(A, B, com, **kwargs): - return getattr(A.ewm(com, **kwargs), name)(B) - - A = Series(randn(50), index=np.arange(50)) - B = A[2:] + randn(48) - - A[:10] = np.NaN - B[-10:] = np.NaN - - result = func(A, B, 20, min_periods=5) - assert np.isnan(result.values[:14]).all() - assert not np.isnan(result.values[14:]).any() - - # GH 7898 - for min_periods in (0, 1, 2): - result = func(A, B, 20, min_periods=min_periods) - # binary functions (ewmcov, ewmcorr) with bias=False require at - # least two values - assert np.isnan(result.values[:11]).all() - assert not np.isnan(result.values[11:]).any() - - # check series of length 0 - empty = Series([], dtype=np.float64) - result = func(empty, empty, 50, min_periods=min_periods) - tm.assert_series_equal(result, empty) - - # check series of length 1 - result = func(Series([1.0]), Series([1.0]), 50, min_periods=min_periods) - tm.assert_series_equal(result, Series([np.NaN])) - - msg = "Input arrays must be of the same type!" 
- # exception raised is Exception - with pytest.raises(Exception, match=msg): - func(A, randn(50), 20, min_periods=5) - - def test_expanding_apply_args_kwargs(self, raw): - def mean_w_arg(x, const): - return np.mean(x) + const - - df = DataFrame(np.random.rand(20, 3)) - - expected = df.expanding().apply(np.mean, raw=raw) + 20.0 - - result = df.expanding().apply(mean_w_arg, raw=raw, args=(20,)) - tm.assert_frame_equal(result, expected) - - result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) - tm.assert_frame_equal(result, expected) - - def test_expanding_corr(self): - A = self.series.dropna() - B = (A + randn(len(A)))[:-5] - - result = A.expanding().corr(B) - - rolling_result = A.rolling(window=len(A), min_periods=1).corr(B) - - tm.assert_almost_equal(rolling_result, result) - - def test_expanding_count(self): - result = self.series.expanding().count() - tm.assert_almost_equal( - result, self.series.rolling(window=len(self.series)).count() - ) - - def test_expanding_quantile(self): - result = self.series.expanding().quantile(0.5) - - rolling_result = self.series.rolling( - window=len(self.series), min_periods=1 - ).quantile(0.5) - - tm.assert_almost_equal(result, rolling_result) - - def test_expanding_cov(self): - A = self.series - B = (A + randn(len(A)))[:-5] - - result = A.expanding().cov(B) - - rolling_result = A.rolling(window=len(A), min_periods=1).cov(B) - - tm.assert_almost_equal(rolling_result, result) - - def test_expanding_cov_pairwise(self): - result = self.frame.expanding().corr() - - rolling_result = self.frame.rolling( - window=len(self.frame), min_periods=1 - ).corr() - - tm.assert_frame_equal(result, rolling_result) - - def test_expanding_corr_pairwise(self): - result = self.frame.expanding().corr() - - rolling_result = self.frame.rolling( - window=len(self.frame), min_periods=1 - ).corr() - tm.assert_frame_equal(result, rolling_result) - - def test_expanding_cov_diff_index(self): - # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) - result = s1.expanding().cov(s2) - expected = Series([None, None, 2.0]) - tm.assert_series_equal(result, expected) - - s2a = Series([1, None, 3], index=[0, 1, 2]) - result = s1.expanding().cov(s2a) - tm.assert_series_equal(result, expected) - - s1 = Series([7, 8, 10], index=[0, 1, 3]) - s2 = Series([7, 9, 10], index=[0, 2, 3]) - result = s1.expanding().cov(s2) - expected = Series([None, None, None, 4.5]) - tm.assert_series_equal(result, expected) - - def test_expanding_corr_diff_index(self): - # GH 7512 - s1 = Series([1, 2, 3], index=[0, 1, 2]) - s2 = Series([1, 3], index=[0, 2]) - result = s1.expanding().corr(s2) - expected = Series([None, None, 1.0]) - tm.assert_series_equal(result, expected) - - s2a = Series([1, None, 3], index=[0, 1, 2]) - result = s1.expanding().corr(s2a) - tm.assert_series_equal(result, expected) - - s1 = Series([7, 8, 10], index=[0, 1, 3]) - s2 = Series([7, 9, 10], index=[0, 2, 3]) - result = s1.expanding().corr(s2) - expected = Series([None, None, None, 1.0]) - tm.assert_series_equal(result, expected) - def test_rolling_cov_diff_length(self): # GH 7512 s1 = Series([1, 2, 3], index=[0, 1, 2]) @@ -2133,8 +1211,8 @@ def test_rolling_corr_diff_length(self): @pytest.mark.parametrize( "f", [ - lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=False)), - lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=False)), + lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).corr(x, 
pairwise=False), lambda x: x.rolling(window=10, min_periods=5).max(), lambda x: x.rolling(window=10, min_periods=5).min(), lambda x: x.rolling(window=10, min_periods=5).sum(), @@ -2150,6 +1228,7 @@ def test_rolling_corr_diff_length(self): lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), ], ) + @td.skip_if_no_scipy def test_rolling_functions_window_non_shrinkage(self, f): # GH 7764 s = Series(range(4)) @@ -2157,16 +1236,11 @@ def test_rolling_functions_window_non_shrinkage(self, f): df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=["A", "B"]) df_expected = DataFrame(np.nan, index=df.index, columns=df.columns) - try: - s_result = f(s) - tm.assert_series_equal(s_result, s_expected) + s_result = f(s) + tm.assert_series_equal(s_result, s_expected) - df_result = f(df) - tm.assert_frame_equal(df_result, df_expected) - except (ImportError): - - # scipy needed for rolling_window - pytest.skip("scipy not available") + df_result = f(df) + tm.assert_frame_equal(df_result, df_expected) def test_rolling_functions_window_non_shrinkage_binary(self): @@ -2191,154 +1265,6 @@ def test_rolling_functions_window_non_shrinkage_binary(self): df_result = f(df) tm.assert_frame_equal(df_result, df_expected) - def test_moment_functions_zero_length(self): - # GH 8056 - s = Series(dtype=np.float64) - s_expected = s - df1 = DataFrame() - df1_expected = df1 - df2 = DataFrame(columns=["a"]) - df2["a"] = df2["a"].astype("float64") - df2_expected = df2 - - functions = [ - lambda x: x.expanding().count(), - lambda x: x.expanding(min_periods=5).cov(x, pairwise=False), - lambda x: x.expanding(min_periods=5).corr(x, pairwise=False), - lambda x: x.expanding(min_periods=5).max(), - lambda x: x.expanding(min_periods=5).min(), - lambda x: x.expanding(min_periods=5).sum(), - lambda x: x.expanding(min_periods=5).mean(), - lambda x: x.expanding(min_periods=5).std(), - lambda x: x.expanding(min_periods=5).var(), - lambda x: x.expanding(min_periods=5).skew(), - lambda x: x.expanding(min_periods=5).kurt(), - lambda x: x.expanding(min_periods=5).quantile(0.5), - lambda x: x.expanding(min_periods=5).median(), - lambda x: x.expanding(min_periods=5).apply(sum, raw=False), - lambda x: x.expanding(min_periods=5).apply(sum, raw=True), - lambda x: x.rolling(window=10).count(), - lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), - lambda x: x.rolling(window=10, min_periods=5).max(), - lambda x: x.rolling(window=10, min_periods=5).min(), - lambda x: x.rolling(window=10, min_periods=5).sum(), - lambda x: x.rolling(window=10, min_periods=5).mean(), - lambda x: x.rolling(window=10, min_periods=5).std(), - lambda x: x.rolling(window=10, min_periods=5).var(), - lambda x: x.rolling(window=10, min_periods=5).skew(), - lambda x: x.rolling(window=10, min_periods=5).kurt(), - lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), - lambda x: x.rolling(window=10, min_periods=5).median(), - lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), - lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), - lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), - ] - for f in functions: - try: - s_result = f(s) - tm.assert_series_equal(s_result, s_expected) - - df1_result = f(df1) - tm.assert_frame_equal(df1_result, df1_expected) - - df2_result = f(df2) - tm.assert_frame_equal(df2_result, df2_expected) - except (ImportError): - - # scipy needed for rolling_window - continue - - def 
test_moment_functions_zero_length_pairwise(self): - - df1 = DataFrame() - df1_expected = df1 - df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) - df2["a"] = df2["a"].astype("float64") - - df1_expected = DataFrame( - index=pd.MultiIndex.from_product([df1.index, df1.columns]), - columns=Index([]), - ) - df2_expected = DataFrame( - index=pd.MultiIndex.from_product( - [df2.index, df2.columns], names=["bar", "foo"] - ), - columns=Index(["a"], name="foo"), - dtype="float64", - ) - - functions = [ - lambda x: (x.expanding(min_periods=5).cov(x, pairwise=True)), - lambda x: (x.expanding(min_periods=5).corr(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), - lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), - ] - for f in functions: - df1_result = f(df1) - tm.assert_frame_equal(df1_result, df1_expected) - - df2_result = f(df2) - tm.assert_frame_equal(df2_result, df2_expected) - - def test_expanding_cov_pairwise_diff_length(self): - # GH 7512 - df1 = DataFrame([[1, 5], [3, 2], [3, 9]], columns=Index(["A", "B"], name="foo")) - df1a = DataFrame( - [[1, 5], [3, 9]], index=[0, 2], columns=Index(["A", "B"], name="foo") - ) - df2 = DataFrame( - [[5, 6], [None, None], [2, 1]], columns=Index(["X", "Y"], name="foo") - ) - df2a = DataFrame( - [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo") - ) - # TODO: xref gh-15826 - # .loc is not preserving the names - result1 = df1.expanding().cov(df2a, pairwise=True).loc[2] - result2 = df1.expanding().cov(df2a, pairwise=True).loc[2] - result3 = df1a.expanding().cov(df2, pairwise=True).loc[2] - result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2] - expected = DataFrame( - [[-3.0, -6.0], [-5.0, -10.0]], - columns=Index(["A", "B"], name="foo"), - index=Index(["X", "Y"], name="foo"), - ) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) - tm.assert_frame_equal(result4, expected) - - def test_expanding_corr_pairwise_diff_length(self): - # GH 7512 - df1 = DataFrame( - [[1, 2], [3, 2], [3, 4]], - columns=["A", "B"], - index=Index(range(3), name="bar"), - ) - df1a = DataFrame( - [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"] - ) - df2 = DataFrame( - [[5, 6], [None, None], [2, 1]], - columns=["X", "Y"], - index=Index(range(3), name="bar"), - ) - df2a = DataFrame( - [[5, 6], [2, 1]], index=Index([0, 2], name="bar"), columns=["X", "Y"] - ) - result1 = df1.expanding().corr(df2, pairwise=True).loc[2] - result2 = df1.expanding().corr(df2a, pairwise=True).loc[2] - result3 = df1a.expanding().corr(df2, pairwise=True).loc[2] - result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2] - expected = DataFrame( - [[-1.0, -1.0], [-1.0, -1.0]], columns=["A", "B"], index=Index(["X", "Y"]) - ) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - tm.assert_frame_equal(result3, expected) - tm.assert_frame_equal(result4, expected) - def test_rolling_skew_edge_cases(self): all_nan = Series([np.NaN] * 5) @@ -2389,83 +1315,6 @@ def test_rolling_kurt_eq_value_fperr(self): a = Series([1.1] * 15).rolling(window=10).kurt() assert np.isnan(a).all() - @pytest.mark.parametrize( - "func,static_comp", - [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)], - ids=["sum", "mean", "max", "min"], - ) - def test_expanding_func(self, func, static_comp): - def expanding_func(x, min_periods=1, center=False, axis=0): - exp = 
x.expanding(min_periods=min_periods, center=center, axis=axis) - return getattr(exp, func)() - - self._check_expanding(expanding_func, static_comp, preserve_nan=False) - - def test_expanding_apply(self, raw): - def expanding_mean(x, min_periods=1): - - exp = x.expanding(min_periods=min_periods) - result = exp.apply(lambda x: x.mean(), raw=raw) - return result - - # TODO(jreback), needed to add preserve_nan=False - # here to make this pass - self._check_expanding(expanding_mean, np.mean, preserve_nan=False) - - ser = Series([], dtype=np.float64) - tm.assert_series_equal(ser, ser.expanding().apply(lambda x: x.mean(), raw=raw)) - - # GH 8080 - s = Series([None, None, None]) - result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw) - expected = Series([1.0, 2.0, 3.0]) - tm.assert_series_equal(result, expected) - - def _check_expanding( - self, - func, - static_comp, - has_min_periods=True, - has_time_rule=True, - preserve_nan=True, - ): - - series_result = func(self.series) - assert isinstance(series_result, Series) - frame_result = func(self.frame) - assert isinstance(frame_result, DataFrame) - - result = func(self.series) - tm.assert_almost_equal(result[10], static_comp(self.series[:11])) - - if preserve_nan: - assert result.iloc[self._nan_locs].isna().all() - - ser = Series(randn(50)) - - if has_min_periods: - result = func(ser, min_periods=30) - assert result[:29].isna().all() - tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) - - # min_periods is working correctly - result = func(ser, min_periods=15) - assert isna(result.iloc[13]) - assert notna(result.iloc[14]) - - ser2 = Series(randn(20)) - result = func(ser2, min_periods=5) - assert isna(result[3]) - assert notna(result[4]) - - # min_periods=0 - result0 = func(ser, min_periods=0) - result1 = func(ser, min_periods=1) - tm.assert_almost_equal(result0, result1) - else: - result = func(ser) - tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50])) - def test_rolling_max_gh6297(self): """Replicate result expected in GH #6297""" @@ -2587,3 +1436,76 @@ def test_rolling_min_max_numeric_types(self): assert result.dtypes[0] == np.dtype("f8") result = DataFrame(np.arange(20, dtype=data_type)).rolling(window=5).min() assert result.dtypes[0] == np.dtype("f8") + + def test_moment_functions_zero_length(self): + # GH 8056 + s = Series(dtype=np.float64) + s_expected = s + df1 = DataFrame() + df1_expected = df1 + df2 = DataFrame(columns=["a"]) + df2["a"] = df2["a"].astype("float64") + df2_expected = df2 + + functions = [ + lambda x: x.rolling(window=10).count(), + lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), + lambda x: x.rolling(window=10, min_periods=5).max(), + lambda x: x.rolling(window=10, min_periods=5).min(), + lambda x: x.rolling(window=10, min_periods=5).sum(), + lambda x: x.rolling(window=10, min_periods=5).mean(), + lambda x: x.rolling(window=10, min_periods=5).std(), + lambda x: x.rolling(window=10, min_periods=5).var(), + lambda x: x.rolling(window=10, min_periods=5).skew(), + lambda x: x.rolling(window=10, min_periods=5).kurt(), + lambda x: x.rolling(window=10, min_periods=5).quantile(0.5), + lambda x: x.rolling(window=10, min_periods=5).median(), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False), + lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True), + lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(), + ] + for f in functions: + try: + s_result = f(s) 
+ tm.assert_series_equal(s_result, s_expected) + + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) + except (ImportError): + + # scipy needed for rolling_window + continue + + def test_moment_functions_zero_length_pairwise(self): + + df1 = DataFrame() + df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar")) + df2["a"] = df2["a"].astype("float64") + + df1_expected = DataFrame( + index=pd.MultiIndex.from_product([df1.index, df1.columns]), + columns=Index([]), + ) + df2_expected = DataFrame( + index=pd.MultiIndex.from_product( + [df2.index, df2.columns], names=["bar", "foo"] + ), + columns=Index(["a"], name="foo"), + dtype="float64", + ) + + functions = [ + lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)), + lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)), + ] + + for f in functions: + df1_result = f(df1) + tm.assert_frame_equal(df1_result, df1_expected) + + df2_result = f(df2) + tm.assert_frame_equal(df2_result, df2_expected) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 5085576cc96f0..5e70e13209de5 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -7,9 +7,9 @@ import pandas as pd from pandas import DataFrame, Index, Series, Timestamp, concat +import pandas._testing as tm from pandas.core.base import SpecificationError from pandas.tests.window.common import Base -import pandas.util.testing as tm class TestApi(Base): diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py new file mode 100644 index 0000000000000..7132e64c1191c --- /dev/null +++ b/pandas/tests/window/test_apply.py @@ -0,0 +1,140 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame, Series, Timestamp, date_range +import pandas._testing as tm + + +@pytest.mark.parametrize("bad_raw", [None, 1, 0]) +def test_rolling_apply_invalid_raw(bad_raw): + with pytest.raises(ValueError, match="raw parameter must be `True` or `False`"): + Series(range(3)).rolling(1).apply(len, raw=bad_raw) + + +def test_rolling_apply_out_of_bounds(engine_and_raw): + # gh-1850 + engine, raw = engine_and_raw + + vals = Series([1, 2, 3, 4]) + + result = vals.rolling(10).apply(np.sum, engine=engine, raw=raw) + assert result.isna().all() + + result = vals.rolling(10, min_periods=1).apply(np.sum, engine=engine, raw=raw) + expected = Series([1, 3, 6, 10], dtype=float) + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize("window", [2, "2s"]) +def test_rolling_apply_with_pandas_objects(window): + # 5071 + df = DataFrame( + {"A": np.random.randn(5), "B": np.random.randint(0, 10, size=5)}, + index=date_range("20130101", periods=5, freq="s"), + ) + + # we have an equal spaced timeseries index + # so simulate removing the first period + def f(x): + if x.index[0] == df.index[0]: + return np.nan + return x.iloc[-1] + + result = df.rolling(window).apply(f, raw=False) + expected = df.iloc[2:].reindex_like(df) + tm.assert_frame_equal(result, expected) + + with pytest.raises(AttributeError): + df.rolling(window).apply(f, raw=True) + + +def test_rolling_apply(engine_and_raw): + engine, raw = engine_and_raw + + expected = Series([], dtype="float64") + result = expected.rolling(10).apply(lambda x: x.mean(), engine=engine, raw=raw) + tm.assert_series_equal(result, expected) + + # gh-8080 + s = Series([None, None, None]) + result = 
s.rolling(2, min_periods=0).apply(lambda x: len(x), engine=engine, raw=raw) + expected = Series([1.0, 2.0, 2.0]) + tm.assert_series_equal(result, expected) + + result = s.rolling(2, min_periods=0).apply(len, engine=engine, raw=raw) + tm.assert_series_equal(result, expected) + + +def test_all_apply(engine_and_raw): + engine, raw = engine_and_raw + + df = ( + DataFrame( + {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)} + ).set_index("A") + * 2 + ) + er = df.rolling(window=1) + r = df.rolling(window="1s") + + result = r.apply(lambda x: 1, engine=engine, raw=raw) + expected = er.apply(lambda x: 1, engine=engine, raw=raw) + tm.assert_frame_equal(result, expected) + + +def test_ragged_apply(engine_and_raw): + engine, raw = engine_and_raw + + df = DataFrame({"B": range(5)}) + df.index = [ + Timestamp("20130101 09:00:00"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:03"), + Timestamp("20130101 09:00:05"), + Timestamp("20130101 09:00:06"), + ] + + f = lambda x: 1 + result = df.rolling(window="1s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="2s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + result = df.rolling(window="5s", min_periods=1).apply(f, engine=engine, raw=raw) + expected = df.copy() + expected["B"] = 1.0 + tm.assert_frame_equal(result, expected) + + +def test_invalid_engine(): + with pytest.raises(ValueError, match="engine must be either 'numba' or 'cython'"): + Series(range(1)).rolling(1).apply(lambda x: x, engine="foo") + + +def test_invalid_engine_kwargs_cython(): + with pytest.raises(ValueError, match="cython engine does not accept engine_kwargs"): + Series(range(1)).rolling(1).apply( + lambda x: x, engine="cython", engine_kwargs={"nopython": False} + ) + + +def test_invalid_raw_numba(): + with pytest.raises( + ValueError, match="raw must be `True` when using the numba engine" + ): + Series(range(1)).rolling(1).apply(lambda x: x, raw=False, engine="numba") + + +@td.skip_if_no("numba") +def test_invalid_kwargs_nopython(): + with pytest.raises(ValueError, match="numba does not support kwargs with"): + Series(range(1)).rolling(1).apply( + lambda x: x, kwargs={"a": 1}, engine="numba", raw=True + ) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 6a3f2c19babdc..606520c6d68ca 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -2,9 +2,9 @@ import pytest from pandas import DataFrame, Series +import pandas._testing as tm from pandas.api.indexers import BaseIndexer from pandas.core.window.indexers import ExpandingIndexer -import pandas.util.testing as tm def test_bad_get_window_bounds_signature(): diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py index 9d023034c570a..b1c9b66ab09d3 100644 --- a/pandas/tests/window/test_dtypes.py +++ b/pandas/tests/window/test_dtypes.py @@ -4,8 +4,8 @@ import pytest from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.base import DataError -import pandas.util.testing as tm # gh-12373 : rolling functions error on float32 data # make sure rolling functions works for different dtypes diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 098acdff93ac6..fc4bd50f25c73 100644 --- 
a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -5,9 +5,9 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.window import Expanding from pandas.tests.window.common import Base -import pandas.util.testing as tm class TestExpanding(Base): diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 189942bc07d2a..355ef3a90d424 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -3,8 +3,8 @@ import pandas as pd from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.groupby.groupby import get_groupby -import pandas.util.testing as tm class TestGrouperGrouping: diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py new file mode 100644 index 0000000000000..cc8aef1779b46 --- /dev/null +++ b/pandas/tests/window/test_numba.py @@ -0,0 +1,74 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import Series +import pandas._testing as tm + + +@td.skip_if_no("numba", "0.46.0") +@pytest.mark.filterwarnings("ignore:\\nThe keyword argument") +# Filter warnings when parallel=True and the function can't be parallelized by Numba +class TestApply: + @pytest.mark.parametrize("jit", [True, False]) + def test_numba_vs_cython(self, jit, nogil, parallel, nopython): + def f(x, *args): + arg_sum = 0 + for arg in args: + arg_sum += arg + return np.mean(x) + arg_sum + + if jit: + import numba + + f = numba.jit(f) + + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + args = (2,) + + s = Series(range(10)) + result = s.rolling(2).apply( + f, args=args, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = s.rolling(2).apply(f, engine="cython", args=args, raw=True) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("jit", [True, False]) + def test_cache(self, jit, nogil, parallel, nopython): + # Test that the functions are cached correctly if we switch functions + def func_1(x): + return np.mean(x) + 4 + + def func_2(x): + return np.std(x) * 5 + + if jit: + import numba + + func_1 = numba.jit(func_1) + func_2 = numba.jit(func_2) + + engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} + + roll = Series(range(10)).rolling(2) + result = roll.apply( + func_1, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_1, engine="cython", raw=True) + tm.assert_series_equal(result, expected) + + # func_1 should be in the cache now + assert func_1 in roll._numba_func_cache + + result = roll.apply( + func_2, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_2, engine="cython", raw=True) + tm.assert_series_equal(result, expected) + # This run should use the cached func_1 + result = roll.apply( + func_1, engine="numba", engine_kwargs=engine_kwargs, raw=True + ) + expected = roll.apply(func_1, engine="cython", raw=True) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 6f6d4c09526ff..717273cff64ea 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -3,8 +3,8 @@ import pytest from pandas import DataFrame, Series +import pandas._testing as tm from pandas.core.algorithms import safe_sort -import pandas.util.testing as tm class TestPairwise: diff --git 
a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 227055eb222f8..ff435f8386a85 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -8,9 +8,9 @@ import pandas as pd from pandas import DataFrame, Index, Series +import pandas._testing as tm from pandas.core.window import Rolling from pandas.tests.window.common import Base -import pandas.util.testing as tm class TestRolling(Base): @@ -32,23 +32,34 @@ def test_constructor(self, which): c = o.rolling # valid + c(0) c(window=2) c(window=2, min_periods=1) c(window=2, min_periods=1, center=True) c(window=2, min_periods=1, center=False) # GH 13383 - with pytest.raises(ValueError): - c(0) + + msg = "window must be non-negative" + + with pytest.raises(ValueError, match=msg): c(-1) # not valid for w in [2.0, "foo", np.array([2])]: - with pytest.raises(ValueError): + msg = ( + "window must be an integer|" + "passed window foo is not compatible with a datetimelike index" + ) + with pytest.raises(ValueError, match=msg): c(window=w) - with pytest.raises(ValueError): + + msg = "min_periods must be an integer" + with pytest.raises(ValueError, match=msg): c(window=2, min_periods=w) - with pytest.raises(ValueError): + + msg = "center must be a boolean" + with pytest.raises(ValueError, match=msg): c(window=2, min_periods=1, center=w) @td.skip_if_no_scipy @@ -57,7 +68,10 @@ def test_constructor_with_win_type(self, which): # GH 13383 o = getattr(self, which) c = o.rolling - with pytest.raises(ValueError): + + msg = "window must be > 0" + + with pytest.raises(ValueError, match=msg): c(-1, win_type="boxcar") @pytest.mark.parametrize("window", [timedelta(days=3), pd.Timedelta(days=3)]) @@ -113,7 +127,10 @@ def test_numpy_compat(self, method): def test_closed(self): df = DataFrame({"A": [0, 1, 2, 3, 4]}) # closed only allowed for datetimelike - with pytest.raises(ValueError): + + msg = "closed only implemented for datetimelike and offset based windows" + + with pytest.raises(ValueError, match=msg): df.rolling(window=3, closed="neither") @pytest.mark.parametrize("closed", ["neither", "left"]) @@ -296,7 +313,10 @@ def test_iter_raises(self, klass): # https://github.com/pandas-dev/pandas/issues/11704 # Iteration over a Window obj = klass([1, 2, 3, 4]) - with pytest.raises(NotImplementedError): + + msg = "See issue #11704 https://github.com/pandas-dev/pandas/issues/11704" + + with pytest.raises(NotImplementedError, match=msg): iter(obj.rolling(2)) def test_rolling_axis_sum(self, axis_frame): diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 46582b4b50c84..5f5e10b5dd497 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -10,7 +10,7 @@ date_range, to_datetime, ) -import pandas.util.testing as tm +import pandas._testing as tm import pandas.tseries.offsets as offsets @@ -566,26 +566,6 @@ def test_freqs_ops(self, freq, op, result_data): tm.assert_series_equal(result, expected) - def test_ragged_apply(self, raw): - - df = self.ragged - - f = lambda x: 1 - result = df.rolling(window="1s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="2s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - - result = df.rolling(window="5s", min_periods=1).apply(f, raw=raw) - expected = df.copy() - 
expected["B"] = 1.0 - tm.assert_frame_equal(result, expected) - def test_all(self): # simple comparison of integer vs time-based windowing @@ -614,16 +594,6 @@ def test_all(self): expected = er.quantile(0.5) tm.assert_frame_equal(result, expected) - def test_all_apply(self, raw): - - df = self.regular * 2 - er = df.rolling(window=1) - r = df.rolling(window="1s") - - result = r.apply(lambda x: 1, raw=raw) - expected = er.apply(lambda x: 1, raw=raw) - tm.assert_frame_equal(result, expected) - def test_all2(self): # more sophisticated comparison of integer vs. diff --git a/pandas/tests/window/test_window.py b/pandas/tests/window/test_window.py index 39ab3ffd9319e..cc29ab4f2cd62 100644 --- a/pandas/tests/window/test_window.py +++ b/pandas/tests/window/test_window.py @@ -65,7 +65,7 @@ def test_agg_function_support(self, arg): df = pd.DataFrame({"A": np.arange(5)}) roll = df.rolling(2, win_type="triang") - msg = "'{arg}' is not a valid function for 'Window' object".format(arg=arg) + msg = f"'{arg}' is not a valid function for 'Window' object" with pytest.raises(AttributeError, match=msg): roll.agg(arg) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 26b13b42b1af6..af34180fb3170 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -1,6 +1,7 @@ from datetime import timedelta import re from typing import Dict, Optional +import warnings import numpy as np from pytz import AmbiguousTimeError @@ -125,7 +126,7 @@ def to_offset(freq) -> Optional[DateOffset]: if isinstance(stride, str): name, stride = stride, name name, _ = libfreqs._base_and_stride(name) - delta = get_offset(name) * stride + delta = _get_offset(name) * stride elif isinstance(freq, timedelta): delta = None @@ -166,7 +167,7 @@ def to_offset(freq) -> Optional[DateOffset]: float(stride), prefix ) stride = int(stride) - offset = get_offset(name) + offset = _get_offset(name) offset = offset * int(np.fabs(stride) * stride_sign) if delta is None: delta = offset @@ -185,10 +186,29 @@ def get_offset(name: str) -> DateOffset: """ Return DateOffset object associated with rule name. + .. deprecated:: 1.0.0 + Examples -------- get_offset('EOM') --> BMonthEnd(1) """ + warnings.warn( + "get_offset is deprecated and will be removed in a future version, " + "use to_offset instead", + FutureWarning, + stacklevel=2, + ) + return _get_offset(name) + + +def _get_offset(name: str) -> DateOffset: + """ + Return DateOffset object associated with rule name. + + Examples + -------- + _get_offset('EOM') --> BMonthEnd(1) + """ if name not in libfreqs._dont_uppercase: name = name.upper() name = libfreqs._lite_rule_alias.get(name, name) @@ -224,7 +244,7 @@ def infer_freq(index, warn: bool = True) -> Optional[str]: Parameters ---------- index : DatetimeIndex or TimedeltaIndex - if passed a Series will use the values of the series (NOT THE INDEX). + If passed a Series will use the values of the series (NOT THE INDEX). 
warn : bool, default True Returns @@ -314,7 +334,7 @@ def is_unique(self) -> bool: return len(self.deltas) == 1 @cache_readonly - def is_unique_asi8(self): + def is_unique_asi8(self) -> bool: return len(self.deltas_asi8) == 1 def get_freq(self) -> Optional[str]: diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 2e5477ea00e39..62d7c26b590cc 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -186,16 +186,16 @@ class from pandas.tseries.offsets def __repr__(self) -> str: info = "" if self.year is not None: - info += "year={year}, ".format(year=self.year) - info += "month={mon}, day={day}, ".format(mon=self.month, day=self.day) + info += f"year={self.year}, " + info += f"month={self.month}, day={self.day}, " if self.offset is not None: - info += "offset={offset}".format(offset=self.offset) + info += f"offset={self.offset}" if self.observance is not None: - info += "observance={obs}".format(obs=self.observance) + info += f"observance={self.observance}" - repr = "Holiday: {name} ({info})".format(name=self.name, info=info) + repr = f"Holiday: {self.name} ({info})" return repr def dates(self, start_date, end_date, return_name=False): @@ -394,8 +394,7 @@ def holidays(self, start=None, end=None, return_name=False): """ if self.rules is None: raise Exception( - "Holiday Calendar {name} does not have any " - "rules specified".format(name=self.name) + f"Holiday Calendar {self.name} does not have any rules specified" ) if start is None: diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 75698f7354bf9..220ff241efa0c 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -2,6 +2,7 @@ import functools import operator from typing import Any, Optional +import warnings from dateutil.easter import easter import numpy as np @@ -25,7 +26,7 @@ BaseOffset, _get_calendar, _is_normalized, - _to_dt64, + _to_dt64D, apply_index_wraps, as_datetime, roll_yearday, @@ -165,7 +166,7 @@ class DateOffset(BaseOffset): that conform to the DateOffset. For example, Bday defines this set to be the set of dates that are weekdays (M-F). To test if a date is in the set of a DateOffset dateOffset we can use the - onOffset method: dateOffset.onOffset(date). + is_on_offset method: dateOffset.is_on_offset(date). If a date is not on a valid date, the rollback and rollforward methods can be used to roll the date to the nearest valid date @@ -251,6 +252,7 @@ def __add__(date): _use_relativedelta = False _adjust_dst = False _attributes = frozenset(["n", "normalize"] + list(liboffsets.relativedelta_kwds)) + _deprecations = frozenset(["isAnchored", "onOffset"]) # default for prior pickles normalize = False @@ -363,15 +365,31 @@ def apply_index(self, i): "applied vectorized" ) - def isAnchored(self): + def is_anchored(self) -> bool: # TODO: Does this make sense for the general case? It would help - # if there were a canonical docstring for what isAnchored means. + # if there were a canonical docstring for what is_anchored means. 
return self.n == 1 + def onOffset(self, dt): + warnings.warn( + "onOffset is deprecated, use is_on_offset instead", + FutureWarning, + stacklevel=2, + ) + return self.is_on_offset(dt) + + def isAnchored(self) -> bool: + warnings.warn( + "isAnchored is deprecated, use is_anchored instead", + FutureWarning, + stacklevel=2, + ) + return self.is_anchored() + # TODO: Combine this with BusinessMixin version by defining a whitelisted # set of attributes on each object rather than the existing behavior of # iterating over internal ``__dict__`` - def _repr_attrs(self): + def _repr_attrs(self) -> str: exclude = {"n", "inc", "normalize"} attrs = [] for attr in sorted(self.__dict__): @@ -387,7 +405,7 @@ def _repr_attrs(self): return out @property - def name(self): + def name(self) -> str: return self.rule_code def rollback(self, dt): @@ -400,7 +418,7 @@ def rollback(self, dt): Rolled timestamp if not on offset, otherwise unchanged timestamp. """ dt = as_timestamp(dt) - if not self.onOffset(dt): + if not self.is_on_offset(dt): dt = dt - type(self)(1, normalize=self.normalize, **self.kwds) return dt @@ -414,11 +432,11 @@ def rollforward(self, dt): Rolled timestamp if not on offset, otherwise unchanged timestamp. """ dt = as_timestamp(dt) - if not self.onOffset(dt): + if not self.is_on_offset(dt): dt = dt + type(self)(1, normalize=self.normalize, **self.kwds) return dt - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False # XXX, see #1395 @@ -434,15 +452,15 @@ def onOffset(self, dt): # way to get around weirdness with rule_code @property - def _prefix(self): + def _prefix(self) -> str: raise NotImplementedError("Prefix not defined") @property - def rule_code(self): + def rule_code(self) -> str: return self._prefix @cache_readonly - def freqstr(self): + def freqstr(self) -> str: try: code = self.rule_code except NotImplementedError: @@ -462,7 +480,7 @@ def freqstr(self): return fstr - def _offset_str(self): + def _offset_str(self) -> str: return "" @property @@ -511,11 +529,11 @@ def offset(self): # Alias for backward compat return self._offset - def _repr_attrs(self): + def _repr_attrs(self) -> str: if self.offset: attrs = [f"offset={repr(self.offset)}"] else: - attrs = None + attrs = [] out = "" if attrs: out += ": " + ", ".join(attrs) @@ -535,7 +553,7 @@ def __init__(self, n=1, normalize=False, offset=timedelta(0)): BaseOffset.__init__(self, n, normalize) object.__setattr__(self, "_offset", offset) - def _offset_str(self): + def _offset_str(self) -> str: def get_str(td): off_str = "" if td.days > 0: @@ -631,7 +649,7 @@ def apply_index(self, i): result = shifted.to_timestamp() + time return result - def onOffset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.weekday() < 5 @@ -728,7 +746,7 @@ def _next_opening_time(self, other, sign=1): earliest_start = self.start[0] latest_start = self.start[-1] - if not self.next_bday.onOffset(other): + if not self.next_bday.is_on_offset(other): # today is not business day other = other + sign * self.next_bday if self.n * sign >= 0: @@ -795,7 +813,7 @@ def rollback(self, dt): """ Roll provided date backward to next offset only if not on offset. """ - if not self.onOffset(dt): + if not self.is_on_offset(dt): if self.n >= 0: dt = self._prev_opening_time(dt) else: @@ -808,7 +826,7 @@ def rollforward(self, dt): """ Roll provided date forward to next offset only if not on offset. 
""" - if not self.onOffset(dt): + if not self.is_on_offset(dt): if self.n >= 0: return self._next_opening_time(dt) else: @@ -856,13 +874,13 @@ def apply(self, other): # adjust other to reduce number of cases to handle if n >= 0: - if other.time() in self.end or not self._onOffset(other): + if other.time() in self.end or not self._is_on_offset(other): other = self._next_opening_time(other) else: if other.time() in self.start: # adjustment to move to previous business day other = other - timedelta(seconds=1) - if not self._onOffset(other): + if not self._is_on_offset(other): other = self._next_opening_time(other) other = self._get_closing_time(other) @@ -878,9 +896,17 @@ def apply(self, other): # adjust by business days first if bd != 0: - skip_bd = BusinessDay(n=bd) + if isinstance(self, _CustomMixin): # GH 30593 + skip_bd = CustomBusinessDay( + n=bd, + weekmask=self.weekmask, + holidays=self.holidays, + calendar=self.calendar, + ) + else: + skip_bd = BusinessDay(n=bd) # midnight business hour may not on BusinessDay - if not self.next_bday.onOffset(other): + if not self.next_bday.is_on_offset(other): prev_open = self._prev_opening_time(other) remain = other - prev_open other = prev_open + skip_bd + remain @@ -929,7 +955,7 @@ def apply(self, other): else: raise ApplyTypeError("Only know how to combine business hour with datetime") - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False @@ -939,9 +965,9 @@ def onOffset(self, dt): ) # Valid BH can be on the different BusinessDay during midnight # Distinguish by the time spent from previous opening time - return self._onOffset(dt) + return self._is_on_offset(dt) - def _onOffset(self, dt): + def _is_on_offset(self, dt): """ Slight speedups using calculated values. 
""" @@ -1061,10 +1087,10 @@ def apply(self, other): def apply_index(self, i): raise NotImplementedError - def onOffset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False - day64 = _to_dt64(dt, "datetime64[D]") + day64 = _to_dt64D(dt) return np.is_busday(day64, busdaycal=self.calendar) @@ -1108,14 +1134,14 @@ class MonthOffset(SingleConstructorOffset): __init__ = BaseOffset.__init__ @property - def name(self): - if self.isAnchored: + def name(self) -> str: + if self.is_anchored: return self.rule_code else: month = ccalendar.MONTH_ALIASES[self.n] return f"{self.code_rule}-{month}" - def onOffset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.day == self._get_offset_day(dt) @@ -1197,7 +1223,7 @@ class _CustomBusinessMonth(_CustomMixin, BusinessMixin, MonthOffset): ["n", "normalize", "weekmask", "holidays", "calendar", "offset"] ) - onOffset = DateOffset.onOffset # override MonthOffset method + is_on_offset = DateOffset.is_on_offset # override MonthOffset method apply_index = DateOffset.apply_index # override MonthOffset method def __init__( @@ -1307,7 +1333,7 @@ def _from_name(cls, suffix=None): return cls(day_of_month=suffix) @property - def rule_code(self): + def rule_code(self) -> str: suffix = f"-{self.day_of_month}" return self._prefix + suffix @@ -1403,7 +1429,7 @@ class SemiMonthEnd(SemiMonthOffset): _prefix = "SM" _min_day_of_month = 1 - def onOffset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False days_in_month = ccalendar.get_days_in_month(dt.year, dt.month) @@ -1461,7 +1487,7 @@ class SemiMonthBegin(SemiMonthOffset): _prefix = "SMS" - def onOffset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.day in (1, self.day_of_month) @@ -1530,7 +1556,7 @@ def __init__(self, n=1, normalize=False, weekday=None): if self.weekday < 0 or self.weekday > 6: raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}") - def isAnchored(self): + def is_anchored(self) -> bool: return self.n == 1 and self.weekday is not None @apply_wraps @@ -1606,7 +1632,7 @@ def _end_apply_index(self, dtindex): return base + off + Timedelta(1, "ns") - Timedelta(1, "D") - def onOffset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False elif self.weekday is None: @@ -1614,7 +1640,7 @@ def onOffset(self, dt): return dt.weekday() == self.weekday @property - def rule_code(self): + def rule_code(self) -> str: suffix = "" if self.weekday is not None: weekday = ccalendar.int_to_weekday[self.weekday] @@ -1649,7 +1675,7 @@ def apply(self, other): to_day = self._get_offset_day(shifted) return liboffsets.shift_day(shifted, to_day - shifted.day) - def onOffset(self, dt): + def is_on_offset(self, dt): if self.normalize and not _is_normalized(dt): return False return dt.day == self._get_offset_day(dt) @@ -1691,7 +1717,7 @@ def __init__(self, n=1, normalize=False, week=0, weekday=0): if self.week < 0 or self.week > 3: raise ValueError(f"Week must be 0<=week<=3, got {self.week}") - def _get_offset_day(self, other): + def _get_offset_day(self, other: datetime) -> int: """ Find the day in the same month as other that has the same weekday as self.weekday and is the self.week'th such day in the month. 
@@ -1710,7 +1736,7 @@ def _get_offset_day(self, other): return 1 + shift_days + self.week * 7 @property - def rule_code(self): + def rule_code(self) -> str: weekday = ccalendar.int_to_weekday.get(self.weekday, "") return f"{self._prefix}-{self.week + 1}{weekday}" @@ -1759,7 +1785,7 @@ def __init__(self, n=1, normalize=False, weekday=0): if self.weekday < 0 or self.weekday > 6: raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}") - def _get_offset_day(self, other): + def _get_offset_day(self, other: datetime) -> int: """ Find the day in the same month as other that has the same weekday as self.weekday and is the last such day in the month. @@ -1779,7 +1805,7 @@ def _get_offset_day(self, other): return dim - shift_days @property - def rule_code(self): + def rule_code(self) -> str: weekday = ccalendar.int_to_weekday.get(self.weekday, "") return f"{self._prefix}-{weekday}" @@ -1806,7 +1832,7 @@ class QuarterOffset(DateOffset): _adjust_dst = True _attributes = frozenset(["n", "normalize", "startingMonth"]) # TODO: Consider combining QuarterOffset and YearOffset __init__ at some - # point. Also apply_index, onOffset, rule_code if + # point. Also apply_index, is_on_offset, rule_code if # startingMonth vs month attr names are resolved def __init__(self, n=1, normalize=False, startingMonth=None): @@ -1816,7 +1842,7 @@ def __init__(self, n=1, normalize=False, startingMonth=None): startingMonth = self._default_startingMonth object.__setattr__(self, "startingMonth", startingMonth) - def isAnchored(self): + def is_anchored(self) -> bool: return self.n == 1 and self.startingMonth is not None @classmethod @@ -1830,7 +1856,7 @@ def _from_name(cls, suffix=None): return cls(**kwargs) @property - def rule_code(self): + def rule_code(self) -> str: month = ccalendar.MONTH_ALIASES[self.startingMonth] return f"{self._prefix}-{month}" @@ -1838,7 +1864,7 @@ def rule_code(self): def apply(self, other): # months_since: find the calendar quarter containing other.month, # e.g. if other.month == 8, the calendar quarter is [Jul, Aug, Sep]. - # Then find the month in that quarter containing an onOffset date for + # Then find the month in that quarter containing an is_on_offset date for # self. `months_since` is the number of months to shift other.month # to get to this on-offset month. 
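# For instance (illustrative values): with startingMonth=2 and other.month == 8,
# months_since = 8 % 3 - 2 % 3 == 0, i.e. August is already an on-offset month
# and only whole-quarter shifts are needed.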
months_since = other.month % 3 - self.startingMonth % 3 @@ -1848,7 +1874,7 @@ def apply(self, other): months = qtrs * 3 - months_since return shift_month(other, months, self._day_opt) - def onOffset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False mod_month = (dt.month - self.startingMonth) % 3 @@ -1927,7 +1953,7 @@ class YearOffset(DateOffset): _adjust_dst = True _attributes = frozenset(["n", "normalize", "month"]) - def _get_offset_day(self, other): + def _get_offset_day(self, other: datetime) -> int: # override BaseOffset method to use self.month instead of other.month # TODO: there may be a more performant way to do this return liboffsets.get_day_of_month( @@ -1951,7 +1977,7 @@ def apply_index(self, dtindex): shifted, freq=dtindex.freq, dtype=dtindex.dtype ) - def onOffset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.month == self.month and dt.day == self._get_offset_day(dt) @@ -1973,7 +1999,7 @@ def _from_name(cls, suffix=None): return cls(**kwargs) @property - def rule_code(self): + def rule_code(self) -> str: month = ccalendar.MONTH_ALIASES[self.month] return f"{self._prefix}-{month}" @@ -2091,12 +2117,12 @@ def __init__( if self.variation not in ["nearest", "last"]: raise ValueError(f"{self.variation} is not a valid variation") - def isAnchored(self): + def is_anchored(self) -> bool: return ( self.n == 1 and self.startingMonth is not None and self.weekday is not None ) - def onOffset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False dt = datetime(dt.year, dt.month, dt.day) @@ -2191,18 +2217,18 @@ def get_year_end(self, dt): return target_date + timedelta(days_forward - 7) @property - def rule_code(self): + def rule_code(self) -> str: prefix = self._prefix suffix = self.get_rule_code_suffix() return f"{prefix}-{suffix}" - def _get_suffix_prefix(self): + def _get_suffix_prefix(self) -> str: if self.variation == "nearest": return "N" else: return "L" - def get_rule_code_suffix(self): + def get_rule_code_suffix(self) -> str: prefix = self._get_suffix_prefix() month = ccalendar.MONTH_ALIASES[self.startingMonth] weekday = ccalendar.int_to_weekday[self.weekday] @@ -2320,8 +2346,8 @@ def _offset(self): variation=self.variation, ) - def isAnchored(self): - return self.n == 1 and self._offset.isAnchored() + def is_anchored(self) -> bool: + return self.n == 1 and self._offset.is_anchored() def _rollback_to_year(self, other): """ @@ -2347,7 +2373,7 @@ def _rollback_to_year(self, other): norm = Timestamp(other).tz_localize(None) start = self._offset.rollback(norm) - # Note: start <= norm and self._offset.onOffset(start) + # Note: start <= norm and self._offset.is_on_offset(start) if start < norm: # roll adjustment @@ -2355,7 +2381,7 @@ def _rollback_to_year(self, other): # check thet qtr_lens is consistent with self._offset addition end = liboffsets.shift_day(start, days=7 * sum(qtr_lens)) - assert self._offset.onOffset(end), (start, end, qtr_lens) + assert self._offset.is_on_offset(end), (start, end, qtr_lens) tdelta = norm - start for qlen in qtr_lens: @@ -2408,7 +2434,7 @@ def get_weeks(self, dt): return ret - def year_has_extra_week(self, dt): + def year_has_extra_week(self, dt: datetime) -> bool: # Avoid round-down errors --> normalize to get # e.g. 
'370D' instead of '360D23H' norm = Timestamp(dt).normalize().tz_localize(None) @@ -2419,10 +2445,10 @@ def year_has_extra_week(self, dt): assert weeks_in_year in [52, 53], weeks_in_year return weeks_in_year == 53 - def onOffset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False - if self._offset.onOffset(dt): + if self._offset.is_on_offset(dt): return True next_year_end = dt - self._offset @@ -2437,7 +2463,7 @@ def onOffset(self, dt): return False @property - def rule_code(self): + def rule_code(self) -> str: suffix = self._offset.get_rule_code_suffix() qtr = self.qtr_with_extra_week return f"{self._prefix}-{suffix}-{qtr}" @@ -2490,7 +2516,7 @@ def apply(self, other): ) return new - def onOffset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return date(dt.year, dt.month, dt.day) == easter(dt.year) @@ -2570,7 +2596,7 @@ def __eq__(self, other: Any) -> bool: # This is identical to DateOffset.__hash__, but has to be redefined here # for Python 3, because we've redefined __eq__. - def __hash__(self): + def __hash__(self) -> int: return hash(self._params) def __ne__(self, other): @@ -2591,7 +2617,7 @@ def __ne__(self, other): return True @property - def delta(self): + def delta(self) -> Timedelta: return self.n * self._inc @property @@ -2622,11 +2648,11 @@ def apply(self, other): raise ApplyTypeError(f"Unhandled type: {type(other).__name__}") - def isAnchored(self): + def is_anchored(self) -> bool: return False -def _delta_to_tick(delta): +def _delta_to_tick(delta: timedelta) -> Tick: if delta.microseconds == 0 and getattr(delta, "nanoseconds", 0) == 0: # nanoseconds only for pd.Timedelta if delta.seconds == 0: @@ -2641,8 +2667,8 @@ def _delta_to_tick(delta): return Second(seconds) else: nanos = delta_to_nanoseconds(delta) - if nanos % 1000000 == 0: - return Milli(nanos // 1000000) + if nanos % 1_000_000 == 0: + return Milli(nanos // 1_000_000) elif nanos % 1000 == 0: return Micro(nanos // 1000) else: # pragma: no cover @@ -2727,10 +2753,10 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): end = Timestamp(end) end = end if end is not NaT else None - if start and not offset.onOffset(start): + if start and not offset.is_on_offset(start): start = offset.rollforward(start) - elif end and not offset.onOffset(end): + elif end and not offset.is_on_offset(end): end = offset.rollback(end) if periods is None and end < start and offset.n >= 0: diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py index d906c0371d207..b5271dbc0443e 100644 --- a/pandas/util/__init__.py +++ b/pandas/util/__init__.py @@ -1,3 +1,30 @@ from pandas.util._decorators import Appender, Substitution, cache_readonly # noqa +from pandas import compat from pandas.core.util.hashing import hash_array, hash_pandas_object # noqa + +# compatibility for import pandas; pandas.util.testing + +if compat.PY37: + + def __getattr__(name): + if name == "testing": + import pandas.util.testing + + return pandas.util.testing + else: + raise AttributeError(f"module 'pandas.util' has no attribute '{name}'") + + +else: + + class _testing: + def __getattr__(self, item): + import pandas.util.testing + + return getattr(pandas.util.testing, item) + + testing = _testing() + + +del compat diff --git a/pandas/util/_depr_module.py b/pandas/util/_depr_module.py index 5733663dd7ab3..5694ca24aab57 100644 --- a/pandas/util/_depr_module.py +++ b/pandas/util/_depr_module.py @@ -46,7 +46,7 @@ def 
__repr__(self) -> str: __str__ = __repr__ - def __getattr__(self, name): + def __getattr__(self, name: str): if name in self.self_dir: return object.__getattribute__(self, name) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 7e14ed27d5bd4..d8804994af426 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -32,6 +32,7 @@ def test_foo(): import pytest from pandas.compat import is_platform_32bit, is_platform_windows +from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import _np_version from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR @@ -191,7 +192,7 @@ def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: ) skip_if_no_ne = pytest.mark.skipif( not _USE_NUMEXPR, - reason=f"numexpr enabled->{_USE_NUMEXPR}, " f"installed->{_NUMEXPR_INSTALLED}", + reason=f"numexpr enabled->{_USE_NUMEXPR}, installed->{_NUMEXPR_INSTALLED}", ) @@ -251,3 +252,13 @@ def new_func(*args, **kwargs): assert flist2 == flist return new_func + + +def async_mark(): + try: + import_optional_dependency("pytest_asyncio") + async_mark = pytest.mark.asyncio + except ImportError: + async_mark = pytest.mark.skip(reason="Missing dependency pytest-asyncio") + + return async_mark diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 547fe748ae941..a715094e65e98 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -84,15 +84,13 @@ def validate_args(fname, args, max_fname_arg_count, compat_args): The maximum number of arguments that the function `fname` can accept, excluding those in `args`. Used for displaying appropriate error messages. Must be non-negative. - compat_args : Dict - An ordered dictionary of keys and their associated default values. + compat_args : dict + A dictionary of keys and their associated default values. In order to accommodate buggy behaviour in some versions of `numpy`, where a signature displayed keyword arguments but then passed those arguments **positionally** internally when calling downstream - implementations, an ordered dictionary ensures that the original - order of the keyword arguments is enforced. Note that if there is - only one key, a generic dict can be passed in as well. - + implementations, a dict ensures that the original + order of the keyword arguments is enforced. Raises ------ TypeError @@ -120,9 +118,7 @@ def _check_for_invalid_keys(fname, kwargs, compat_args): if diff: bad_arg = list(diff)[0] - raise TypeError( - (f"{fname}() got an unexpected " f"keyword argument '{bad_arg}'") - ) + raise TypeError(f"{fname}() got an unexpected keyword argument '{bad_arg}'") def validate_kwargs(fname, kwargs, compat_args): @@ -170,10 +166,9 @@ def validate_args_and_kwargs(fname, args, kwargs, max_fname_arg_count, compat_ar The minimum number of arguments that the function `fname` requires, excluding those in `args`. Used for displaying appropriate error messages. Must be non-negative. - compat_args: OrderedDict - A ordered dictionary of keys that `kwargs` is allowed to - have and their associated default values. Note that if there - is only one key, a generic dict can be passed in as well. + compat_args: dict + A dictionary of keys that `kwargs` is allowed to + have and their associated default values. 
Raises ------ @@ -202,7 +197,7 @@ def validate_args_and_kwargs(fname, args, kwargs, max_fname_arg_count, compat_ar for key in args_dict: if key in kwargs: raise TypeError( - f"{fname}() got multiple values for keyword " f"argument '{key}'" + f"{fname}() got multiple values for keyword argument '{key}'" ) kwargs.update(args_dict) @@ -302,7 +297,7 @@ def validate_axis_style_args(data, args, kwargs, arg_name, method_name): "\n\t'.{method_name}(index=a, columns=b)'.\nUse named " "arguments to remove any ambiguity. In the future, using " "positional arguments for 'index' or 'columns' will raise " - " a 'TypeError'." + "a 'TypeError'." ) warnings.warn(msg.format(method_name=method_name), FutureWarning, stacklevel=4) out[data._AXIS_NAMES[0]] = args[0] diff --git a/pandas/util/testing.py b/pandas/util/testing.py index c31cddc102afb..af9fe4846b27d 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1,2745 +1,12 @@ -import bz2 -from collections import Counter -from contextlib import contextmanager -from datetime import datetime -from functools import wraps -import gzip -import os -from shutil import rmtree -import string -import tempfile -from typing import List, Optional, Union, cast import warnings -import zipfile -import numpy as np -from numpy.random import rand, randn +from pandas._testing import * # noqa -from pandas._config.localization import ( # noqa:F401 - can_set_locale, - get_locales, - set_locale, +warnings.warn( + ( + "pandas.util.testing is deprecated. Use the functions in the " + "public API at pandas.testing instead." + ), + FutureWarning, + stacklevel=2, ) - -import pandas._libs.testing as _testing -from pandas._typing import FrameOrSeries -from pandas.compat import _get_lzma_file, _import_lzma - -from pandas.core.dtypes.common import ( - is_bool, - is_categorical_dtype, - is_datetime64_dtype, - is_datetime64tz_dtype, - is_extension_array_dtype, - is_interval_dtype, - is_list_like, - is_number, - is_period_dtype, - is_sequence, - is_timedelta64_dtype, - needs_i8_conversion, -) -from pandas.core.dtypes.missing import array_equivalent - -import pandas as pd -from pandas import ( - Categorical, - CategoricalIndex, - DataFrame, - DatetimeIndex, - Index, - IntervalIndex, - MultiIndex, - RangeIndex, - Series, - bdate_range, -) -from pandas.core.algorithms import take_1d -from pandas.core.arrays import ( - DatetimeArray, - ExtensionArray, - IntervalArray, - PeriodArray, - TimedeltaArray, - period_array, -) - -from pandas.io.common import urlopen -from pandas.io.formats.printing import pprint_thing - -lzma = _import_lzma() - -N = 30 -K = 4 -_RAISE_NETWORK_ERROR_DEFAULT = False - -# set testing_mode -_testing_mode_warnings = (DeprecationWarning, ResourceWarning) - - -def set_testing_mode(): - # set the testing mode filters - testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") - if "deprecate" in testing_mode: - warnings.simplefilter("always", _testing_mode_warnings) - - -def reset_testing_mode(): - # reset the testing mode filters - testing_mode = os.environ.get("PANDAS_TESTING_MODE", "None") - if "deprecate" in testing_mode: - warnings.simplefilter("ignore", _testing_mode_warnings) - - -set_testing_mode() - - -def reset_display_options(): - """ - Reset the display options for printing and representing objects. - """ - pd.reset_option("^display.", silent=True) - - -def round_trip_pickle(obj: FrameOrSeries, path: Optional[str] = None) -> FrameOrSeries: - """ - Pickle an object and then read it again. 
- - Parameters - ---------- - obj : pandas object - The object to pickle and then re-read. - path : str, default None - The path where the pickled object is written and then read. - - Returns - ------- - pandas object - The original object that was pickled and then re-read. - """ - if path is None: - path = f"__{rands(10)}__.pickle" - with ensure_clean(path) as path: - pd.to_pickle(obj, path) - return pd.read_pickle(path) - - -def round_trip_pathlib(writer, reader, path: Optional[str] = None): - """ - Write an object to file specified by a pathlib.Path and read it back - - Parameters - ---------- - writer : callable bound to pandas object - IO writing function (e.g. DataFrame.to_csv ) - reader : callable - IO reading function (e.g. pd.read_csv ) - path : str, default None - The path where the object is written and then read. - - Returns - ------- - pandas object - The original object that was serialized and then re-read. - """ - import pytest - - Path = pytest.importorskip("pathlib").Path - if path is None: - path = "___pathlib___" - with ensure_clean(path) as path: - writer(Path(path)) - obj = reader(Path(path)) - return obj - - -def round_trip_localpath(writer, reader, path: Optional[str] = None): - """ - Write an object to file specified by a py.path LocalPath and read it back. - - Parameters - ---------- - writer : callable bound to pandas object - IO writing function (e.g. DataFrame.to_csv ) - reader : callable - IO reading function (e.g. pd.read_csv ) - path : str, default None - The path where the object is written and then read. - - Returns - ------- - pandas object - The original object that was serialized and then re-read. - """ - import pytest - - LocalPath = pytest.importorskip("py.path").local - if path is None: - path = "___localpath___" - with ensure_clean(path) as path: - writer(LocalPath(path)) - obj = reader(LocalPath(path)) - return obj - - -@contextmanager -def decompress_file(path, compression): - """ - Open a compressed file and return a file object. - - Parameters - ---------- - path : str - The path where the file is read from. - - compression : {'gzip', 'bz2', 'zip', 'xz', None} - Name of the decompression to use - - Returns - ------- - file object - """ - if compression is None: - f = open(path, "rb") - elif compression == "gzip": - f = gzip.open(path, "rb") - elif compression == "bz2": - f = bz2.BZ2File(path, "rb") - elif compression == "xz": - f = _get_lzma_file(lzma)(path, "rb") - elif compression == "zip": - zip_file = zipfile.ZipFile(path) - zip_names = zip_file.namelist() - if len(zip_names) == 1: - f = zip_file.open(zip_names.pop()) - else: - raise ValueError(f"ZIP file {path} error. Only one file per ZIP.") - else: - raise ValueError(f"Unrecognized compression type: {compression}") - - try: - yield f - finally: - f.close() - if compression == "zip": - zip_file.close() - - -def write_to_compressed(compression, path, data, dest="test"): - """ - Write data to a compressed file. - - Parameters - ---------- - compression : {'gzip', 'bz2', 'zip', 'xz'} - The compression type to use. - path : str - The file path to write the data. - data : str - The data to write. - dest : str, default "test" - The destination file (for ZIP only) - - Raises - ------ - ValueError : An invalid compression value was passed in. 
- """ - if compression == "zip": - import zipfile - - compress_method = zipfile.ZipFile - elif compression == "gzip": - import gzip - - compress_method = gzip.GzipFile - elif compression == "bz2": - import bz2 - - compress_method = bz2.BZ2File - elif compression == "xz": - compress_method = _get_lzma_file(lzma) - else: - raise ValueError(f"Unrecognized compression type: {compression}") - - if compression == "zip": - mode = "w" - args = (dest, data) - method = "writestr" - else: - mode = "wb" - args = (data,) - method = "write" - - with compress_method(path, mode=mode) as f: - getattr(f, method)(*args) - - -def assert_almost_equal( - left, - right, - check_dtype: Union[bool, str] = "equiv", - check_less_precise: Union[bool, int] = False, - **kwargs, -): - """ - Check that the left and right objects are approximately equal. - - By approximately equal, we refer to objects that are numbers or that - contain numbers which may be equivalent to specific levels of precision. - - Parameters - ---------- - left : object - right : object - check_dtype : bool or {'equiv'}, default 'equiv' - Check dtype if both a and b are the same type. If 'equiv' is passed in, - then `RangeIndex` and `Int64Index` are also considered equivalent - when doing type checking. - check_less_precise : bool or int, default False - Specify comparison precision. 5 digits (False) or 3 digits (True) - after decimal points are compared. If int, then specify the number - of digits to compare. - - When comparing two numbers, if the first number has magnitude less - than 1e-5, we compare the two numbers directly and check whether - they are equivalent within the specified precision. Otherwise, we - compare the **ratio** of the second number to the first number and - check whether it is equivalent to 1 within the specified precision. - """ - if isinstance(left, pd.Index): - assert_index_equal( - left, - right, - check_exact=False, - exact=check_dtype, - check_less_precise=check_less_precise, - **kwargs, - ) - - elif isinstance(left, pd.Series): - assert_series_equal( - left, - right, - check_exact=False, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs, - ) - - elif isinstance(left, pd.DataFrame): - assert_frame_equal( - left, - right, - check_exact=False, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs, - ) - - else: - # Other sequences. - if check_dtype: - if is_number(left) and is_number(right): - # Do not compare numeric classes, like np.float64 and float. - pass - elif is_bool(left) and is_bool(right): - # Do not compare bool classes, like np.bool_ and bool. - pass - else: - if isinstance(left, np.ndarray) or isinstance(right, np.ndarray): - obj = "numpy array" - else: - obj = "Input" - assert_class_equal(left, right, obj=obj) - _testing.assert_almost_equal( - left, - right, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs, - ) - - -def _check_isinstance(left, right, cls): - """ - Helper method for our assert_* methods that ensures that - the two objects being compared have the right type before - proceeding with the comparison. - - Parameters - ---------- - left : The first object being compared. - right : The second object being compared. - cls : The class type to check against. - - Raises - ------ - AssertionError : Either `left` or `right` is not an instance of `cls`. 
- """ - cls_name = cls.__name__ - - if not isinstance(left, cls): - raise AssertionError( - f"{cls_name} Expected type {cls}, found {type(left)} instead" - ) - if not isinstance(right, cls): - raise AssertionError( - f"{cls_name} Expected type {cls}, found {type(right)} instead" - ) - - -def assert_dict_equal(left, right, compare_keys: bool = True): - - _check_isinstance(left, right, dict) - _testing.assert_dict_equal(left, right, compare_keys=compare_keys) - - -def randbool(size=(), p: float = 0.5): - return rand(*size) <= p - - -RANDS_CHARS = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1)) -RANDU_CHARS = np.array( - list("".join(map(chr, range(1488, 1488 + 26))) + string.digits), - dtype=(np.unicode_, 1), -) - - -def rands_array(nchars, size, dtype="O"): - """ - Generate an array of byte strings. - """ - retval = ( - np.random.choice(RANDS_CHARS, size=nchars * np.prod(size)) - .view((np.str_, nchars)) - .reshape(size) - ) - if dtype is None: - return retval - else: - return retval.astype(dtype) - - -def randu_array(nchars, size, dtype="O"): - """ - Generate an array of unicode strings. - """ - retval = ( - np.random.choice(RANDU_CHARS, size=nchars * np.prod(size)) - .view((np.unicode_, nchars)) - .reshape(size) - ) - if dtype is None: - return retval - else: - return retval.astype(dtype) - - -def rands(nchars): - """ - Generate one random byte string. - - See `rands_array` if you want to create an array of random strings. - - """ - return "".join(np.random.choice(RANDS_CHARS, nchars)) - - -def randu(nchars): - """ - Generate one random unicode string. - - See `randu_array` if you want to create an array of random unicode strings. - - """ - return "".join(np.random.choice(RANDU_CHARS, nchars)) - - -def close(fignum=None): - from matplotlib.pyplot import get_fignums, close as _close - - if fignum is None: - for fignum in get_fignums(): - _close(fignum) - else: - _close(fignum) - - -# ----------------------------------------------------------------------------- -# contextmanager to ensure the file cleanup - - -@contextmanager -def ensure_clean(filename=None, return_filelike=False): - """ - Gets a temporary path and agrees to remove on close. - - Parameters - ---------- - filename : str (optional) - if None, creates a temporary file which is then removed when out of - scope. if passed, creates temporary file with filename as ending. - return_filelike : bool (default False) - if True, returns a file-like which is *always* cleaned. Necessary for - savefig and other functions which want to append extensions. - """ - filename = filename or "" - fd = None - - if return_filelike: - f = tempfile.TemporaryFile(suffix=filename) - try: - yield f - finally: - f.close() - else: - # don't generate tempfile if using a path with directory specified - if len(os.path.dirname(filename)): - raise ValueError("Can't pass a qualified name to ensure_clean()") - - try: - fd, filename = tempfile.mkstemp(suffix=filename) - except UnicodeEncodeError: - import pytest - - pytest.skip("no unicode file names on this system") - - try: - yield filename - finally: - try: - os.close(fd) - except OSError: - print(f"Couldn't close file descriptor: {fd} (file: {filename})") - try: - if os.path.exists(filename): - os.remove(filename) - except OSError as e: - print(f"Exception on removing file: {e}") - - -@contextmanager -def ensure_clean_dir(): - """ - Get a temporary directory path and agrees to remove on close. 
- - Yields - ------ - Temporary directory path - """ - directory_name = tempfile.mkdtemp(suffix="") - try: - yield directory_name - finally: - try: - rmtree(directory_name) - except OSError: - pass - - -@contextmanager -def ensure_safe_environment_variables(): - """ - Get a context manager to safely set environment variables - - All changes will be undone on close, hence environment variables set - within this contextmanager will neither persist nor change global state. - """ - saved_environ = dict(os.environ) - try: - yield - finally: - os.environ.clear() - os.environ.update(saved_environ) - - -# ----------------------------------------------------------------------------- -# Comparators - - -def equalContents(arr1, arr2) -> bool: - """ - Checks if the set of unique elements of arr1 and arr2 are equivalent. - """ - return frozenset(arr1) == frozenset(arr2) - - -def assert_index_equal( - left: Index, - right: Index, - exact: Union[bool, str] = "equiv", - check_names: bool = True, - check_less_precise: Union[bool, int] = False, - check_exact: bool = True, - check_categorical: bool = True, - obj: str = "Index", -) -> None: - """ - Check that left and right Index are equal. - - Parameters - ---------- - left : Index - right : Index - exact : bool or {'equiv'}, default 'equiv' - Whether to check the Index class, dtype and inferred_type - are identical. If 'equiv', then RangeIndex can be substituted for - Int64Index as well. - check_names : bool, default True - Whether to check the names attribute. - check_less_precise : bool or int, default False - Specify comparison precision. Only used when check_exact is False. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare. - check_exact : bool, default True - Whether to compare number exactly. - check_categorical : bool, default True - Whether to compare internal Categorical exactly. - obj : str, default 'Index' - Specify object name being compared, internally used to show appropriate - assertion message. 
- """ - __tracebackhide__ = True - - def _check_types(l, r, obj="Index"): - if exact: - assert_class_equal(l, r, exact=exact, obj=obj) - - # Skip exact dtype checking when `check_categorical` is False - if check_categorical: - assert_attr_equal("dtype", l, r, obj=obj) - - # allow string-like to have different inferred_types - if l.inferred_type in ("string", "unicode"): - assert r.inferred_type in ("string", "unicode") - else: - assert_attr_equal("inferred_type", l, r, obj=obj) - - def _get_ilevel_values(index, level): - # accept level number only - unique = index.levels[level] - level_codes = index.codes[level] - filled = take_1d(unique.values, level_codes, fill_value=unique._na_value) - values = unique._shallow_copy(filled, name=index.names[level]) - return values - - # instance validation - _check_isinstance(left, right, Index) - - # class / dtype comparison - _check_types(left, right, obj=obj) - - # level comparison - if left.nlevels != right.nlevels: - msg1 = f"{obj} levels are different" - msg2 = f"{left.nlevels}, {left}" - msg3 = f"{right.nlevels}, {right}" - raise_assert_detail(obj, msg1, msg2, msg3) - - # length comparison - if len(left) != len(right): - msg1 = f"{obj} length are different" - msg2 = f"{len(left)}, {left}" - msg3 = f"{len(right)}, {right}" - raise_assert_detail(obj, msg1, msg2, msg3) - - # MultiIndex special comparison for little-friendly error messages - if left.nlevels > 1: - left = cast(MultiIndex, left) - right = cast(MultiIndex, right) - - for level in range(left.nlevels): - # cannot use get_level_values here because it can change dtype - llevel = _get_ilevel_values(left, level) - rlevel = _get_ilevel_values(right, level) - - lobj = f"MultiIndex level [{level}]" - assert_index_equal( - llevel, - rlevel, - exact=exact, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - obj=lobj, - ) - # get_level_values may change dtype - _check_types(left.levels[level], right.levels[level], obj=obj) - - # skip exact index checking when `check_categorical` is False - if check_exact and check_categorical: - if not left.equals(right): - diff = np.sum((left.values != right.values).astype(int)) * 100.0 / len(left) - msg = f"{obj} values are different ({np.round(diff, 5)} %)" - raise_assert_detail(obj, msg, left, right) - else: - _testing.assert_almost_equal( - left.values, - right.values, - check_less_precise=check_less_precise, - check_dtype=exact, - obj=obj, - lobj=left, - robj=right, - ) - - # metadata comparison - if check_names: - assert_attr_equal("names", left, right, obj=obj) - if isinstance(left, pd.PeriodIndex) or isinstance(right, pd.PeriodIndex): - assert_attr_equal("freq", left, right, obj=obj) - if isinstance(left, pd.IntervalIndex) or isinstance(right, pd.IntervalIndex): - assert_interval_array_equal(left.values, right.values) - - if check_categorical: - if is_categorical_dtype(left) or is_categorical_dtype(right): - assert_categorical_equal(left.values, right.values, obj=f"{obj} category") - - -def assert_class_equal(left, right, exact: Union[bool, str] = True, obj="Input"): - """ - Checks classes are equal. 
- """ - __tracebackhide__ = True - - def repr_class(x): - if isinstance(x, Index): - # return Index as it is to include values in the error message - return x - - try: - return type(x).__name__ - except AttributeError: - return repr(type(x)) - - if exact == "equiv": - if type(left) != type(right): - # allow equivalence of Int64Index/RangeIndex - types = {type(left).__name__, type(right).__name__} - if len(types - {"Int64Index", "RangeIndex"}): - msg = f"{obj} classes are not equivalent" - raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) - elif exact: - if type(left) != type(right): - msg = f"{obj} classes are different" - raise_assert_detail(obj, msg, repr_class(left), repr_class(right)) - - -def assert_attr_equal(attr, left, right, obj="Attributes"): - """checks attributes are equal. Both objects must have attribute. - - Parameters - ---------- - attr : str - Attribute name being compared. - left : object - right : object - obj : str, default 'Attributes' - Specify object name being compared, internally used to show appropriate - assertion message - """ - __tracebackhide__ = True - - left_attr = getattr(left, attr) - right_attr = getattr(right, attr) - - if left_attr is right_attr: - return True - elif ( - is_number(left_attr) - and np.isnan(left_attr) - and is_number(right_attr) - and np.isnan(right_attr) - ): - # np.nan - return True - - try: - result = left_attr == right_attr - except TypeError: - # datetimetz on rhs may raise TypeError - result = False - if not isinstance(result, bool): - result = result.all() - - if result: - return True - else: - msg = f'Attribute "{attr}" are different' - raise_assert_detail(obj, msg, left_attr, right_attr) - - -def assert_is_valid_plot_return_object(objs): - import matplotlib.pyplot as plt - - if isinstance(objs, (pd.Series, np.ndarray)): - for el in objs.ravel(): - msg = ( - "one of 'objs' is not a matplotlib Axes instance, " - f"type encountered {repr(type(el).__name__)}" - ) - assert isinstance(el, (plt.Axes, dict)), msg - else: - msg = ( - "objs is neither an ndarray of Artist instances nor a single " - "ArtistArtist instance, tuple, or dict, 'objs' is a " - f"{repr(type(objs).__name__)}" - ) - assert isinstance(objs, (plt.Artist, tuple, dict)), msg - - -def isiterable(obj): - return hasattr(obj, "__iter__") - - -def assert_is_sorted(seq): - """Assert that the sequence is sorted.""" - if isinstance(seq, (Index, Series)): - seq = seq.values - # sorting does not change precisions - assert_numpy_array_equal(seq, np.sort(np.array(seq))) - - -def assert_categorical_equal( - left, right, check_dtype=True, check_category_order=True, obj="Categorical" -): - """Test that Categoricals are equivalent. - - Parameters - ---------- - left : Categorical - right : Categorical - check_dtype : bool, default True - Check that integer dtype of the codes are the same - check_category_order : bool, default True - Whether the order of the categories should be compared, which - implies identical integer codes. If False, only the resulting - values are compared. The ordered attribute is - checked regardless. 
- obj : str, default 'Categorical' - Specify object name being compared, internally used to show appropriate - assertion message - """ - _check_isinstance(left, right, Categorical) - - if check_category_order: - assert_index_equal(left.categories, right.categories, obj=f"{obj}.categories") - assert_numpy_array_equal( - left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes", - ) - else: - assert_index_equal( - left.categories.sort_values(), - right.categories.sort_values(), - obj=f"{obj}.categories", - ) - assert_index_equal( - left.categories.take(left.codes), - right.categories.take(right.codes), - obj=f"{obj}.values", - ) - - assert_attr_equal("ordered", left, right, obj=obj) - - -def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray"): - """Test that two IntervalArrays are equivalent. - - Parameters - ---------- - left, right : IntervalArray - The IntervalArrays to compare. - exact : bool or {'equiv'}, default 'equiv' - Whether to check the Index class, dtype and inferred_type - are identical. If 'equiv', then RangeIndex can be substituted for - Int64Index as well. - obj : str, default 'IntervalArray' - Specify object name being compared, internally used to show appropriate - assertion message - """ - _check_isinstance(left, right, IntervalArray) - - assert_index_equal(left.left, right.left, exact=exact, obj=f"{obj}.left") - assert_index_equal(left.right, right.right, exact=exact, obj=f"{obj}.left") - assert_attr_equal("closed", left, right, obj=obj) - - -def assert_period_array_equal(left, right, obj="PeriodArray"): - _check_isinstance(left, right, PeriodArray) - - assert_numpy_array_equal(left._data, right._data, obj=f"{obj}.values") - assert_attr_equal("freq", left, right, obj=obj) - - -def assert_datetime_array_equal(left, right, obj="DatetimeArray"): - __tracebackhide__ = True - _check_isinstance(left, right, DatetimeArray) - - assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") - assert_attr_equal("freq", left, right, obj=obj) - assert_attr_equal("tz", left, right, obj=obj) - - -def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"): - __tracebackhide__ = True - _check_isinstance(left, right, TimedeltaArray) - assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") - assert_attr_equal("freq", left, right, obj=obj) - - -def raise_assert_detail(obj, message, left, right, diff=None): - __tracebackhide__ = True - - if isinstance(left, np.ndarray): - left = pprint_thing(left) - elif is_categorical_dtype(left): - left = repr(left) - - if isinstance(right, np.ndarray): - right = pprint_thing(right) - elif is_categorical_dtype(right): - right = repr(right) - - msg = f"""{obj} are different - -{message} -[left]: {left} -[right]: {right}""" - - if diff is not None: - msg += f"\n[diff]: {diff}" - - raise AssertionError(msg) - - -def assert_numpy_array_equal( - left, - right, - strict_nan=False, - check_dtype=True, - err_msg=None, - check_same=None, - obj="numpy array", -): - """ Checks that 'np.ndarray' is equivalent - - Parameters - ---------- - left : np.ndarray or iterable - right : np.ndarray or iterable - strict_nan : bool, default False - If True, consider NaN and None to be different. 
- check_dtype: bool, default True - check dtype if both a and b are np.ndarray - err_msg : str, default None - If provided, used as assertion message - check_same : None|'copy'|'same', default None - Ensure left and right refer/do not refer to the same memory area - obj : str, default 'numpy array' - Specify object name being compared, internally used to show appropriate - assertion message - """ - __tracebackhide__ = True - - # instance validation - # Show a detailed error message when classes are different - assert_class_equal(left, right, obj=obj) - # both classes must be an np.ndarray - _check_isinstance(left, right, np.ndarray) - - def _get_base(obj): - return obj.base if getattr(obj, "base", None) is not None else obj - - left_base = _get_base(left) - right_base = _get_base(right) - - if check_same == "same": - if left_base is not right_base: - raise AssertionError(f"{repr(left_base)} is not {repr(right_base)}") - elif check_same == "copy": - if left_base is right_base: - raise AssertionError(f"{repr(left_base)} is {repr(right_base)}") - - def _raise(left, right, err_msg): - if err_msg is None: - if left.shape != right.shape: - raise_assert_detail( - obj, f"{obj} shapes are different", left.shape, right.shape, - ) - - diff = 0 - for l, r in zip(left, right): - # count up differences - if not array_equivalent(l, r, strict_nan=strict_nan): - diff += 1 - - diff = diff * 100.0 / left.size - msg = f"{obj} values are different ({np.round(diff, 5)} %)" - raise_assert_detail(obj, msg, left, right) - - raise AssertionError(err_msg) - - # compare shape and values - if not array_equivalent(left, right, strict_nan=strict_nan): - _raise(left, right, err_msg) - - if check_dtype: - if isinstance(left, np.ndarray) and isinstance(right, np.ndarray): - assert_attr_equal("dtype", left, right, obj=obj) - - -def assert_extension_array_equal( - left, right, check_dtype=True, check_less_precise=False, check_exact=False -): - """Check that left and right ExtensionArrays are equal. - - Parameters - ---------- - left, right : ExtensionArray - The two arrays to compare - check_dtype : bool, default True - Whether to check if the ExtensionArray dtypes are identical. - check_less_precise : bool or int, default False - Specify comparison precision. Only used when check_exact is False. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare. - check_exact : bool, default False - Whether to compare number exactly. - - Notes - ----- - Missing values are checked separately from valid values. - A mask of missing values is computed for each and checked to match. - The remaining all-valid values are cast to object dtype and checked. 
- """ - assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" - assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" - if check_dtype: - assert_attr_equal("dtype", left, right, obj="ExtensionArray") - - if hasattr(left, "asi8") and type(right) == type(left): - # Avoid slow object-dtype comparisons - assert_numpy_array_equal(left.asi8, right.asi8) - return - - left_na = np.asarray(left.isna()) - right_na = np.asarray(right.isna()) - assert_numpy_array_equal(left_na, right_na, obj="ExtensionArray NA mask") - - left_valid = np.asarray(left[~left_na].astype(object)) - right_valid = np.asarray(right[~right_na].astype(object)) - if check_exact: - assert_numpy_array_equal(left_valid, right_valid, obj="ExtensionArray") - else: - _testing.assert_almost_equal( - left_valid, - right_valid, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - obj="ExtensionArray", - ) - - -# This could be refactored to use the NDFrame.equals method -def assert_series_equal( - left, - right, - check_dtype=True, - check_index_type="equiv", - check_series_type=True, - check_less_precise=False, - check_names=True, - check_exact=False, - check_datetimelike_compat=False, - check_categorical=True, - obj="Series", -): - """ - Check that left and right Series are equal. - - Parameters - ---------- - left : Series - right : Series - check_dtype : bool, default True - Whether to check the Series dtype is identical. - check_index_type : bool or {'equiv'}, default 'equiv' - Whether to check the Index class, dtype and inferred_type - are identical. - check_series_type : bool, default True - Whether to check the Series class is identical. - check_less_precise : bool or int, default False - Specify comparison precision. Only used when check_exact is False. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare. - - When comparing two numbers, if the first number has magnitude less - than 1e-5, we compare the two numbers directly and check whether - they are equivalent within the specified precision. Otherwise, we - compare the **ratio** of the second number to the first number and - check whether it is equivalent to 1 within the specified precision. - check_names : bool, default True - Whether to check the Series and Index names attribute. - check_exact : bool, default False - Whether to compare number exactly. - check_datetimelike_compat : bool, default False - Compare datetime-like which is comparable ignoring dtype. - check_categorical : bool, default True - Whether to compare internal Categorical exactly. - obj : str, default 'Series' - Specify object name being compared, internally used to show appropriate - assertion message. - """ - __tracebackhide__ = True - - # instance validation - _check_isinstance(left, right, Series) - - if check_series_type: - # ToDo: There are some tests using rhs is sparse - # lhs is dense. 
Should use assert_class_equal in future - assert isinstance(left, type(right)) - # assert_class_equal(left, right, obj=obj) - - # length comparison - if len(left) != len(right): - msg1 = f"{len(left)}, {left.index}" - msg2 = f"{len(right)}, {right.index}" - raise_assert_detail(obj, "Series length are different", msg1, msg2) - - # index comparison - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj=f"{obj}.index", - ) - - if check_dtype: - # We want to skip exact dtype checking when `check_categorical` - # is False. We'll still raise if only one is a `Categorical`, - # regardless of `check_categorical` - if ( - is_categorical_dtype(left) - and is_categorical_dtype(right) - and not check_categorical - ): - pass - else: - assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - - if check_exact: - assert_numpy_array_equal( - left._internal_get_values(), - right._internal_get_values(), - check_dtype=check_dtype, - obj=str(obj), - ) - elif check_datetimelike_compat: - # we want to check only if we have compat dtypes - # e.g. integer and M|m are NOT compat, but we can simply check - # the values in that case - if needs_i8_conversion(left) or needs_i8_conversion(right): - - # datetimelike may have different objects (e.g. datetime.datetime - # vs Timestamp) but will compare equal - if not Index(left.values).equals(Index(right.values)): - msg = ( - f"[datetimelike_compat=True] {left.values} " - f"is not equal to {right.values}." - ) - raise AssertionError(msg) - else: - assert_numpy_array_equal( - left._internal_get_values(), - right._internal_get_values(), - check_dtype=check_dtype, - ) - elif is_interval_dtype(left) or is_interval_dtype(right): - assert_interval_array_equal(left.array, right.array) - elif is_extension_array_dtype(left.dtype) and is_datetime64tz_dtype(left.dtype): - # .values is an ndarray, but ._values is the ExtensionArray. - # TODO: Use .array - assert is_extension_array_dtype(right.dtype) - assert_extension_array_equal(left._values, right._values) - elif ( - is_extension_array_dtype(left) - and not is_categorical_dtype(left) - and is_extension_array_dtype(right) - and not is_categorical_dtype(right) - ): - assert_extension_array_equal(left.array, right.array) - else: - _testing.assert_almost_equal( - left._internal_get_values(), - right._internal_get_values(), - check_less_precise=check_less_precise, - check_dtype=check_dtype, - obj=str(obj), - ) - - # metadata comparison - if check_names: - assert_attr_equal("name", left, right, obj=obj) - - if check_categorical: - if is_categorical_dtype(left) or is_categorical_dtype(right): - assert_categorical_equal(left.values, right.values, obj=f"{obj} category") - - -# This could be refactored to use the NDFrame.equals method -def assert_frame_equal( - left, - right, - check_dtype=True, - check_index_type="equiv", - check_column_type="equiv", - check_frame_type=True, - check_less_precise=False, - check_names=True, - by_blocks=False, - check_exact=False, - check_datetimelike_compat=False, - check_categorical=True, - check_like=False, - obj="DataFrame", -): - """ - Check that left and right DataFrame are equal. - - This function is intended to compare two DataFrames and output any - differences. Is is mostly intended for use in unit tests. - Additional parameters allow varying the strictness of the - equality checks performed. 
- - Parameters - ---------- - left : DataFrame - First DataFrame to compare. - right : DataFrame - Second DataFrame to compare. - check_dtype : bool, default True - Whether to check the DataFrame dtype is identical. - check_index_type : bool or {'equiv'}, default 'equiv' - Whether to check the Index class, dtype and inferred_type - are identical. - check_column_type : bool or {'equiv'}, default 'equiv' - Whether to check the columns class, dtype and inferred_type - are identical. Is passed as the ``exact`` argument of - :func:`assert_index_equal`. - check_frame_type : bool, default True - Whether to check the DataFrame class is identical. - check_less_precise : bool or int, default False - Specify comparison precision. Only used when check_exact is False. - 5 digits (False) or 3 digits (True) after decimal points are compared. - If int, then specify the digits to compare. - - When comparing two numbers, if the first number has magnitude less - than 1e-5, we compare the two numbers directly and check whether - they are equivalent within the specified precision. Otherwise, we - compare the **ratio** of the second number to the first number and - check whether it is equivalent to 1 within the specified precision. - check_names : bool, default True - Whether to check that the `names` attribute for both the `index` - and `column` attributes of the DataFrame is identical. - by_blocks : bool, default False - Specify how to compare internal data. If False, compare by columns. - If True, compare by blocks. - check_exact : bool, default False - Whether to compare number exactly. - check_datetimelike_compat : bool, default False - Compare datetime-like which is comparable ignoring dtype. - check_categorical : bool, default True - Whether to compare internal Categorical exactly. - check_like : bool, default False - If True, ignore the order of index & columns. - Note: index labels must match their respective rows - (same as in columns) - same labels must be with the same data. - obj : str, default 'DataFrame' - Specify object name being compared, internally used to show appropriate - assertion message. - - See Also - -------- - assert_series_equal : Equivalent method for asserting Series equality. - DataFrame.equals : Check DataFrame equality. - - Examples - -------- - This example shows comparing two DataFrames that are equal - but with columns of differing dtypes. - - >>> from pandas.util.testing import assert_frame_equal - >>> df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) - >>> df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]}) - - df1 equals itself. - - >>> assert_frame_equal(df1, df1) - - df1 differs from df2 as column 'b' is of a different type. - - >>> assert_frame_equal(df1, df2) - Traceback (most recent call last): - ... - AssertionError: Attributes of DataFrame.iloc[:, 1] are different - - Attribute "dtype" are different - [left]: int64 - [right]: float64 - - Ignore differing dtypes in columns with check_dtype. 
- - >>> assert_frame_equal(df1, df2, check_dtype=False) - """ - __tracebackhide__ = True - - # instance validation - _check_isinstance(left, right, DataFrame) - - if check_frame_type: - assert isinstance(left, type(right)) - # assert_class_equal(left, right, obj=obj) - - # shape comparison - if left.shape != right.shape: - raise_assert_detail( - obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}", - ) - - if check_like: - left, right = left.reindex_like(right), right - - # index comparison - assert_index_equal( - left.index, - right.index, - exact=check_index_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj=f"{obj}.index", - ) - - # column comparison - assert_index_equal( - left.columns, - right.columns, - exact=check_column_type, - check_names=check_names, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_categorical=check_categorical, - obj=f"{obj}.columns", - ) - - # compare by blocks - if by_blocks: - rblocks = right._to_dict_of_blocks() - lblocks = left._to_dict_of_blocks() - for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))): - assert dtype in lblocks - assert dtype in rblocks - assert_frame_equal( - lblocks[dtype], rblocks[dtype], check_dtype=check_dtype, obj=obj - ) - - # compare by columns - else: - for i, col in enumerate(left.columns): - assert col in right - lcol = left.iloc[:, i] - rcol = right.iloc[:, i] - assert_series_equal( - lcol, - rcol, - check_dtype=check_dtype, - check_index_type=check_index_type, - check_less_precise=check_less_precise, - check_exact=check_exact, - check_names=check_names, - check_datetimelike_compat=check_datetimelike_compat, - check_categorical=check_categorical, - obj=f"{obj}.iloc[:, {i}]", - ) - - -def assert_equal(left, right, **kwargs): - """ - Wrapper for tm.assert_*_equal to dispatch to the appropriate test function. - - Parameters - ---------- - left : Index, Series, DataFrame, ExtensionArray, or np.ndarray - right : Index, Series, DataFrame, ExtensionArray, or np.ndarray - **kwargs - """ - __tracebackhide__ = True - - if isinstance(left, pd.Index): - assert_index_equal(left, right, **kwargs) - elif isinstance(left, pd.Series): - assert_series_equal(left, right, **kwargs) - elif isinstance(left, pd.DataFrame): - assert_frame_equal(left, right, **kwargs) - elif isinstance(left, IntervalArray): - assert_interval_array_equal(left, right, **kwargs) - elif isinstance(left, PeriodArray): - assert_period_array_equal(left, right, **kwargs) - elif isinstance(left, DatetimeArray): - assert_datetime_array_equal(left, right, **kwargs) - elif isinstance(left, TimedeltaArray): - assert_timedelta_array_equal(left, right, **kwargs) - elif isinstance(left, ExtensionArray): - assert_extension_array_equal(left, right, **kwargs) - elif isinstance(left, np.ndarray): - assert_numpy_array_equal(left, right, **kwargs) - elif isinstance(left, str): - assert kwargs == {} - return left == right - else: - raise NotImplementedError(type(left)) - - -def box_expected(expected, box_cls, transpose=True): - """ - Helper function to wrap the expected output of a test in a given box_class. 
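The assert_equal wrapper above simply dispatches on the type of its first argument, so one entry point covers every container the module knows how to compare. A short sketch, with illustrative objects and the same import path as above:

import numpy as np
import pandas as pd
import pandas.util.testing as tm

# Each pair is routed to the matching assert_*_equal helper by type.
tm.assert_equal(pd.Index([1, 2, 3]), pd.Index([1, 2, 3]))      # assert_index_equal
tm.assert_equal(pd.Series([1.0, 2.0]), pd.Series([1.0, 2.0]))  # assert_series_equal
tm.assert_equal(pd.DataFrame({"a": [1, 2]}),
                pd.DataFrame({"a": [1, 2]}))                   # assert_frame_equal
tm.assert_equal(np.array([1, 2]), np.array([1, 2]))            # assert_numpy_array_equal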
- - Parameters - ---------- - expected : np.ndarray, Index, Series - box_cls : {Index, Series, DataFrame} - - Returns - ------- - subclass of box_cls - """ - if box_cls is pd.Index: - expected = pd.Index(expected) - elif box_cls is pd.Series: - expected = pd.Series(expected) - elif box_cls is pd.DataFrame: - expected = pd.Series(expected).to_frame() - if transpose: - # for vector operations, we we need a DataFrame to be a single-row, - # not a single-column, in order to operate against non-DataFrame - # vectors of the same length. - expected = expected.T - elif box_cls is PeriodArray: - # the PeriodArray constructor is not as flexible as period_array - expected = period_array(expected) - elif box_cls is DatetimeArray: - expected = DatetimeArray(expected) - elif box_cls is TimedeltaArray: - expected = TimedeltaArray(expected) - elif box_cls is np.ndarray: - expected = np.array(expected) - elif box_cls is to_array: - expected = to_array(expected) - else: - raise NotImplementedError(box_cls) - return expected - - -def to_array(obj): - # temporary implementation until we get pd.array in place - if is_period_dtype(obj): - return period_array(obj) - elif is_datetime64_dtype(obj) or is_datetime64tz_dtype(obj): - return DatetimeArray._from_sequence(obj) - elif is_timedelta64_dtype(obj): - return TimedeltaArray._from_sequence(obj) - else: - return np.array(obj) - - -# ----------------------------------------------------------------------------- -# Sparse - - -def assert_sp_array_equal( - left, - right, - check_dtype=True, - check_kind=True, - check_fill_value=True, - consolidate_block_indices=False, -): - """Check that the left and right SparseArray are equal. - - Parameters - ---------- - left : SparseArray - right : SparseArray - check_dtype : bool, default True - Whether to check the data dtype is identical. - check_kind : bool, default True - Whether to just the kind of the sparse index for each column. - check_fill_value : bool, default True - Whether to check that left.fill_value matches right.fill_value - consolidate_block_indices : bool, default False - Whether to consolidate contiguous blocks for sparse arrays with - a BlockIndex. Some operations, e.g. concat, will end up with - block indices that could be consolidated. Setting this to true will - create a new BlockIndex for that array, with consolidated - block indices. - """ - - _check_isinstance(left, right, pd.SparseArray) - - assert_numpy_array_equal(left.sp_values, right.sp_values, check_dtype=check_dtype) - - # SparseIndex comparison - assert isinstance(left.sp_index, pd._libs.sparse.SparseIndex) - assert isinstance(right.sp_index, pd._libs.sparse.SparseIndex) - - if not check_kind: - left_index = left.sp_index.to_block_index() - right_index = right.sp_index.to_block_index() - else: - left_index = left.sp_index - right_index = right.sp_index - - if consolidate_block_indices and left.kind == "block": - # we'll probably remove this hack... 
- left_index = left_index.to_int_index().to_block_index() - right_index = right_index.to_int_index().to_block_index() - - if not left_index.equals(right_index): - raise_assert_detail( - "SparseArray.index", "index are not equal", left_index, right_index - ) - else: - # Just ensure a - pass - - if check_fill_value: - assert_attr_equal("fill_value", left, right) - if check_dtype: - assert_attr_equal("dtype", left, right) - assert_numpy_array_equal(left.to_dense(), right.to_dense(), check_dtype=check_dtype) - - -# ----------------------------------------------------------------------------- -# Others - - -def assert_contains_all(iterable, dic): - for k in iterable: - assert k in dic, f"Did not contain item: {repr(k)}" - - -def assert_copy(iter1, iter2, **eql_kwargs): - """ - iter1, iter2: iterables that produce elements - comparable with assert_almost_equal - - Checks that the elements are equal, but not - the same object. (Does not check that items - in sequences are also not the same object) - """ - for elem1, elem2 in zip(iter1, iter2): - assert_almost_equal(elem1, elem2, **eql_kwargs) - msg = ( - f"Expected object {repr(type(elem1))} and object {repr(type(elem2))} to be " - "different objects, but they were the same object." - ) - assert elem1 is not elem2, msg - - -def getCols(k): - return string.ascii_uppercase[:k] - - -# make index -def makeStringIndex(k=10, name=None): - return Index(rands_array(nchars=10, size=k), name=name) - - -def makeUnicodeIndex(k=10, name=None): - return Index(randu_array(nchars=10, size=k), name=name) - - -def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): - """ make a length k index or n categories """ - x = rands_array(nchars=4, size=n) - return CategoricalIndex( - Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs - ) - - -def makeIntervalIndex(k=10, name=None, **kwargs): - """ make a length k IntervalIndex """ - x = np.linspace(0, 100, num=(k + 1)) - return IntervalIndex.from_breaks(x, name=name, **kwargs) - - -def makeBoolIndex(k=10, name=None): - if k == 1: - return Index([True], name=name) - elif k == 2: - return Index([False, True], name=name) - return Index([False, True] + [False] * (k - 2), name=name) - - -def makeIntIndex(k=10, name=None): - return Index(list(range(k)), name=name) - - -def makeUIntIndex(k=10, name=None): - return Index([2 ** 63 + i for i in range(k)], name=name) - - -def makeRangeIndex(k=10, name=None, **kwargs): - return RangeIndex(0, k, 1, name=name, **kwargs) - - -def makeFloatIndex(k=10, name=None): - values = sorted(np.random.random_sample(k)) - np.random.random_sample(1) - return Index(values * (10 ** np.random.randint(0, 9)), name=name) - - -def makeDateIndex(k=10, freq="B", name=None, **kwargs): - dt = datetime(2000, 1, 1) - dr = bdate_range(dt, periods=k, freq=freq, name=name) - return DatetimeIndex(dr, name=name, **kwargs) - - -def makeTimedeltaIndex(k=10, freq="D", name=None, **kwargs): - return pd.timedelta_range(start="1 day", periods=k, freq=freq, name=name, **kwargs) - - -def makePeriodIndex(k=10, name=None, **kwargs): - dt = datetime(2000, 1, 1) - dr = pd.period_range(start=dt, periods=k, freq="B", name=name, **kwargs) - return dr - - -def makeMultiIndex(k=10, names=None, **kwargs): - return MultiIndex.from_product((("foo", "bar"), (1, 2)), names=names, **kwargs) - - -_names = [ - "Alice", - "Bob", - "Charlie", - "Dan", - "Edith", - "Frank", - "George", - "Hannah", - "Ingrid", - "Jerry", - "Kevin", - "Laura", - "Michael", - "Norbert", - "Oliver", - "Patricia", - "Quinn", - "Ray", - 
"Sarah", - "Tim", - "Ursula", - "Victor", - "Wendy", - "Xavier", - "Yvonne", - "Zelda", -] - - -def _make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None): - """ - Make a DataFrame with a DatetimeIndex - - Parameters - ---------- - start : str or Timestamp, default "2000-01-01" - The start of the index. Passed to date_range with `freq`. - end : str or Timestamp, default "2000-12-31" - The end of the index. Passed to date_range with `freq`. - freq : str or Freq - The frequency to use for the DatetimeIndex - seed : int, optional - The random state seed. - - * name : object dtype with string names - * id : int dtype with - * x, y : float dtype - - Examples - -------- - >>> _make_timeseries() - id name x y - timestamp - 2000-01-01 982 Frank 0.031261 0.986727 - 2000-01-02 1025 Edith -0.086358 -0.032920 - 2000-01-03 982 Edith 0.473177 0.298654 - 2000-01-04 1009 Sarah 0.534344 -0.750377 - 2000-01-05 963 Zelda -0.271573 0.054424 - ... ... ... ... ... - 2000-12-27 980 Ingrid -0.132333 -0.422195 - 2000-12-28 972 Frank -0.376007 -0.298687 - 2000-12-29 1009 Ursula -0.865047 -0.503133 - 2000-12-30 1000 Hannah -0.063757 -0.507336 - 2000-12-31 972 Tim -0.869120 0.531685 - """ - index = pd.date_range(start=start, end=end, freq=freq, name="timestamp") - n = len(index) - state = np.random.RandomState(seed) - columns = { - "name": state.choice(_names, size=n), - "id": state.poisson(1000, size=n), - "x": state.rand(n) * 2 - 1, - "y": state.rand(n) * 2 - 1, - } - df = pd.DataFrame(columns, index=index, columns=sorted(columns)) - if df.index[-1] == end: - df = df.iloc[:-1] - return df - - -def all_index_generator(k=10): - """Generator which can be iterated over to get instances of all the various - index classes. - - Parameters - ---------- - k: length of each of the index instances - """ - all_make_index_funcs = [ - makeIntIndex, - makeFloatIndex, - makeStringIndex, - makeUnicodeIndex, - makeDateIndex, - makePeriodIndex, - makeTimedeltaIndex, - makeBoolIndex, - makeRangeIndex, - makeIntervalIndex, - makeCategoricalIndex, - ] - for make_index_func in all_make_index_funcs: - yield make_index_func(k=k) - - -def index_subclass_makers_generator(): - make_index_funcs = [ - makeDateIndex, - makePeriodIndex, - makeTimedeltaIndex, - makeRangeIndex, - makeIntervalIndex, - makeCategoricalIndex, - makeMultiIndex, - ] - for make_index_func in make_index_funcs: - yield make_index_func - - -def all_timeseries_index_generator(k=10): - """Generator which can be iterated over to get instances of all the classes - which represent time-series. 
- - Parameters - ---------- - k: length of each of the index instances - """ - make_index_funcs = [makeDateIndex, makePeriodIndex, makeTimedeltaIndex] - for make_index_func in make_index_funcs: - yield make_index_func(k=k) - - -# make series -def makeFloatSeries(name=None): - index = makeStringIndex(N) - return Series(randn(N), index=index, name=name) - - -def makeStringSeries(name=None): - index = makeStringIndex(N) - return Series(randn(N), index=index, name=name) - - -def makeObjectSeries(name=None): - data = makeStringIndex(N) - data = Index(data, dtype=object) - index = makeStringIndex(N) - return Series(data, index=index, name=name) - - -def getSeriesData(): - index = makeStringIndex(N) - return {c: Series(randn(N), index=index) for c in getCols(K)} - - -def makeTimeSeries(nper=None, freq="B", name=None): - if nper is None: - nper = N - return Series(randn(nper), index=makeDateIndex(nper, freq=freq), name=name) - - -def makePeriodSeries(nper=None, name=None): - if nper is None: - nper = N - return Series(randn(nper), index=makePeriodIndex(nper), name=name) - - -def getTimeSeriesData(nper=None, freq="B"): - return {c: makeTimeSeries(nper, freq) for c in getCols(K)} - - -def getPeriodData(nper=None): - return {c: makePeriodSeries(nper) for c in getCols(K)} - - -# make frame -def makeTimeDataFrame(nper=None, freq="B"): - data = getTimeSeriesData(nper, freq) - return DataFrame(data) - - -def makeDataFrame(): - data = getSeriesData() - return DataFrame(data) - - -def getMixedTypeDict(): - index = Index(["a", "b", "c", "d", "e"]) - - data = { - "A": [0.0, 1.0, 2.0, 3.0, 4.0], - "B": [0.0, 1.0, 0.0, 1.0, 0.0], - "C": ["foo1", "foo2", "foo3", "foo4", "foo5"], - "D": bdate_range("1/1/2009", periods=5), - } - - return index, data - - -def makeMixedDataFrame(): - return DataFrame(getMixedTypeDict()[1]) - - -def makePeriodFrame(nper=None): - data = getPeriodData(nper) - return DataFrame(data) - - -def makeCustomIndex( - nentries, nlevels, prefix="#", names=False, ndupe_l=None, idx_type=None -): - """Create an index/multindex with given dimensions, levels, names, etc' - - nentries - number of entries in index - nlevels - number of levels (> 1 produces multindex) - prefix - a string prefix for labels - names - (Optional), bool or list of strings. if True will use default - names, if false will use no names, if a list is given, the name of - each level in the index will be taken from the list. - ndupe_l - (Optional), list of ints, the number of rows for which the - label will repeated at the corresponding level, you can specify just - the first few, the rest will use the default ndupe_l of 1. - len(ndupe_l) <= nlevels. - idx_type - "i"/"f"/"s"/"u"/"dt"/"p"/"td". - If idx_type is not None, `idx_nlevels` must be 1. - "i"/"f" creates an integer/float index, - "s"/"u" creates a string/unicode index - "dt" create a datetime index. - "td" create a datetime index. - - if unspecified, string labels will be generated. 
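To make the ndupe_l and names semantics described above concrete, a small illustrative call (the numbers are arbitrary):

import pandas.util.testing as tm

# 6 entries over 2 levels: the first level repeats each label 3 times, the
# second keeps the default multiplicity of 1; names=True yields '#0', '#1'.
mi = tm.makeCustomIndex(nentries=6, nlevels=2, ndupe_l=[3], names=True)
assert list(mi.names) == ["#0", "#1"]
assert len(mi) == 6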
- """ - - if ndupe_l is None: - ndupe_l = [1] * nlevels - assert is_sequence(ndupe_l) and len(ndupe_l) <= nlevels - assert names is None or names is False or names is True or len(names) is nlevels - assert idx_type is None or ( - idx_type in ("i", "f", "s", "u", "dt", "p", "td") and nlevels == 1 - ) - - if names is True: - # build default names - names = [prefix + str(i) for i in range(nlevels)] - if names is False: - # pass None to index constructor for no name - names = None - - # make singleton case uniform - if isinstance(names, str) and nlevels == 1: - names = [names] - - # specific 1D index type requested? - idx_func = dict( - i=makeIntIndex, - f=makeFloatIndex, - s=makeStringIndex, - u=makeUnicodeIndex, - dt=makeDateIndex, - td=makeTimedeltaIndex, - p=makePeriodIndex, - ).get(idx_type) - if idx_func: - idx = idx_func(nentries) - # but we need to fill in the name - if names: - idx.name = names[0] - return idx - elif idx_type is not None: - raise ValueError( - f"{repr(idx_type)} is not a legal value for `idx_type`, " - "use 'i'/'f'/'s'/'u'/'dt'/'p'/'td'." - ) - - if len(ndupe_l) < nlevels: - ndupe_l.extend([1] * (nlevels - len(ndupe_l))) - assert len(ndupe_l) == nlevels - - assert all(x > 0 for x in ndupe_l) - - tuples = [] - for i in range(nlevels): - - def keyfunc(x): - import re - - numeric_tuple = re.sub(r"[^\d_]_?", "", x).split("_") - return [int(num) for num in numeric_tuple] - - # build a list of lists to create the index from - div_factor = nentries // ndupe_l[i] + 1 - cnt = Counter() - for j in range(div_factor): - label = f"{prefix}_l{i}_g{j}" - cnt[label] = ndupe_l[i] - # cute Counter trick - result = sorted(cnt.elements(), key=keyfunc)[:nentries] - tuples.append(result) - - tuples = list(zip(*tuples)) - - # convert tuples to index - if nentries == 1: - # we have a single level of tuples, i.e. a regular Index - index = Index(tuples[0], name=names[0]) - elif nlevels == 1: - name = None if names is None else names[0] - index = Index((x[0] for x in tuples), name=name) - else: - index = MultiIndex.from_tuples(tuples, names=names) - return index - - -def makeCustomDataframe( - nrows, - ncols, - c_idx_names=True, - r_idx_names=True, - c_idx_nlevels=1, - r_idx_nlevels=1, - data_gen_f=None, - c_ndupe_l=None, - r_ndupe_l=None, - dtype=None, - c_idx_type=None, - r_idx_type=None, -): - """ - nrows, ncols - number of data rows/cols - c_idx_names, idx_names - False/True/list of strings, yields No names , - default names or uses the provided names for the levels of the - corresponding index. You can provide a single string when - c_idx_nlevels ==1. - c_idx_nlevels - number of levels in columns index. > 1 will yield MultiIndex - r_idx_nlevels - number of levels in rows index. > 1 will yield MultiIndex - data_gen_f - a function f(row,col) which return the data value - at that position, the default generator used yields values of the form - "RxCy" based on position. - c_ndupe_l, r_ndupe_l - list of integers, determines the number - of duplicates for each label at a given level of the corresponding - index. The default `None` value produces a multiplicity of 1 across - all levels, i.e. a unique index. Will accept a partial list of length - N < idx_nlevels, for just the first N levels. If ndupe doesn't divide - nrows/ncol, the last label might have lower multiplicity. - dtype - passed to the DataFrame constructor as is, in case you wish to - have more control in conjunction with a custom `data_gen_f` - r_idx_type, c_idx_type - "i"/"f"/"s"/"u"/"dt"/"td". 
- If idx_type is not None, `idx_nlevels` must be 1. - "i"/"f" creates an integer/float index, - "s"/"u" creates a string/unicode index - "dt" create a datetime index. - "td" create a timedelta index. - - if unspecified, string labels will be generated. - - Examples: - - # 5 row, 3 columns, default names on both, single index on both axis - >> makeCustomDataframe(5,3) - - # make the data a random int between 1 and 100 - >> mkdf(5,3,data_gen_f=lambda r,c:randint(1,100)) - - # 2-level multiindex on rows with each label duplicated - # twice on first level, default names on both axis, single - # index on both axis - >> a=makeCustomDataframe(5,3,r_idx_nlevels=2,r_ndupe_l=[2]) - - # DatetimeIndex on row, index with unicode labels on columns - # no names on either axis - >> a=makeCustomDataframe(5,3,c_idx_names=False,r_idx_names=False, - r_idx_type="dt",c_idx_type="u") - - # 4-level multindex on rows with names provided, 2-level multindex - # on columns with default labels and default names. - >> a=makeCustomDataframe(5,3,r_idx_nlevels=4, - r_idx_names=["FEE","FI","FO","FAM"], - c_idx_nlevels=2) - - >> a=mkdf(5,3,r_idx_nlevels=2,c_idx_nlevels=4) - """ - - assert c_idx_nlevels > 0 - assert r_idx_nlevels > 0 - assert r_idx_type is None or ( - r_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and r_idx_nlevels == 1 - ) - assert c_idx_type is None or ( - c_idx_type in ("i", "f", "s", "u", "dt", "p", "td") and c_idx_nlevels == 1 - ) - - columns = makeCustomIndex( - ncols, - nlevels=c_idx_nlevels, - prefix="C", - names=c_idx_names, - ndupe_l=c_ndupe_l, - idx_type=c_idx_type, - ) - index = makeCustomIndex( - nrows, - nlevels=r_idx_nlevels, - prefix="R", - names=r_idx_names, - ndupe_l=r_ndupe_l, - idx_type=r_idx_type, - ) - - # by default, generate data based on location - if data_gen_f is None: - data_gen_f = lambda r, c: f"R{r}C{c}" - - data = [[data_gen_f(r, c) for c in range(ncols)] for r in range(nrows)] - - return DataFrame(data, index, columns, dtype=dtype) - - -def _create_missing_idx(nrows, ncols, density, random_state=None): - if random_state is None: - random_state = np.random - else: - random_state = np.random.RandomState(random_state) - - # below is cribbed from scipy.sparse - size = int(np.round((1 - density) * nrows * ncols)) - # generate a few more to ensure unique values - min_rows = 5 - fac = 1.02 - extra_size = min(size + min_rows, fac * size) - - def _gen_unique_rand(rng, _extra_size): - ind = rng.rand(int(_extra_size)) - return np.unique(np.floor(ind * nrows * ncols))[:size] - - ind = _gen_unique_rand(random_state, extra_size) - while ind.size < size: - extra_size *= 1.05 - ind = _gen_unique_rand(random_state, extra_size) - - j = np.floor(ind * 1.0 / nrows).astype(int) - i = (ind - j * nrows).astype(int) - return i.tolist(), j.tolist() - - -def makeMissingCustomDataframe( - nrows, - ncols, - density=0.9, - random_state=None, - c_idx_names=True, - r_idx_names=True, - c_idx_nlevels=1, - r_idx_nlevels=1, - data_gen_f=None, - c_ndupe_l=None, - r_ndupe_l=None, - dtype=None, - c_idx_type=None, - r_idx_type=None, -): - """ - Parameters - ---------- - Density : float, optional - Float in (0, 1) that gives the percentage of non-missing numbers in - the DataFrame. - random_state : {np.random.RandomState, int}, optional - Random number generator or random seed. - - See makeCustomDataframe for descriptions of the rest of the parameters. 
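A hedged sketch of the density parameter in action; the shape, density and seed below are chosen only for illustration:

import pandas.util.testing as tm

# 10 rows x 4 columns of "RxCy" strings, with round((1 - 0.75) * 40) = 10
# cells replaced by NaN; random_state makes the missing positions repeatable.
df = tm.makeMissingCustomDataframe(10, 4, density=0.75, random_state=42)
print(df.isna().sum().sum())  # 10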
- """ - df = makeCustomDataframe( - nrows, - ncols, - c_idx_names=c_idx_names, - r_idx_names=r_idx_names, - c_idx_nlevels=c_idx_nlevels, - r_idx_nlevels=r_idx_nlevels, - data_gen_f=data_gen_f, - c_ndupe_l=c_ndupe_l, - r_ndupe_l=r_ndupe_l, - dtype=dtype, - c_idx_type=c_idx_type, - r_idx_type=r_idx_type, - ) - - i, j = _create_missing_idx(nrows, ncols, density, random_state) - df.values[i, j] = np.nan - return df - - -def makeMissingDataframe(density=0.9, random_state=None): - df = makeDataFrame() - i, j = _create_missing_idx(*df.shape, density=density, random_state=random_state) - df.values[i, j] = np.nan - return df - - -class TestSubDict(dict): - def __init__(self, *args, **kwargs): - dict.__init__(self, *args, **kwargs) - - -def optional_args(decorator): - """allows a decorator to take optional positional and keyword arguments. - Assumes that taking a single, callable, positional argument means that - it is decorating a function, i.e. something like this:: - - @my_decorator - def function(): pass - - Calls decorator with decorator(f, *args, **kwargs)""" - - @wraps(decorator) - def wrapper(*args, **kwargs): - def dec(f): - return decorator(f, *args, **kwargs) - - is_decorating = not kwargs and len(args) == 1 and callable(args[0]) - if is_decorating: - f = args[0] - args = [] - return dec(f) - else: - return dec - - return wrapper - - -# skip tests on exceptions with this message -_network_error_messages = ( - # 'urlopen error timed out', - # 'timeout: timed out', - # 'socket.timeout: timed out', - "timed out", - "Server Hangup", - "HTTP Error 503: Service Unavailable", - "502: Proxy Error", - "HTTP Error 502: internal error", - "HTTP Error 502", - "HTTP Error 503", - "HTTP Error 403", - "HTTP Error 400", - "Temporary failure in name resolution", - "Name or service not known", - "Connection refused", - "certificate verify", -) - -# or this e.errno/e.reason.errno -_network_errno_vals = ( - 101, # Network is unreachable - 111, # Connection refused - 110, # Connection timed out - 104, # Connection reset Error - 54, # Connection reset by peer - 60, # urllib.error.URLError: [Errno 60] Connection timed out -) - -# Both of the above shouldn't mask real issues such as 404's -# or refused connections (changed DNS). -# But some tests (test_data yahoo) contact incredibly flakey -# servers. - -# and conditionally raise on exception types in _get_default_network_errors - - -def _get_default_network_errors(): - # Lazy import for http.client because it imports many things from the stdlib - import http.client - - return (IOError, http.client.HTTPException, TimeoutError) - - -def can_connect(url, error_classes=None): - """Try to connect to the given url. True if succeeds, False if IOError - raised - - Parameters - ---------- - url : basestring - The URL to try to connect to - - Returns - ------- - connectable : bool - Return True if no IOError (unable to connect) or URLError (bad url) was - raised - """ - - if error_classes is None: - error_classes = _get_default_network_errors() - - try: - with urlopen(url): - pass - except error_classes: - return False - else: - return True - - -@optional_args -def network( - t, - url="http://www.google.com", - raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, - check_before_test=False, - error_classes=None, - skip_errnos=_network_errno_vals, - _skip_on_messages=_network_error_messages, -): - """ - Label a test as requiring network connection and, if an error is - encountered, only raise if it does not find a network connection. 
- - In comparison to ``network``, this assumes an added contract to your test: - you must assert that, under normal conditions, your test will ONLY fail if - it does not have network connectivity. - - You can call this in 3 ways: as a standard decorator, with keyword - arguments, or with a positional argument that is the url to check. - - Parameters - ---------- - t : callable - The test requiring network connectivity. - url : path - The url to test via ``pandas.io.common.urlopen`` to check - for connectivity. Defaults to 'http://www.google.com'. - raise_on_error : bool - If True, never catches errors. - check_before_test : bool - If True, checks connectivity before running the test case. - error_classes : tuple or Exception - error classes to ignore. If not in ``error_classes``, raises the error. - defaults to IOError. Be careful about changing the error classes here. - skip_errnos : iterable of int - Any exception that has .errno or .reason.erno set to one - of these values will be skipped with an appropriate - message. - _skip_on_messages: iterable of string - any exception e for which one of the strings is - a substring of str(e) will be skipped with an appropriate - message. Intended to suppress errors where an errno isn't available. - - Notes - ----- - * ``raise_on_error`` supercedes ``check_before_test`` - - Returns - ------- - t : callable - The decorated test ``t``, with checks for connectivity errors. - - Example - ------- - - Tests decorated with @network will fail if it's possible to make a network - connection to another URL (defaults to google.com):: - - >>> from pandas.util.testing import network - >>> from pandas.io.common import urlopen - >>> @network - ... def test_network(): - ... with urlopen("rabbit://bonanza.com"): - ... pass - Traceback - ... - URLError: - - You can specify alternative URLs:: - - >>> @network("http://www.yahoo.com") - ... def test_something_with_yahoo(): - ... raise IOError("Failure Message") - >>> test_something_with_yahoo() - Traceback (most recent call last): - ... - IOError: Failure Message - - If you set check_before_test, it will check the url first and not run the - test on failure:: - - >>> @network("failing://url.blaher", check_before_test=True) - ... def test_something(): - ... print("I ran!") - ... raise ValueError("Failure") - >>> test_something() - Traceback (most recent call last): - ... - - Errors not related to networking will always be raised. 
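The check_before_test branch above leans on the can_connect helper defined earlier; a minimal sketch of calling it directly, using the decorator's default URL:

import pandas.util.testing as tm

# True only if urlopen() succeeds without raising one of the default network
# errors (IOError, http.client.HTTPException, TimeoutError).
if not tm.can_connect("http://www.google.com"):
    print("offline: a @tm.network-decorated test would be skipped here")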
- """ - from pytest import skip - - if error_classes is None: - error_classes = _get_default_network_errors() - - t.network = True - - @wraps(t) - def wrapper(*args, **kwargs): - if check_before_test and not raise_on_error: - if not can_connect(url, error_classes): - skip() - try: - return t(*args, **kwargs) - except Exception as err: - errno = getattr(err, "errno", None) - if not errno and hasattr(errno, "reason"): - errno = getattr(err.reason, "errno", None) - - if errno in skip_errnos: - skip(f"Skipping test due to known errno and error {err}") - - e_str = str(err) - - if any(m.lower() in e_str.lower() for m in _skip_on_messages): - skip( - f"Skipping test because exception message is known and error {err}" - ) - - if not isinstance(err, error_classes): - raise - - if raise_on_error or can_connect(url, error_classes): - raise - else: - skip(f"Skipping test due to lack of connectivity and error {err}") - - return wrapper - - -with_connectivity_check = network - - -@contextmanager -def assert_produces_warning( - expected_warning=Warning, - filter_level="always", - clear=None, - check_stacklevel=True, - raise_on_extra_warnings=True, -): - """ - Context manager for running code expected to either raise a specific - warning, or not raise any warnings. Verifies that the code raises the - expected warning, and that it does not raise any other unexpected - warnings. It is basically a wrapper around ``warnings.catch_warnings``. - - Parameters - ---------- - expected_warning : {Warning, False, None}, default Warning - The type of Exception raised. ``exception.Warning`` is the base - class for all warnings. To check that no warning is returned, - specify ``False`` or ``None``. - filter_level : str or None, default "always" - Specifies whether warnings are ignored, displayed, or turned - into errors. - Valid values are: - - * "error" - turns matching warnings into exceptions - * "ignore" - discard the warning - * "always" - always emit a warning - * "default" - print the warning the first time it is generated - from each location - * "module" - print the warning the first time it is generated - from each module - * "once" - print the warning the first time it is generated - - clear : str, default None - If not ``None`` then remove any previously raised warnings from - the ``__warningsregistry__`` to ensure that no warning messages are - suppressed by this context manager. If ``None`` is specified, - the ``__warningsregistry__`` keeps track of which warnings have been - shown, and does not show them again. - check_stacklevel : bool, default True - If True, displays the line that called the function containing - the warning to show were the function is called. Otherwise, the - line that implements the function is displayed. - raise_on_extra_warnings : bool, default True - Whether extra warnings not of the type `expected_warning` should - cause the test to fail. - - Examples - -------- - >>> import warnings - >>> with assert_produces_warning(): - ... warnings.warn(UserWarning()) - ... - >>> with assert_produces_warning(False): - ... warnings.warn(RuntimeWarning()) - ... - Traceback (most recent call last): - ... - AssertionError: Caused unexpected warning(s): ['RuntimeWarning']. - >>> with assert_produces_warning(UserWarning): - ... warnings.warn(RuntimeWarning()) - Traceback (most recent call last): - ... - AssertionError: Did not see expected warning of class 'UserWarning'. - - ..warn:: This is *not* thread-safe. 
- """ - __tracebackhide__ = True - - with warnings.catch_warnings(record=True) as w: - - if clear is not None: - # make sure that we are clearing these warnings - # if they have happened before - # to guarantee that we will catch them - if not is_list_like(clear): - clear = [clear] - for m in clear: - try: - m.__warningregistry__.clear() - except AttributeError: - # module may not have __warningregistry__ - pass - - saw_warning = False - warnings.simplefilter(filter_level) - yield w - extra_warnings = [] - - for actual_warning in w: - if expected_warning and issubclass( - actual_warning.category, expected_warning - ): - saw_warning = True - - if check_stacklevel and issubclass( - actual_warning.category, (FutureWarning, DeprecationWarning) - ): - from inspect import getframeinfo, stack - - caller = getframeinfo(stack()[2][0]) - msg = ( - "Warning not set with correct stacklevel. " - f"File where warning is raised: {actual_warning.filename} != " - f"{caller.filename}. Warning message: {actual_warning.message}" - ) - assert actual_warning.filename == caller.filename, msg - else: - extra_warnings.append( - ( - actual_warning.category.__name__, - actual_warning.message, - actual_warning.filename, - actual_warning.lineno, - ) - ) - if expected_warning: - msg = ( - f"Did not see expected warning of class " - f"{repr(expected_warning.__name__)}" - ) - assert saw_warning, msg - if raise_on_extra_warnings and extra_warnings: - raise AssertionError( - f"Caused unexpected warning(s): {repr(extra_warnings)}" - ) - - -class RNGContext: - """ - Context manager to set the numpy random number generator speed. Returns - to the original value upon exiting the context manager. - - Parameters - ---------- - seed : int - Seed for numpy.random.seed - - Examples - -------- - - with RNGContext(42): - np.random.randn() - """ - - def __init__(self, seed): - self.seed = seed - - def __enter__(self): - - self.start_state = np.random.get_state() - np.random.seed(self.seed) - - def __exit__(self, exc_type, exc_value, traceback): - - np.random.set_state(self.start_state) - - -@contextmanager -def with_csv_dialect(name, **kwargs): - """ - Context manager to temporarily register a CSV dialect for parsing CSV. - - Parameters - ---------- - name : str - The name of the dialect. - kwargs : mapping - The parameters for the dialect. - - Raises - ------ - ValueError : the name of the dialect conflicts with a builtin one. - - See Also - -------- - csv : Python's CSV library. - """ - import csv - - _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"} - - if name in _BUILTIN_DIALECTS: - raise ValueError("Cannot override builtin dialect.") - - csv.register_dialect(name, **kwargs) - yield - csv.unregister_dialect(name) - - -@contextmanager -def use_numexpr(use, min_elements=None): - from pandas.core.computation import expressions as expr - - if min_elements is None: - min_elements = expr._MIN_ELEMENTS - - olduse = expr._USE_NUMEXPR - oldmin = expr._MIN_ELEMENTS - expr.set_use_numexpr(use) - expr._MIN_ELEMENTS = min_elements - yield - expr._MIN_ELEMENTS = oldmin - expr.set_use_numexpr(olduse) - - -def test_parallel(num_threads=2, kwargs_list=None): - """Decorator to run the same function multiple times in parallel. - - Parameters - ---------- - num_threads : int, optional - The number of times the function is run in parallel. - kwargs_list : list of dicts, optional - The list of kwargs to update original - function kwargs on different threads. - Notes - ----- - This decorator does not pass the return value of the decorated function. 
- - Original from scikit-image: - - https://github.com/scikit-image/scikit-image/pull/1519 - - """ - - assert num_threads > 0 - has_kwargs_list = kwargs_list is not None - if has_kwargs_list: - assert len(kwargs_list) == num_threads - import threading - - def wrapper(func): - @wraps(func) - def inner(*args, **kwargs): - if has_kwargs_list: - update_kwargs = lambda i: dict(kwargs, **kwargs_list[i]) - else: - update_kwargs = lambda i: kwargs - threads = [] - for i in range(num_threads): - updated_kwargs = update_kwargs(i) - thread = threading.Thread(target=func, args=args, kwargs=updated_kwargs) - threads.append(thread) - for thread in threads: - thread.start() - for thread in threads: - thread.join() - - return inner - - return wrapper - - -class SubclassedSeries(Series): - _metadata = ["testattr", "name"] - - @property - def _constructor(self): - return SubclassedSeries - - @property - def _constructor_expanddim(self): - return SubclassedDataFrame - - -class SubclassedDataFrame(DataFrame): - _metadata = ["testattr"] - - @property - def _constructor(self): - return SubclassedDataFrame - - @property - def _constructor_sliced(self): - return SubclassedSeries - - -class SubclassedCategorical(Categorical): - @property - def _constructor(self): - return SubclassedCategorical - - -@contextmanager -def set_timezone(tz: str): - """ - Context manager for temporarily setting a timezone. - - Parameters - ---------- - tz : str - A string representing a valid timezone. - - Examples - -------- - - >>> from datetime import datetime - >>> from dateutil.tz import tzlocal - >>> tzlocal().tzname(datetime.now()) - 'IST' - - >>> with set_timezone('US/Eastern'): - ... tzlocal().tzname(datetime.now()) - ... - 'EDT' - """ - - import os - import time - - def setTZ(tz): - if tz is None: - try: - del os.environ["TZ"] - except KeyError: - pass - else: - os.environ["TZ"] = tz - time.tzset() - - orig_tz = os.environ.get("TZ") - setTZ(tz) - try: - yield - finally: - setTZ(orig_tz) - - -def _make_skipna_wrapper(alternative, skipna_alternative=None): - """ - Create a function for calling on an array. - - Parameters - ---------- - alternative : function - The function to be called on the array with no NaNs. - Only used when 'skipna_alternative' is None. - skipna_alternative : function - The function to be called on the original array - - Returns - ------- - function - """ - if skipna_alternative: - - def skipna_wrapper(x): - return skipna_alternative(x.values) - - else: - - def skipna_wrapper(x): - nona = x.dropna() - if len(nona) == 0: - return np.nan - return alternative(nona) - - return skipna_wrapper - - -def convert_rows_list_to_csv_str(rows_list: List[str]): - """ - Convert list of CSV rows to single CSV-formatted string for current OS. - - This method is used for creating expected value of to_csv() method. - - Parameters - ---------- - rows_list : List[str] - Each element represents the row of csv. - - Returns - ------- - str - Expected output of to_csv() in current OS. - """ - sep = os.linesep - expected = sep.join(rows_list) + sep - return expected diff --git a/requirements-dev.txt b/requirements-dev.txt index 5f67726a3e476..08cbef2c7fc6b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,6 @@ +# This file is auto-generated from environment.yml, do not modify. +# See that file for comments about the need/usage of each dependency. 
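For the CSV helper that closes the module above, an illustrative round trip (the rows are made up):

import os
import pandas.util.testing as tm

# Rows are joined with the platform line separator and a trailing one is
# appended, matching what DataFrame.to_csv() produces on the current OS.
expected = tm.convert_rows_list_to_csv_str(["a,b", "1,2", "3,4"])
assert expected == "a,b" + os.linesep + "1,2" + os.linesep + "3,4" + os.linesep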
+ numpy>=1.15 python-dateutil>=2.6.1 pytz @@ -13,7 +16,6 @@ mypy==0.730 pycodestyle gitpython sphinx -numpydoc>=0.9.0 nbconvert>=5.4.1 nbsphinx pandoc @@ -33,6 +35,7 @@ moto pytest>=5.0.1 pytest-cov pytest-xdist>=1.21 +pytest-asyncio seaborn statsmodels ipywidgets @@ -42,26 +45,29 @@ pip blosc bottleneck>=1.2.1 ipykernel -ipython>=5.6.0 +ipython>=7.11.1 jinja2 matplotlib>=2.2.2 numexpr>=2.6.8 scipy>=1.1 +numba>=0.46.0 beautifulsoup4>=4.6.0 -fastparquet>=0.3.2 html5lib lxml openpyxl<=3.0.1 +xlrd +xlsxwriter +xlwt +odfpy +fastparquet>=0.3.2 pyarrow>=0.13.1 +python-snappy pyqt5>=5.9.2 tables>=3.4.2 -python-snappy s3fs sqlalchemy xarray -xlrd -xlsxwriter -xlwt -odfpy pyreadstat -git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master \ No newline at end of file +tabulate>=0.8.3 +git+https://github.com/pandas-dev/pandas-sphinx-theme.git@master +git+https://github.com/numpy/numpydoc \ No newline at end of file diff --git a/scripts/build_dist.sh b/scripts/build_dist.sh deleted file mode 100755 index c3f849ce7a6eb..0000000000000 --- a/scripts/build_dist.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -# build the distribution -LAST=`git tag --sort version:refname | grep -v rc | tail -1` - -echo "Building distribution for: $LAST" -git checkout $LAST - -read -p "Ok to continue (y/n)? " answer -case ${answer:0:1} in - y|Y ) - echo "Building distribution" - ./build_dist_for_release.sh - ;; - * ) - echo "Not building distribution" - ;; -esac diff --git a/scripts/build_dist_for_release.sh b/scripts/build_dist_for_release.sh deleted file mode 100755 index bee0f23a68ec2..0000000000000 --- a/scripts/build_dist_for_release.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# this requires cython to be installed - -# this builds the release cleanly & is building on the current checkout -rm -rf dist -git clean -xfd -python setup.py clean --quiet -python setup.py cython --quiet -python setup.py sdist --formats=gztar --quiet diff --git a/scripts/download_wheels.py b/scripts/download_wheels.py deleted file mode 100644 index 4ca1354321134..0000000000000 --- a/scripts/download_wheels.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python -"""Fetch wheels from wheels.scipy.org for a pandas version.""" -import argparse -import pathlib -import sys -import urllib.parse -import urllib.request - -from lxml import html - - -def parse_args(args=None): - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("version", type=str, help="Pandas version (0.23.0)") - return parser.parse_args(args) - - -def fetch(version): - base = "http://wheels.scipy.org" - tree = html.parse(base) - root = tree.getroot() - - dest = pathlib.Path("dist") - dest.mkdir(exist_ok=True) - - files = [ - x - for x in root.xpath("//a/text()") - if x.startswith("pandas-{}".format(version)) and not dest.joinpath(x).exists() - ] - - N = len(files) - - for i, filename in enumerate(files, 1): - out = str(dest.joinpath(filename)) - link = urllib.request.urljoin(base, filename) - urllib.request.urlretrieve(link, out) - print( - "Downloaded {link} to {out} [{i}/{N}]".format(link=link, out=out, i=i, N=N) - ) - - -def main(args=None): - args = parse_args(args) - fetch(args.version) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/scripts/find_commits_touching_func.py b/scripts/find_commits_touching_func.py index 5e1a169dbfc3f..85675cb6df42b 100755 --- a/scripts/find_commits_touching_func.py +++ b/scripts/find_commits_touching_func.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env 
python3 # copyright 2013, y-p @ github """ Search the git history for all commits touching a named method diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 6f809669d917f..b0a06416ce443 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Convert the conda environment.yml to the pip requirements-dev.txt, or check that they have the same packages (for the CI) @@ -87,9 +87,14 @@ def main(conda_fname, pip_fname, compare=False): elif isinstance(dep, dict) and len(dep) == 1 and "pip" in dep: pip_deps += dep["pip"] else: - raise ValueError("Unexpected dependency {}".format(dep)) + raise ValueError(f"Unexpected dependency {dep}") - pip_content = "\n".join(pip_deps) + fname = os.path.split(conda_fname)[1] + header = ( + f"# This file is auto-generated from {fname}, do not modify.\n" + "# See that file for comments about the need/usage of each dependency.\n\n" + ) + pip_content = header + "\n".join(pip_deps) if compare: with open(pip_fname) as pip_fd: @@ -122,13 +127,12 @@ def main(conda_fname, pip_fname, compare=False): ) if res: msg = ( - "`requirements-dev.txt` has to be generated with `{}` after " - "`environment.yml` is modified.\n".format(sys.argv[0]) + f"`requirements-dev.txt` has to be generated with `{sys.argv[0]}` after " + "`environment.yml` is modified.\n" ) if args.azure: msg = ( - "##vso[task.logissue type=error;" - "sourcepath=requirements-dev.txt]{}".format(msg) + f"##vso[task.logissue type=error;sourcepath=requirements-dev.txt]{msg}" ) sys.stderr.write(msg) sys.exit(res) diff --git a/scripts/list_future_warnings.sh b/scripts/list_future_warnings.sh index 0c4046bbb5f49..121f4f5a92abb 100755 --- a/scripts/list_future_warnings.sh +++ b/scripts/list_future_warnings.sh @@ -25,7 +25,7 @@ EXCLUDE="^pandas/tests/|" # tests validate that FutureWarnings are raised EXCLUDE+="^pandas/util/_decorators.py$|" # generic deprecate function that raises warning EXCLUDE+="^pandas/util/_depr_module.py$|" # generic deprecate module that raises warnings -EXCLUDE+="^pandas/util/testing.py$|" # contains function to evaluate if warning is raised +EXCLUDE+="^pandas._testing.py$|" # contains function to evaluate if warning is raised EXCLUDE+="^pandas/io/parsers.py$" # implements generic deprecation system in io reading BASE_DIR="$(dirname $0)/.." diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index 1506acc95edf9..b11de0c4ad860 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -1,819 +1,52 @@ -import functools import io -import random -import string import textwrap -import numpy as np import pytest import validate_docstrings -import pandas as pd -validate_one = validate_docstrings.validate_one - - -class GoodDocStrings: - """ - Collection of good doc strings. - - This class contains a lot of docstrings that should pass the validation - script without any errors. - """ - - def plot(self, kind, color="blue", **kwargs): - """ - Generate a plot. - - Render the data in the Series as a matplotlib plot of the - specified kind. - - Parameters - ---------- - kind : str - Kind of matplotlib plot. - color : str, default 'blue' - Color name or rgb code. - **kwargs - These parameters will be passed to the matplotlib plotting - function. - """ - pass - - def swap(self, arr, i, j, *args, **kwargs): - """ - Swap two indicies on an array. 
- - Parameters - ---------- - arr : list - The list having indexes swapped. - i, j : int - The indexes being swapped. - *args, **kwargs - Extraneous parameters are being permitted. - """ - pass - - def sample(self): - """ - Generate and return a random number. - - The value is sampled from a continuous uniform distribution between - 0 and 1. - - Returns - ------- - float - Random number generated. - """ - return random.random() - - @functools.lru_cache(None) - def decorated_sample(self, max): - """ - Generate and return a random integer between 0 and max. - - Parameters - ---------- - max : int - The maximum value of the random number. - - Returns - ------- - int - Random number generated. - """ - return random.randint(0, max) - - def random_letters(self): - """ - Generate and return a sequence of random letters. - - The length of the returned string is also random, and is also - returned. - - Returns - ------- - length : int - Length of the returned string. - letters : str - String of random letters. - """ - length = random.randint(1, 10) - letters = "".join(random.sample(string.ascii_lowercase, length)) - return length, letters - - def sample_values(self): - """ - Generate an infinite sequence of random numbers. - - The values are sampled from a continuous uniform distribution between - 0 and 1. - - Yields - ------ - float - Random number generated. - """ - while True: - yield random.random() - - def head(self): - """ - Return the first 5 elements of the Series. - - This function is mainly useful to preview the values of the - Series without displaying the whole of it. - - Returns - ------- - Series - Subset of the original series with the 5 first values. - - See Also - -------- - Series.tail : Return the last 5 elements of the Series. - Series.iloc : Return a slice of the elements in the Series, - which can also be used to return the first or last n. - """ - return self.iloc[:5] - - def head1(self, n=5): - """ - Return the first elements of the Series. - - This function is mainly useful to preview the values of the - Series without displaying the whole of it. - - Parameters - ---------- - n : int - Number of values to return. - - Returns - ------- - Series - Subset of the original series with the n first values. - - See Also - -------- - tail : Return the last n elements of the Series. - - Examples - -------- - >>> s = pd.Series(['Ant', 'Bear', 'Cow', 'Dog', 'Falcon']) - >>> s.head() - 0 Ant - 1 Bear - 2 Cow - 3 Dog - 4 Falcon - dtype: object - - With the `n` parameter, we can change the number of returned rows: - - >>> s.head(n=3) - 0 Ant - 1 Bear - 2 Cow - dtype: object - """ - return self.iloc[:n] - - def contains(self, pat, case=True, na=np.nan): - """ - Return whether each value contains `pat`. - - In this case, we are illustrating how to use sections, even - if the example is simple enough and does not require them. - - Parameters - ---------- - pat : str - Pattern to check for within each element. - case : bool, default True - Whether check should be done with case sensitivity. - na : object, default np.nan - Fill value for missing data. 
- - Examples - -------- - >>> s = pd.Series(['Antelope', 'Lion', 'Zebra', np.nan]) - >>> s.str.contains(pat='a') - 0 False - 1 False - 2 True - 3 NaN - dtype: object - - **Case sensitivity** - - With `case_sensitive` set to `False` we can match `a` with both - `a` and `A`: - - >>> s.str.contains(pat='a', case=False) - 0 True - 1 False - 2 True - 3 NaN - dtype: object - - **Missing values** - - We can fill missing values in the output using the `na` parameter: - - >>> s.str.contains(pat='a', na=False) - 0 False - 1 False - 2 True - 3 False - dtype: bool - """ - pass - - def mode(self, axis, numeric_only): - """ - Ensure reST directives don't affect checks for leading periods. - - Parameters - ---------- - axis : str - Sentence ending in period, followed by single directive. - - .. versionchanged:: 0.1.2 - - numeric_only : bool - Sentence ending in period, followed by multiple directives. - - .. versionadded:: 0.1.2 - .. deprecated:: 0.00.0 - A multiline description, - which spans another line. - """ - pass - - def good_imports(self): - """ - Ensure import other than numpy and pandas are fine. - - Examples - -------- - This example does not import pandas or import numpy. - >>> import datetime - >>> datetime.MAXYEAR - 9999 - """ - pass - - def no_returns(self): - """ - Say hello and have no returns. - """ - pass - - def empty_returns(self): - """ - Say hello and always return None. - - Since this function never returns a value, this - docstring doesn't need a return section. - """ - - def say_hello(): - return "Hello World!" - - say_hello() - if True: - return - else: - return None - - def multiple_variables_on_one_line(self, matrix, a, b, i, j): - """ - Swap two values in a matrix. - - Parameters - ---------- - matrix : list of list - A double list that represents a matrix. - a, b : int - The indicies of the first value. - i, j : int - The indicies of the second value. - """ - pass - - -class BadGenericDocStrings: - """Everything here has a bad docstring - """ - - def func(self): - - """Some function. - - With several mistakes in the docstring. - - It has a blank like after the signature `def func():`. - - The text 'Some function' should go in the line after the - opening quotes of the docstring, not in the same line. - - There is a blank line between the docstring and the first line - of code `foo = 1`. - - The closing quotes should be in the next line, not in this one.""" - - foo = 1 - bar = 2 - return foo + bar - - def astype(self, dtype): - """ - Casts Series type. - - Verb in third-person of the present simple, should be infinitive. - """ - pass - - def astype1(self, dtype): - """ - Method to cast Series type. - - Does not start with verb. - """ - pass - - def astype2(self, dtype): - """ - Cast Series type - - Missing dot at the end. - """ - pass - - def astype3(self, dtype): - """ - Cast Series type from its current type to the new type defined in - the parameter dtype. - - Summary is too verbose and doesn't fit in a single line. - """ - pass - - def two_linebreaks_between_sections(self, foo): - """ - Test linebreaks message GL03. - - Note 2 blank lines before parameters section. - - - Parameters - ---------- - foo : str - Description of foo parameter. - """ - pass - - def linebreak_at_end_of_docstring(self, foo): - """ - Test linebreaks message GL03. - - Note extra blank line at end of docstring. - - Parameters - ---------- - foo : str - Description of foo parameter. - - """ - pass - - def plot(self, kind, **kwargs): - """ - Generate a plot. 
- - Render the data in the Series as a matplotlib plot of the - specified kind. - - Note the blank line between the parameters title and the first - parameter. Also, note that after the name of the parameter `kind` - and before the colon, a space is missing. - - Also, note that the parameter descriptions do not start with a - capital letter, and do not finish with a dot. - - Finally, the `**kwargs` parameter is missing. - - Parameters - ---------- - - kind: str - kind of matplotlib plot - """ - pass - - def method(self, foo=None, bar=None): - """ - A sample DataFrame method. - - Do not import numpy and pandas. - - Try to use meaningful data, when it makes the example easier - to understand. - - Try to avoid positional arguments like in `df.method(1)`. They - can be alright if previously defined with a meaningful name, - like in `present_value(interest_rate)`, but avoid them otherwise. - - When presenting the behavior with different parameters, do not place - all the calls one next to the other. Instead, add a short sentence - explaining what the example shows. - - Examples - -------- - >>> import numpy as np - >>> import pandas as pd - >>> df = pd.DataFrame(np.ones((3, 3)), - ... columns=('a', 'b', 'c')) - >>> df.all(1) - 0 True - 1 True - 2 True - dtype: bool - >>> df.all(bool_only=True) - Series([], dtype: bool) - """ - pass - - def private_classes(self): - """ - This mentions NDFrame, which is not correct. - """ - - def unknown_section(self): - """ - This section has an unknown section title. - - Unknown Section - --------------- - This should raise an error in the validation. - """ - - def sections_in_wrong_order(self): - """ - This docstring has the sections in the wrong order. - - Parameters - ---------- - name : str - This section is in the right position. - - Examples - -------- - >>> print('So far Examples is good, as it goes before Parameters') - So far Examples is good, as it goes before Parameters - - See Also - -------- - function : This should generate an error, as See Also needs to go - before Examples. - """ - - def deprecation_in_wrong_order(self): - """ - This docstring has the deprecation warning in the wrong order. - - This is the extended summary. The correct order should be - summary, deprecation warning, extended summary. - - .. deprecated:: 1.0 - This should generate an error as it needs to go before - extended summary. - """ - - def method_wo_docstrings(self): - pass - - def directives_without_two_colons(self, first, second): - """ - Ensure reST directives have trailing colons. - - Parameters - ---------- - first : str - Sentence ending in period, followed by single directive w/o colons. - - .. versionchanged 0.1.2 - - second : bool - Sentence ending in period, followed by multiple directives w/o - colons. - - .. versionadded 0.1.2 - .. deprecated 0.00.0 - - """ - pass - - -class BadSummaries: - def wrong_line(self): - """Exists on the wrong line""" - pass - - def no_punctuation(self): - """ - Has the right line but forgets punctuation - """ - pass - - def no_capitalization(self): - """ - provides a lowercase summary. - """ - pass - - def no_infinitive(self): - """ - Started with a verb that is not infinitive. - """ - - def multi_line(self): - """ - Extends beyond one line - which is not correct. - """ - - def two_paragraph_multi_line(self): - """ - Extends beyond one line - which is not correct. - - Extends beyond one line, which in itself is correct but the - previous short summary should still be an issue. 
- """ - - -class BadParameters: - """ - Everything here has a problem with its Parameters section. - """ - - def missing_params(self, kind, **kwargs): - """ - Lacks kwargs in Parameters. - - Parameters - ---------- - kind : str - Foo bar baz. - """ - - def bad_colon_spacing(self, kind): - """ - Has bad spacing in the type line. - - Parameters - ---------- - kind: str - Needs a space after kind. - """ - - def no_description_period(self, kind): - """ - Forgets to add a period to the description. - - Parameters - ---------- - kind : str - Doesn't end with a dot - """ - - def no_description_period_with_directive(self, kind): - """ - Forgets to add a period, and also includes a directive. - - Parameters - ---------- - kind : str - Doesn't end with a dot - - .. versionadded:: 0.00.0 - """ - - def no_description_period_with_directives(self, kind): - """ - Forgets to add a period, and also includes multiple directives. - - Parameters - ---------- - kind : str - Doesn't end with a dot - - .. versionchanged:: 0.00.0 - .. deprecated:: 0.00.0 - """ - - def parameter_capitalization(self, kind): - """ - Forgets to capitalize the description. - - Parameters - ---------- - kind : str - this is not capitalized. - """ - - def blank_lines(self, kind): - """ - Adds a blank line after the section header. - - Parameters - ---------- - - kind : str - Foo bar baz. - """ - pass - - def integer_parameter(self, kind): - """ - Uses integer instead of int. - - Parameters - ---------- - kind : integer - Foo bar baz. - """ - pass - - def string_parameter(self, kind): - """ - Uses string instead of str. - - Parameters - ---------- - kind : string - Foo bar baz. - """ - pass - - def boolean_parameter(self, kind): - """ - Uses boolean instead of bool. - - Parameters - ---------- - kind : boolean - Foo bar baz. - """ - pass - - def list_incorrect_parameter_type(self, kind): - """ - Uses list of boolean instead of list of bool. - - Parameters - ---------- - kind : list of boolean, integer, float or string - Foo bar baz. - """ - pass - - def bad_parameter_spacing(self, a, b): - """ - The parameters on the same line have an extra space between them. - - Parameters - ---------- - a, b : int - Foo bar baz. - """ - pass - - -class BadReturns: - def return_not_documented(self): - """ - Lacks section for Returns - """ - return "Hello world!" - - def yield_not_documented(self): - """ - Lacks section for Yields - """ - yield "Hello world!" - - def no_type(self): - """ - Returns documented but without type. - - Returns - ------- - Some value. - """ - return "Hello world!" - - def no_description(self): - """ - Provides type but no descrption. - - Returns - ------- - str - """ - return "Hello world!" - - def no_punctuation(self): - """ - Provides type and description but no period. - - Returns - ------- - str - A nice greeting - """ - return "Hello world!" - - def named_single_return(self): - """ - Provides name but returns only one value. - - Returns - ------- - s : str - A nice greeting. - """ - return "Hello world!" - - def no_capitalization(self): - """ - Forgets capitalization in return values description. - - Returns - ------- - foo : str - The first returned string. - bar : str - the second returned string. - """ - return "Hello", "World!" +class BadDocstrings: + """Everything here has a bad docstring + """ - def no_period_multi(self): + def private_classes(self): """ - Forgets period in return values description. - - Returns - ------- - foo : str - The first returned string - bar : str - The second returned string. 
+ This mentions NDFrame, which is not correct. """ - return "Hello", "World!" - -class BadSeeAlso: - def desc_no_period(self): + def prefix_pandas(self): """ - Return the first 5 elements of the Series. + Have `pandas` prefix in See Also section. See Also -------- - Series.tail : Return the last 5 elements of the Series. - Series.iloc : Return a slice of the elements in the Series, - which can also be used to return the first or last n + pandas.Series.rename : Alter Series index labels or name. + DataFrame.head : The first `n` rows of the caller object. """ pass - def desc_first_letter_lowercase(self): - """ - Return the first 5 elements of the Series. - - See Also - -------- - Series.tail : return the last 5 elements of the Series. - Series.iloc : Return a slice of the elements in the Series, - which can also be used to return the first or last n. + def redundant_import(self, foo=None, bar=None): """ - pass + A sample DataFrame method. - def prefix_pandas(self): - """ - Have `pandas` prefix in See Also section. + Should not import numpy and pandas. - See Also + Examples -------- - pandas.Series.rename : Alter Series index labels or name. - DataFrame.head : The first `n` rows of the caller object. + >>> import numpy as np + >>> import pandas as pd + >>> df = pd.DataFrame(np.ones((3, 3)), + ... columns=('a', 'b', 'c')) + >>> df.all(1) + 0 True + 1 True + 2 True + dtype: bool + >>> df.all(bool_only=True) + Series([], dtype: bool) """ pass - -class BadExamples: def unused_import(self): """ Examples @@ -877,59 +110,9 @@ def _import_path(self, klass=None, func=None): return base_path - def test_good_class(self, capsys): - errors = validate_one(self._import_path(klass="GoodDocStrings"))["errors"] - assert isinstance(errors, list) - assert not errors - - @pytest.mark.parametrize( - "func", - [ - "plot", - "swap", - "sample", - "decorated_sample", - "random_letters", - "sample_values", - "head", - "head1", - "contains", - "mode", - "good_imports", - "no_returns", - "empty_returns", - "multiple_variables_on_one_line", - ], - ) - def test_good_functions(self, capsys, func): - errors = validate_one(self._import_path(klass="GoodDocStrings", func=func))[ - "errors" - ] - assert isinstance(errors, list) - assert not errors - def test_bad_class(self, capsys): - errors = validate_one(self._import_path(klass="BadGenericDocStrings"))["errors"] - assert isinstance(errors, list) - assert errors - - @pytest.mark.parametrize( - "func", - [ - "func", - "astype", - "astype1", - "astype2", - "astype3", - "plot", - "method", - "private_classes", - "directives_without_two_colons", - ], - ) - def test_bad_generic_functions(self, capsys, func): - errors = validate_one( - self._import_path(klass="BadGenericDocStrings", func=func) # noqa:F821 + errors = validate_docstrings.pandas_validate( + self._import_path(klass="BadDocstrings") )["errors"] assert isinstance(errors, list) assert errors @@ -937,9 +120,8 @@ def test_bad_generic_functions(self, capsys, func): @pytest.mark.parametrize( "klass,func,msgs", [ - # See Also tests ( - "BadGenericDocStrings", + "BadDocstrings", "private_classes", ( "Private classes (NDFrame) should not be mentioned in public " @@ -947,200 +129,31 @@ def test_bad_generic_functions(self, capsys, func): ), ), ( - "BadGenericDocStrings", - "unknown_section", - ('Found unknown section "Unknown Section".',), - ), - ( - "BadGenericDocStrings", - "sections_in_wrong_order", - ( - "Sections are in the wrong order. 
Correct order is: Parameters, " - "See Also, Examples", - ), - ), - ( - "BadGenericDocStrings", - "deprecation_in_wrong_order", - ("Deprecation warning should precede extended summary",), - ), - ( - "BadGenericDocStrings", - "directives_without_two_colons", - ( - "reST directives ['versionchanged', 'versionadded', " - "'deprecated'] must be followed by two colons", - ), - ), - ( - "BadSeeAlso", - "desc_no_period", - ('Missing period at end of description for See Also "Series.iloc"',), - ), - ( - "BadSeeAlso", - "desc_first_letter_lowercase", - ('should be capitalized for See Also "Series.tail"',), - ), - # Summary tests - ( - "BadSummaries", - "wrong_line", - ("should start in the line immediately after the opening quotes",), - ), - ("BadSummaries", "no_punctuation", ("Summary does not end with a period",)), - ( - "BadSummaries", - "no_capitalization", - ("Summary does not start with a capital letter",), - ), - ( - "BadSummaries", - "no_capitalization", - ("Summary must start with infinitive verb",), - ), - ("BadSummaries", "multi_line", ("Summary should fit in a single line",)), - ( - "BadSummaries", - "two_paragraph_multi_line", - ("Summary should fit in a single line",), - ), - # Parameters tests - ( - "BadParameters", - "missing_params", - ("Parameters {**kwargs} not documented",), - ), - ( - "BadParameters", - "bad_colon_spacing", - ( - 'Parameter "kind" requires a space before the colon ' - "separating the parameter name and type", - ), - ), - ( - "BadParameters", - "no_description_period", - ('Parameter "kind" description should finish with "."',), - ), - ( - "BadParameters", - "no_description_period_with_directive", - ('Parameter "kind" description should finish with "."',), - ), - ( - "BadParameters", - "parameter_capitalization", - ('Parameter "kind" description should start with a capital letter',), - ), - ( - "BadParameters", - "integer_parameter", - ('Parameter "kind" type should use "int" instead of "integer"',), - ), - ( - "BadParameters", - "string_parameter", - ('Parameter "kind" type should use "str" instead of "string"',), - ), - ( - "BadParameters", - "boolean_parameter", - ('Parameter "kind" type should use "bool" instead of "boolean"',), - ), - ( - "BadParameters", - "list_incorrect_parameter_type", - ('Parameter "kind" type should use "bool" instead of "boolean"',), - ), - ( - "BadParameters", - "list_incorrect_parameter_type", - ('Parameter "kind" type should use "int" instead of "integer"',), - ), - ( - "BadParameters", - "list_incorrect_parameter_type", - ('Parameter "kind" type should use "str" instead of "string"',), - ), - ( - "BadParameters", - "bad_parameter_spacing", - ("Parameters {b} not documented", "Unknown parameters { b}"), - ), - pytest.param( - "BadParameters", - "blank_lines", - ("No error yet?",), - marks=pytest.mark.xfail, - ), - # Returns tests - ("BadReturns", "return_not_documented", ("No Returns section found",)), - ("BadReturns", "yield_not_documented", ("No Yields section found",)), - pytest.param("BadReturns", "no_type", ("foo",), marks=pytest.mark.xfail), - ("BadReturns", "no_description", ("Return value has no description",)), - ( - "BadReturns", - "no_punctuation", - ('Return value description should finish with "."',), - ), - ( - "BadReturns", - "named_single_return", + "BadDocstrings", + "prefix_pandas", ( - "The first line of the Returns section should contain only the " - "type, unless multiple values are being returned", + "pandas.Series.rename in `See Also` section " + "does not need `pandas` prefix", ), ), - ( - "BadReturns", - 
"no_capitalization", - ("Return value description should start with a capital letter",), - ), - ( - "BadReturns", - "no_period_multi", - ('Return value description should finish with "."',), - ), # Examples tests ( - "BadGenericDocStrings", - "method", + "BadDocstrings", + "redundant_import", ("Do not import numpy, as it is imported automatically",), ), ( - "BadGenericDocStrings", - "method", + "BadDocstrings", + "redundant_import", ("Do not import pandas, as it is imported automatically",), ), ( - "BadGenericDocStrings", - "method_wo_docstrings", - ("The object does not have a docstring",), - ), - # See Also tests - ( - "BadSeeAlso", - "prefix_pandas", - ( - "pandas.Series.rename in `See Also` section " - "does not need `pandas` prefix", - ), - ), - # Examples tests - ( - "BadExamples", + "BadDocstrings", "unused_import", ("flake8 error: F401 'pandas as pdf' imported but unused",), ), ( - "BadExamples", - "indentation_is_not_a_multiple_of_four", - ("flake8 error: E111 indentation is not a multiple of four",), - ), - ( - "BadExamples", + "BadDocstrings", "missing_whitespace_around_arithmetic_operator", ( "flake8 error: " @@ -1148,39 +161,28 @@ def test_bad_generic_functions(self, capsys, func): ), ), ( - "BadExamples", - "missing_whitespace_after_comma", - ("flake8 error: E231 missing whitespace after ',' (3 times)",), - ), - ( - "BadGenericDocStrings", - "two_linebreaks_between_sections", - ( - "Double line break found; please use only one blank line to " - "separate sections or paragraphs, and do not leave blank lines " - "at the end of docstrings", - ), + "BadDocstrings", + "indentation_is_not_a_multiple_of_four", + ("flake8 error: E111 indentation is not a multiple of four",), ), ( - "BadGenericDocStrings", - "linebreak_at_end_of_docstring", - ( - "Double line break found; please use only one blank line to " - "separate sections or paragraphs, and do not leave blank lines " - "at the end of docstrings", - ), + "BadDocstrings", + "missing_whitespace_after_comma", + ("flake8 error: E231 missing whitespace after ',' (3 times)",), ), ], ) def test_bad_docstrings(self, capsys, klass, func, msgs): - result = validate_one(self._import_path(klass=klass, func=func)) + result = validate_docstrings.pandas_validate( + self._import_path(klass=klass, func=func) + ) for msg in msgs: assert msg in " ".join(err[1] for err in result["errors"]) def test_validate_all_ignore_deprecated(self, monkeypatch): monkeypatch.setattr( validate_docstrings, - "validate_one", + "pandas_validate", lambda func_name: { "docstring": "docstring1", "errors": [ @@ -1285,50 +287,22 @@ def test_item_subsection(self, idx, subsection): assert result[idx][3] == subsection -class TestDocstringClass: - @pytest.mark.parametrize( - "name, expected_obj", - [ - ("pandas.isnull", pd.isnull), - ("pandas.DataFrame", pd.DataFrame), - ("pandas.Series.sum", pd.Series.sum), - ], - ) - def test_resolves_class_name(self, name, expected_obj): - d = validate_docstrings.Docstring(name) - assert d.obj is expected_obj - - @pytest.mark.parametrize("invalid_name", ["panda", "panda.DataFrame"]) - def test_raises_for_invalid_module_name(self, invalid_name): - msg = 'No module can be imported from "{}"'.format(invalid_name) - with pytest.raises(ImportError, match=msg): - validate_docstrings.Docstring(invalid_name) - - @pytest.mark.parametrize( - "invalid_name", ["pandas.BadClassName", "pandas.Series.bad_method_name"] - ) - def test_raises_for_invalid_attribute_name(self, invalid_name): - name_components = invalid_name.split(".") - obj_name, 
invalid_attr_name = name_components[-2], name_components[-1] - msg = "'{}' has no attribute '{}'".format(obj_name, invalid_attr_name) - with pytest.raises(AttributeError, match=msg): - validate_docstrings.Docstring(invalid_name) - +class TestPandasDocstringClass: @pytest.mark.parametrize( "name", ["pandas.Series.str.isdecimal", "pandas.Series.str.islower"] ) def test_encode_content_write_to_file(self, name): # GH25466 - docstr = validate_docstrings.Docstring(name).validate_pep8() + docstr = validate_docstrings.PandasDocstring(name).validate_pep8() # the list of pep8 errors should be empty assert not list(docstr) class TestMainFunction: - def test_exit_status_for_validate_one(self, monkeypatch): + def test_exit_status_for_main(self, monkeypatch): monkeypatch.setattr( validate_docstrings, - "validate_one", + "pandas_validate", lambda func_name: { "docstring": "docstring1", "errors": [ @@ -1336,8 +310,7 @@ def test_exit_status_for_validate_one(self, monkeypatch): ("ER02", "err desc"), ("ER03", "err desc"), ], - "warnings": [], - "examples_errors": "", + "examples_errs": "", }, ) exit_status = validate_docstrings.main( diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index b0eeb7b96e0eb..d43086756769a 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Analyze docstrings to detect errors. @@ -14,20 +14,14 @@ $ ./validate_docstrings.py pandas.DataFrame.head """ import argparse -import ast -import collections import doctest -import functools import glob import importlib -import inspect import json import os -import pydoc -import re import sys import tempfile -import textwrap +from typing import List, Optional import flake8.main.application @@ -53,87 +47,15 @@ import pandas # noqa: E402 isort:skip sys.path.insert(1, os.path.join(BASE_PATH, "doc", "sphinxext")) -from numpydoc.docscrape import NumpyDocString # noqa: E402 isort:skip -from pandas.io.formats.printing import pprint_thing # noqa: E402 isort:skip +from numpydoc.validate import validate, Docstring # noqa: E402 isort:skip PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"] -DIRECTIVES = ["versionadded", "versionchanged", "deprecated"] -DIRECTIVE_PATTERN = re.compile(rf"^\s*\.\. ({'|'.join(DIRECTIVES)})(?!::)", re.I | re.M) -ALLOWED_SECTIONS = [ - "Parameters", - "Attributes", - "Methods", - "Returns", - "Yields", - "Other Parameters", - "Raises", - "Warns", - "See Also", - "Notes", - "References", - "Examples", -] ERROR_MSGS = { - "GL01": "Docstring text (summary) should start in the line immediately " - "after the opening quotes (not in the same line, or leaving a " - "blank line in between)", - "GL02": "Closing quotes should be placed in the line after the last text " - "in the docstring (do not close the quotes in the same line as " - "the text, or leave a blank line between the last text and the " - "quotes)", - "GL03": "Double line break found; please use only one blank line to " - "separate sections or paragraphs, and do not leave blank lines " - "at the end of docstrings", "GL04": "Private classes ({mentioned_private_classes}) should not be " "mentioned in public docstrings", - "GL05": 'Tabs found at the start of line "{line_with_tabs}", please use ' - "whitespace only", - "GL06": 'Found unknown section "{section}". Allowed sections are: ' - "{allowed_sections}", - "GL07": "Sections are in the wrong order. 
Correct order is: {correct_sections}", - "GL08": "The object does not have a docstring", - "GL09": "Deprecation warning should precede extended summary", - "GL10": "reST directives {directives} must be followed by two colons", - "SS01": "No summary found (a short summary in a single line should be " - "present at the beginning of the docstring)", - "SS02": "Summary does not start with a capital letter", - "SS03": "Summary does not end with a period", - "SS04": "Summary contains heading whitespaces", - "SS05": "Summary must start with infinitive verb, not third person " - '(e.g. use "Generate" instead of "Generates")', - "SS06": "Summary should fit in a single line", - "ES01": "No extended summary found", - "PR01": "Parameters {missing_params} not documented", - "PR02": "Unknown parameters {unknown_params}", - "PR03": "Wrong parameters order. Actual: {actual_params}. " - "Documented: {documented_params}", - "PR04": 'Parameter "{param_name}" has no type', - "PR05": 'Parameter "{param_name}" type should not finish with "."', - "PR06": 'Parameter "{param_name}" type should use "{right_type}" instead ' - 'of "{wrong_type}"', - "PR07": 'Parameter "{param_name}" has no description', - "PR08": 'Parameter "{param_name}" description should start with a ' - "capital letter", - "PR09": 'Parameter "{param_name}" description should finish with "."', - "PR10": 'Parameter "{param_name}" requires a space before the colon ' - "separating the parameter name and type", - "RT01": "No Returns section found", - "RT02": "The first line of the Returns section should contain only the " - "type, unless multiple values are being returned", - "RT03": "Return value has no description", - "RT04": "Return value description should start with a capital letter", - "RT05": 'Return value description should finish with "."', - "YD01": "No Yields section found", - "SA01": "See Also section not found", - "SA02": "Missing period at end of description for See Also " - '"{reference_name}" reference', - "SA03": "Description should be capitalized for See Also " - '"{reference_name}" reference', - "SA04": 'Missing description for See Also "{reference_name}" reference', "SA05": "{reference_name} in `See Also` section does not need `pandas` " "prefix, use {right_reference} instead.", - "EX01": "No examples section found", "EX02": "Examples do not pass tests:\n{doctest_log}", "EX03": "flake8 error: {error_code} {error_message}{times_happening}", "EX04": "Do not import {imported_library}, as it is imported " @@ -141,29 +63,10 @@ } -def error(code, **kwargs): +def pandas_error(code, **kwargs): """ - Return a tuple with the error code and the message with variables replaced. - - This is syntactic sugar so instead of: - - `('EX02', ERROR_MSGS['EX02'].format(doctest_log=log))` - - We can simply use: - - `error('EX02', doctest_log=log)` - - Parameters - ---------- - code : str - Error code. - **kwargs - Values for the variables in the error messages - - Returns - ------- - code : str - Error code. - message : str - Error message with variables replaced. + Copy of the numpydoc error function, since ERROR_MSGS can't be updated + with our custom errors yet. 
""" return (code, ERROR_MSGS[code].format(**kwargs)) @@ -240,347 +143,7 @@ def get_api_items(api_doc_fd): previous_line = line -class Docstring: - def __init__(self, name): - self.name = name - obj = self._load_obj(name) - self.obj = obj - self.code_obj = self._to_original_callable(obj) - self.raw_doc = obj.__doc__ or "" - self.clean_doc = pydoc.getdoc(obj) - self.doc = NumpyDocString(self.clean_doc) - - def __len__(self) -> int: - return len(self.raw_doc) - - @staticmethod - def _load_obj(name): - """ - Import Python object from its name as string. - - Parameters - ---------- - name : str - Object name to import (e.g. pandas.Series.str.upper) - - Returns - ------- - object - Python object that can be a class, method, function... - - Examples - -------- - >>> Docstring._load_obj('pandas.Series') - - """ - for maxsplit in range(1, name.count(".") + 1): - # TODO when py3 only replace by: module, *func_parts = ... - func_name_split = name.rsplit(".", maxsplit) - module = func_name_split[0] - func_parts = func_name_split[1:] - try: - obj = importlib.import_module(module) - except ImportError: - pass - else: - continue - - if "obj" not in locals(): - raise ImportError("No module can be imported " 'from "{}"'.format(name)) - - for part in func_parts: - obj = getattr(obj, part) - return obj - - @staticmethod - def _to_original_callable(obj): - """ - Find the Python object that contains the source code of the object. - - This is useful to find the place in the source code (file and line - number) where a docstring is defined. It does not currently work for - all cases, but it should help find some (properties...). - """ - while True: - if inspect.isfunction(obj) or inspect.isclass(obj): - f = inspect.getfile(obj) - if f.startswith("<") and f.endswith(">"): - return None - return obj - if inspect.ismethod(obj): - obj = obj.__func__ - elif isinstance(obj, functools.partial): - obj = obj.func - elif isinstance(obj, property): - obj = obj.fget - else: - return None - - @property - def type(self): - return type(self.obj).__name__ - - @property - def is_function_or_method(self): - # TODO(py27): remove ismethod - return inspect.isfunction(self.obj) or inspect.ismethod(self.obj) - - @property - def source_file_name(self): - """ - File name where the object is implemented (e.g. pandas/core/frame.py). - """ - try: - fname = inspect.getsourcefile(self.code_obj) - except TypeError: - # In some cases the object is something complex like a cython - # object that can't be easily introspected. An it's better to - # return the source code file of the object as None, than crash - pass - else: - if fname: - fname = os.path.relpath(fname, BASE_PATH) - return fname - - @property - def source_file_def_line(self): - """ - Number of line where the object is defined in its file. - """ - try: - return inspect.getsourcelines(self.code_obj)[-1] - except (OSError, TypeError): - # In some cases the object is something complex like a cython - # object that can't be easily introspected. 
An it's better to - # return the line number as None, than crash - pass - - @property - def github_url(self): - url = "https://github.com/pandas-dev/pandas/blob/master/" - url += "{}#L{}".format(self.source_file_name, self.source_file_def_line) - return url - - @property - def start_blank_lines(self): - i = None - if self.raw_doc: - for i, row in enumerate(self.raw_doc.split("\n")): - if row.strip(): - break - return i - - @property - def end_blank_lines(self): - i = None - if self.raw_doc: - for i, row in enumerate(reversed(self.raw_doc.split("\n"))): - if row.strip(): - break - return i - - @property - def double_blank_lines(self): - prev = True - for row in self.raw_doc.split("\n"): - if not prev and not row.strip(): - return True - prev = row.strip() - return False - - @property - def section_titles(self): - sections = [] - self.doc._doc.reset() - while not self.doc._doc.eof(): - content = self.doc._read_to_next_section() - if ( - len(content) > 1 - and len(content[0]) == len(content[1]) - and set(content[1]) == {"-"} - ): - sections.append(content[0]) - return sections - - @property - def summary(self): - return " ".join(self.doc["Summary"]) - - @property - def num_summary_lines(self): - return len(self.doc["Summary"]) - - @property - def extended_summary(self): - if not self.doc["Extended Summary"] and len(self.doc["Summary"]) > 1: - return " ".join(self.doc["Summary"]) - return " ".join(self.doc["Extended Summary"]) - - @property - def needs_summary(self): - return not (bool(self.summary) and bool(self.extended_summary)) - - @property - def doc_parameters(self): - parameters = collections.OrderedDict() - for names, type_, desc in self.doc["Parameters"]: - for name in names.split(", "): - parameters[name] = (type_, "".join(desc)) - return parameters - - @property - def signature_parameters(self): - def add_stars(param_name: str, info: inspect.Parameter): - """ - Add stars to *args and **kwargs parameters - """ - if info.kind == inspect.Parameter.VAR_POSITIONAL: - return f"*{param_name}" - elif info.kind == inspect.Parameter.VAR_KEYWORD: - return f"**{param_name}" - else: - return param_name - - if inspect.isclass(self.obj): - if hasattr(self.obj, "_accessors") and ( - self.name.split(".")[-1] in self.obj._accessors - ): - # accessor classes have a signature but don't want to show this - return tuple() - try: - sig = inspect.signature(self.obj) - except (TypeError, ValueError): - # Some objects, mainly in C extensions do not support introspection - # of the signature - return tuple() - - params = tuple( - add_stars(parameter, sig.parameters[parameter]) - for parameter in sig.parameters - ) - if params and params[0] in ("self", "cls"): - return params[1:] - return params - - @property - def parameter_mismatches(self): - errs = [] - signature_params = self.signature_parameters - doc_params = tuple(self.doc_parameters) - missing = set(signature_params) - set(doc_params) - if missing: - errs.append(error("PR01", missing_params=pprint_thing(missing))) - extra = set(doc_params) - set(signature_params) - if extra: - errs.append(error("PR02", unknown_params=pprint_thing(extra))) - if ( - not missing - and not extra - and signature_params != doc_params - and not (not signature_params and not doc_params) - ): - errs.append( - error( - "PR03", actual_params=signature_params, documented_params=doc_params - ) - ) - - return errs - - @property - def correct_parameters(self): - return not bool(self.parameter_mismatches) - - @property - def directives_without_two_colons(self): - return 
DIRECTIVE_PATTERN.findall(self.raw_doc) - - def parameter_type(self, param): - return self.doc_parameters[param][0] - - def parameter_desc(self, param): - desc = self.doc_parameters[param][1] - # Find and strip out any sphinx directives - for directive in DIRECTIVES: - full_directive = ".. {}".format(directive) - if full_directive in desc: - # Only retain any description before the directive - desc = desc[: desc.index(full_directive)] - return desc - - @property - def see_also(self): - result = collections.OrderedDict() - for funcs, desc in self.doc["See Also"]: - for func, _ in funcs: - result[func] = "".join(desc) - - return result - - @property - def examples(self): - return self.doc["Examples"] - - @property - def returns(self): - return self.doc["Returns"] - - @property - def yields(self): - return self.doc["Yields"] - - @property - def method_source(self): - try: - source = inspect.getsource(self.obj) - except TypeError: - return "" - return textwrap.dedent(source) - - @property - def method_returns_something(self): - """ - Check if the docstrings method can return something. - - Bare returns, returns valued None and returns from nested functions are - disconsidered. - - Returns - ------- - bool - Whether the docstrings method can return something. - """ - - def get_returns_not_on_nested_functions(node): - returns = [node] if isinstance(node, ast.Return) else [] - for child in ast.iter_child_nodes(node): - # Ignore nested functions and its subtrees. - if not isinstance(child, ast.FunctionDef): - child_returns = get_returns_not_on_nested_functions(child) - returns.extend(child_returns) - return returns - - tree = ast.parse(self.method_source).body - if tree: - returns = get_returns_not_on_nested_functions(tree[0]) - return_values = [r.value for r in returns] - # Replace NameConstant nodes valued None for None. - for i, v in enumerate(return_values): - if isinstance(v, ast.NameConstant) and v.value is None: - return_values[i] = None - return any(return_values) - else: - return False - - @property - def first_line_ends_in_dot(self): - if self.doc: - return self.doc.split("\n")[0][-1] == "." - - @property - def deprecated(self): - return ".. deprecated:: " in (self.summary + self.extended_summary) - +class PandasDocstring(Docstring): @property def mentioned_private_classes(self): return [klass for klass in PRIVATE_CLASSES if klass in self.raw_doc] @@ -633,196 +196,50 @@ def validate_pep8(self): yield from application.guide.stats.statistics_for("") -def get_validation_data(doc): +def pandas_validate(func_name: str): """ - Validate the docstring. + Call the numpydoc validation, and add the errors specific to pandas. Parameters ---------- - doc : Docstring - A Docstring object with the given function name. + func_name : str + Name of the object of the docstring to validate. Returns ------- - tuple - errors : list of tuple - Errors occurred during validation. - warnings : list of tuple - Warnings occurred during validation. - examples_errs : str - Examples usage displayed along the error, otherwise empty string. 
- - Notes - ----- - The errors codes are defined as: - - First two characters: Section where the error happens: - * GL: Global (no section, like section ordering errors) - * SS: Short summary - * ES: Extended summary - * PR: Parameters - * RT: Returns - * YD: Yields - * RS: Raises - * WN: Warns - * SA: See Also - * NT: Notes - * RF: References - * EX: Examples - - Last two characters: Numeric error code inside the section - - For example, EX02 is the second codified error in the Examples section - (which in this case is assigned to examples that do not pass the tests). - - The error codes, their corresponding error messages, and the details on how - they are validated, are not documented more than in the source code of this - function. + dict + Information about the docstring and the errors found. """ + doc = PandasDocstring(func_name) + result = validate(func_name) - errs = [] - wrns = [] - if not doc.raw_doc: - errs.append(error("GL08")) - return errs, wrns, "" - - if doc.start_blank_lines != 1: - errs.append(error("GL01")) - if doc.end_blank_lines != 1: - errs.append(error("GL02")) - if doc.double_blank_lines: - errs.append(error("GL03")) mentioned_errs = doc.mentioned_private_classes if mentioned_errs: - errs.append(error("GL04", mentioned_private_classes=", ".join(mentioned_errs))) - for line in doc.raw_doc.splitlines(): - if re.match("^ *\t", line): - errs.append(error("GL05", line_with_tabs=line.lstrip())) - - unexpected_sections = [ - section for section in doc.section_titles if section not in ALLOWED_SECTIONS - ] - for section in unexpected_sections: - errs.append( - error("GL06", section=section, allowed_sections=", ".join(ALLOWED_SECTIONS)) + result["errors"].append( + pandas_error("GL04", mentioned_private_classes=", ".join(mentioned_errs)) ) - correct_order = [ - section for section in ALLOWED_SECTIONS if section in doc.section_titles - ] - if correct_order != doc.section_titles: - errs.append(error("GL07", correct_sections=", ".join(correct_order))) - - if doc.deprecated and not doc.extended_summary.startswith(".. 
deprecated:: "): - errs.append(error("GL09")) - - directives_without_two_colons = doc.directives_without_two_colons - if directives_without_two_colons: - errs.append(error("GL10", directives=directives_without_two_colons)) - - if not doc.summary: - errs.append(error("SS01")) - else: - if not doc.summary[0].isupper(): - errs.append(error("SS02")) - if doc.summary[-1] != ".": - errs.append(error("SS03")) - if doc.summary != doc.summary.lstrip(): - errs.append(error("SS04")) - elif doc.is_function_or_method and doc.summary.split(" ")[0][-1] == "s": - errs.append(error("SS05")) - if doc.num_summary_lines > 1: - errs.append(error("SS06")) - - if not doc.extended_summary: - wrns.append(("ES01", "No extended summary found")) - - # PR01: Parameters not documented - # PR02: Unknown parameters - # PR03: Wrong parameters order - errs += doc.parameter_mismatches - - for param in doc.doc_parameters: - if not param.startswith("*"): # Check can ignore var / kwargs - if not doc.parameter_type(param): - if ":" in param: - errs.append(error("PR10", param_name=param.split(":")[0])) - else: - errs.append(error("PR04", param_name=param)) - else: - if doc.parameter_type(param)[-1] == ".": - errs.append(error("PR05", param_name=param)) - common_type_errors = [ - ("integer", "int"), - ("boolean", "bool"), - ("string", "str"), - ] - for wrong_type, right_type in common_type_errors: - if wrong_type in doc.parameter_type(param): - errs.append( - error( - "PR06", - param_name=param, - right_type=right_type, - wrong_type=wrong_type, - ) - ) - if not doc.parameter_desc(param): - errs.append(error("PR07", param_name=param)) - else: - if not doc.parameter_desc(param)[0].isupper(): - errs.append(error("PR08", param_name=param)) - if doc.parameter_desc(param)[-1] != ".": - errs.append(error("PR09", param_name=param)) - - if doc.is_function_or_method: - if not doc.returns: - if doc.method_returns_something: - errs.append(error("RT01")) - else: - if len(doc.returns) == 1 and doc.returns[0].name: - errs.append(error("RT02")) - for name_or_type, type_, desc in doc.returns: - if not desc: - errs.append(error("RT03")) - else: - desc = " ".join(desc) - if not desc[0].isupper(): - errs.append(error("RT04")) - if not desc.endswith("."): - errs.append(error("RT05")) - - if not doc.yields and "yield" in doc.method_source: - errs.append(error("YD01")) - - if not doc.see_also: - wrns.append(error("SA01")) - else: + if doc.see_also: for rel_name, rel_desc in doc.see_also.items(): - if rel_desc: - if not rel_desc.endswith("."): - errs.append(error("SA02", reference_name=rel_name)) - if not rel_desc[0].isupper(): - errs.append(error("SA03", reference_name=rel_name)) - else: - errs.append(error("SA04", reference_name=rel_name)) if rel_name.startswith("pandas."): - errs.append( - error( + result["errors"].append( + pandas_error( "SA05", reference_name=rel_name, right_reference=rel_name[len("pandas.") :], ) ) - examples_errs = "" - if not doc.examples: - wrns.append(error("EX01")) - else: - examples_errs = doc.examples_errors - if examples_errs: - errs.append(error("EX02", doctest_log=examples_errs)) + result["examples_errs"] = "" + if doc.examples: + result["examples_errs"] = doc.examples_errors + if result["examples_errs"]: + result["errors"].append( + pandas_error("EX02", doctest_log=result["examples_errs"]) + ) for err in doc.validate_pep8(): - errs.append( - error( + result["errors"].append( + pandas_error( "EX03", error_code=err.error_code, error_message=err.message, @@ -834,38 +251,11 @@ def get_validation_data(doc): 
examples_source_code = "".join(doc.examples_source_code) for wrong_import in ("numpy", "pandas"): if "import {}".format(wrong_import) in examples_source_code: - errs.append(error("EX04", imported_library=wrong_import)) - return errs, wrns, examples_errs - - -def validate_one(func_name): - """ - Validate the docstring for the given func_name - - Parameters - ---------- - func_name : function - Function whose docstring will be evaluated (e.g. pandas.read_csv). + result["errors"].append( + pandas_error("EX04", imported_library=wrong_import) + ) - Returns - ------- - dict - A dictionary containing all the information obtained from validating - the docstring. - """ - doc = Docstring(func_name) - errs, wrns, examples_errs = get_validation_data(doc) - return { - "type": doc.type, - "docstring": doc.clean_doc, - "deprecated": doc.deprecated, - "file": doc.source_file_name, - "file_line": doc.source_file_def_line, - "github_link": doc.github_url, - "errors": errs, - "warnings": wrns, - "examples_errors": examples_errs, - } + return result def validate_all(prefix, ignore_deprecated=False): @@ -890,16 +280,16 @@ def validate_all(prefix, ignore_deprecated=False): result = {} seen = {} - # functions from the API docs api_doc_fnames = os.path.join(BASE_PATH, "doc", "source", "reference", "*.rst") api_items = [] for api_doc_fname in glob.glob(api_doc_fnames): with open(api_doc_fname) as f: api_items += list(get_api_items(f)) + for func_name, func_obj, section, subsection in api_items: if prefix and not func_name.startswith(prefix): continue - doc_info = validate_one(func_name) + doc_info = pandas_validate(func_name) if ignore_deprecated and doc_info["deprecated"]: continue result[func_name] = doc_info @@ -917,24 +307,40 @@ def validate_all(prefix, ignore_deprecated=False): seen[shared_code_key] = func_name - # functions from introspecting Series and DataFrame - api_item_names = set(list(zip(*api_items))[0]) - for class_ in (pandas.Series, pandas.DataFrame): - for member in inspect.getmembers(class_): - func_name = "pandas.{}.{}".format(class_.__name__, member[0]) - if not member[0].startswith("_") and func_name not in api_item_names: - if prefix and not func_name.startswith(prefix): - continue - doc_info = validate_one(func_name) - if ignore_deprecated and doc_info["deprecated"]: - continue - result[func_name] = doc_info - result[func_name]["in_api"] = False - return result -def main(func_name, prefix, errors, output_format, ignore_deprecated): +def print_validate_all_results( + prefix: str, + errors: Optional[List[str]], + output_format: str, + ignore_deprecated: bool, +): + if output_format not in ("default", "json", "actions"): + raise ValueError(f'Unknown output_format "{output_format}"') + + result = validate_all(prefix, ignore_deprecated) + + if output_format == "json": + sys.stdout.write(json.dumps(result)) + return 0 + + prefix = "##[error]" if output_format == "actions" else "" + exit_status = 0 + for name, res in result.items(): + for err_code, err_desc in res["errors"]: + if errors and err_code not in errors: + continue + sys.stdout.write( + f'{prefix}{res["file"]}:{res["file_line"]}:' + f"{err_code}:{name}:{err_desc}\n" + ) + exit_status += 1 + + return exit_status + + +def print_validate_one_results(func_name: str): def header(title, width=80, char="#"): full_line = char * width side_len = (width - len(title) - 2) // 2 @@ -943,78 +349,44 @@ def header(title, width=80, char="#"): side=char * side_len, title=title, adj=adj ) - return "\n{full_line}\n{title_line}\n{full_line}\n\n".format( - 
full_line=full_line, title_line=title_line - ) + return f"\n{full_line}\n{title_line}\n{full_line}\n\n" - exit_status = 0 - if func_name is None: - result = validate_all(prefix, ignore_deprecated) - - if output_format == "json": - output = json.dumps(result) - else: - if output_format == "default": - output_format = "{text}\n" - elif output_format == "azure": - output_format = ( - "##vso[task.logissue type=error;" - "sourcepath={path};" - "linenumber={row};" - "code={code};" - "]{text}\n" - ) - else: - raise ValueError(f'Unknown output_format "{output_format}"') - - output = "" - for name, res in result.items(): - for err_code, err_desc in res["errors"]: - # The script would be faster if instead of filtering the - # errors after validating them, it didn't validate them - # initially. But that would complicate the code too much - if errors and err_code not in errors: - continue - exit_status += 1 - output += output_format.format( - path=res["file"], - row=res["file_line"], - code=err_code, - text=f"{name}: {err_desc}", - ) + result = pandas_validate(func_name) - sys.stdout.write(output) + sys.stderr.write(header(f"Docstring ({func_name})")) + sys.stderr.write(f"{result['docstring']}\n") - else: - result = validate_one(func_name) - sys.stderr.write(header("Docstring ({})".format(func_name))) - sys.stderr.write("{}\n".format(result["docstring"])) - sys.stderr.write(header("Validation")) - if result["errors"]: - sys.stderr.write("{} Errors found:\n".format(len(result["errors"]))) - for err_code, err_desc in result["errors"]: - # Failing examples are printed at the end - if err_code == "EX02": - sys.stderr.write("\tExamples do not pass tests\n") - continue - sys.stderr.write("\t{}\n".format(err_desc)) - if result["warnings"]: - sys.stderr.write("{} Warnings found:\n".format(len(result["warnings"]))) - for wrn_code, wrn_desc in result["warnings"]: - sys.stderr.write("\t{}\n".format(wrn_desc)) - - if not result["errors"]: - sys.stderr.write('Docstring for "{}" correct. :)\n'.format(func_name)) - - if result["examples_errors"]: - sys.stderr.write(header("Doctests")) - sys.stderr.write(result["examples_errors"]) + sys.stderr.write(header("Validation")) + if result["errors"]: + sys.stderr.write(f'{len(result["errors"])} Errors found:\n') + for err_code, err_desc in result["errors"]: + if err_code == "EX02": # Failing examples are printed at the end + sys.stderr.write("\tExamples do not pass tests\n") + continue + sys.stderr.write(f"\t{err_desc}\n") + elif result["errors"]: + sys.stderr.write(f'Docstring for "{func_name}" correct. :)\n') - return exit_status + if result["examples_errs"]: + sys.stderr.write(header("Doctests")) + sys.stderr.write(result["examples_errs"]) + + +def main(func_name, prefix, errors, output_format, ignore_deprecated): + """ + Main entry point. Call the validation for one or for all docstrings. + """ + if func_name is None: + return print_validate_all_results( + prefix, errors, output_format, ignore_deprecated + ) + else: + print_validate_one_results(func_name) + return 0 if __name__ == "__main__": - format_opts = "default", "json", "azure" + format_opts = "default", "json", "actions" func_help = ( "function or method to validate (e.g. pandas.DataFrame.head) " "if not provided, all docstrings are validated and returned " @@ -1027,16 +399,16 @@ def header(title, width=80, char="#"): default="default", choices=format_opts, help="format of the output when validating " - "multiple docstrings (ignored when validating one)." 
-        "It can be {}".format(str(format_opts)[1:-1]),
+        "multiple docstrings (ignored when validating one). "
+        f"It can be {str(format_opts)[1:-1]}",
     )
     argparser.add_argument(
         "--prefix",
         default=None,
         help="pattern for the "
         "docstring names, in order to decide which ones "
-        'will be validated. A prefix "pandas.Series.str.'
-        "will make the script validate all the docstrings"
+        'will be validated. A prefix "pandas.Series.str."'
+        "will make the script validate all the docstrings "
         "of methods starting by this pattern. It is "
         "ignored if parameter function is provided",
     )
diff --git a/scripts/validate_string_concatenation.py b/scripts/validate_string_concatenation.py
new file mode 100755
index 0000000000000..fbf3bb5cfccf2
--- /dev/null
+++ b/scripts/validate_string_concatenation.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+"""
+GH #30454
+
+Check where there is a string that needs to be concatenated.
+
+This is necessary after black formatting,
+where for example black transforms this:
+
+>>> foo = (
+... "bar "
+... "baz"
+... )
+
+into this:
+
+>>> foo = ("bar " "baz")
+
+Black does not consider this an issue
+(see https://github.com/psf/black/issues/1051),
+so we are checking it here.
+"""
+
+import argparse
+import os
+import sys
+import token
+import tokenize
+from typing import Generator, List, Tuple
+
+FILE_EXTENSIONS_TO_CHECK = (".py", ".pyx", ".pyx.ini", ".pxd")
+
+
+def main(source_path: str, output_format: str) -> bool:
+    """
+    Main entry point of the script.
+
+    Parameters
+    ----------
+    source_path : str
+        Source path representing path to a file/directory.
+    output_format : str
+        Output format of the script.
+
+    Returns
+    -------
+    bool
+        True if any strings that need to be concatenated were found.
+
+    Raises
+    ------
+    ValueError
+        If the `source_path` does not point to an existing file/directory.
+    """
+    if not os.path.exists(source_path):
+        raise ValueError(
+            "Please enter a valid path, pointing to a valid file/directory."
+        )
+
+    is_failed: bool = False
+
+    msg = "String unnecessarily split in two by black. Please merge them manually."
+
+    if os.path.isfile(source_path):
+        for source_path, line_number in strings_to_concatenate(source_path):
+            is_failed = True
+            print(
+                output_format.format(
+                    source_path=source_path, line_number=line_number, msg=msg
+                )
+            )
+
+    for subdir, _, files in os.walk(source_path):
+        for file_name in files:
+            if any(
+                file_name.endswith(extension) for extension in FILE_EXTENSIONS_TO_CHECK
+            ):
+                for source_path, line_number in strings_to_concatenate(
+                    os.path.join(subdir, file_name)
+                ):
+                    is_failed = True
+                    print(
+                        output_format.format(
+                            source_path=source_path, line_number=line_number, msg=msg
+                        )
+                    )
+    return is_failed
+
+
+def strings_to_concatenate(source_path: str) -> Generator[Tuple[str, int], None, None]:
+    """
+    Yield the strings that need to be concatenated in a given file.
+
+    Parameters
+    ----------
+    source_path : str
+        File path pointing to a single file.
+
+    Yields
+    ------
+    source_path : str
+        Source file path.
+    line_number : int
+        Line number of unconcatenated string.
+ """ + with open(source_path, "r") as file_name: + tokens: List = list(tokenize.generate_tokens(file_name.readline)) + + for current_token, next_token in zip(tokens, tokens[1:]): + if current_token[0] == next_token[0] == token.STRING: + yield source_path, current_token[2][0] + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Validate concatenated strings") + + parser.add_argument( + "path", nargs="?", default=".", help="Source path of file/directory to check." + ) + parser.add_argument( + "--format", + "-f", + default="{source_path}:{line_number}:{msg}", + help="Output format of the unconcatenated strings.", + ) + + args = parser.parse_args() + + sys.exit(main(source_path=args.path, output_format=args.format)) diff --git a/setup.cfg b/setup.cfg index 8fb602188dad5..d0570cee6fe10 100644 --- a/setup.cfg +++ b/setup.cfg @@ -66,6 +66,7 @@ xfail_strict = True filterwarnings = error:Sparse:FutureWarning error:The SparseArray:FutureWarning +junit_family=xunit2 [coverage:run] branch = False @@ -109,7 +110,7 @@ known_dtypes = pandas.core.dtypes known_post_core = pandas.tseries,pandas.io,pandas.plotting sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER known_first_party = pandas -known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml +known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml,odf multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 @@ -123,6 +124,7 @@ skip = pandas/__init__.py,pandas/core/api.py ignore_missing_imports=True no_implicit_optional=True check_untyped_defs=True +strict_equality=True [mypy-pandas.tests.*] check_untyped_defs=False @@ -151,15 +153,9 @@ ignore_errors=True [mypy-pandas._version] check_untyped_defs=False -[mypy-pandas.core.arrays.boolean] -check_untyped_defs=False - [mypy-pandas.core.arrays.categorical] check_untyped_defs=False -[mypy-pandas.core.arrays.integer] -check_untyped_defs=False - [mypy-pandas.core.arrays.interval] check_untyped_defs=False @@ -169,12 +165,6 @@ check_untyped_defs=False [mypy-pandas.core.base] check_untyped_defs=False -[mypy-pandas.core.computation.align] -check_untyped_defs=False - -[mypy-pandas.core.computation.eval] -check_untyped_defs=False - [mypy-pandas.core.computation.expr] check_untyped_defs=False @@ -190,15 +180,9 @@ check_untyped_defs=False [mypy-pandas.core.computation.scope] check_untyped_defs=False -[mypy-pandas.core.config_init] -check_untyped_defs=False - [mypy-pandas.core.dtypes.cast] check_untyped_defs=False -[mypy-pandas.core.dtypes.generic] -check_untyped_defs=False - [mypy-pandas.core.frame] check_untyped_defs=False @@ -217,9 +201,6 @@ check_untyped_defs=False [mypy-pandas.core.indexes.base] check_untyped_defs=False -[mypy-pandas.core.indexes.category] -check_untyped_defs=False - [mypy-pandas.core.indexes.datetimelike] check_untyped_defs=False @@ -232,9 +213,6 @@ check_untyped_defs=False [mypy-pandas.core.indexes.multi] check_untyped_defs=False -[mypy-pandas.core.indexes.timedeltas] -check_untyped_defs=False - [mypy-pandas.core.indexing] check_untyped_defs=False @@ -268,9 +246,6 @@ check_untyped_defs=False [mypy-pandas.core.reshape.reshape] check_untyped_defs=False -[mypy-pandas.core.series] 
-check_untyped_defs=False - [mypy-pandas.core.strings] check_untyped_defs=False @@ -325,9 +300,6 @@ check_untyped_defs=False [mypy-pandas.io.json._json] check_untyped_defs=False -[mypy-pandas.io.json._normalize] -check_untyped_defs=False - [mypy-pandas.io.json._table_schema] check_untyped_defs=False @@ -346,9 +318,6 @@ check_untyped_defs=False [mypy-pandas.io.sas.sasreader] check_untyped_defs=False -[mypy-pandas.io.sql] -check_untyped_defs=False - [mypy-pandas.io.stata] check_untyped_defs=False @@ -361,14 +330,11 @@ check_untyped_defs=False [mypy-pandas.plotting._matplotlib.misc] check_untyped_defs=False -[mypy-pandas.plotting._matplotlib.timeseries] -check_untyped_defs=False - [mypy-pandas.tseries.holiday] check_untyped_defs=False [mypy-pandas.tseries.offsets] check_untyped_defs=False -[mypy-pandas.util.testing] +[mypy-pandas._testing] check_untyped_defs=False diff --git a/setup.py b/setup.py index c6b078dae280a..191fe49d1eb89 100755 --- a/setup.py +++ b/setup.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Parts of this file were taken from the pyzmq project @@ -49,11 +49,12 @@ def is_platform_mac(): try: import Cython - ver = Cython.__version__ + _CYTHON_VERSION = Cython.__version__ from Cython.Build import cythonize - _CYTHON_INSTALLED = ver >= LooseVersion(min_cython_ver) + _CYTHON_INSTALLED = _CYTHON_VERSION >= LooseVersion(min_cython_ver) except ImportError: + _CYTHON_VERSION = None _CYTHON_INSTALLED = False cythonize = lambda x, *args, **kwargs: x # dummy func @@ -355,7 +356,7 @@ def run(self): sourcefile = pyxfile[:-3] + extension msg = ( f"{extension}-source file '{sourcefile}' not found.\n" - f"Run 'setup.py cython' before sdist." + "Run 'setup.py cython' before sdist." ) assert os.path.isfile(sourcefile), msg sdist_class.run(self) @@ -411,15 +412,14 @@ def run(self): cmdclass.update({"clean": CleanCommand, "build": build}) +cmdclass["build_ext"] = CheckingBuildExt if cython: suffix = ".pyx" - cmdclass["build_ext"] = CheckingBuildExt cmdclass["cython"] = CythonCommand else: suffix = ".c" cmdclass["build_src"] = DummyBuildSrc - cmdclass["build_ext"] = CheckingBuildExt # ---------------------------------------------------------------------- # Preparation of compiler arguments @@ -504,13 +504,22 @@ def maybe_cythonize(extensions, *args, **kwargs): # See https://github.com/cython/cython/issues/1495 return extensions + elif not cython: + # GH#28836 raise a helfpul error message + if _CYTHON_VERSION: + raise RuntimeError( + f"Cannot cythonize with old Cython version ({_CYTHON_VERSION} " + f"installed, needs {min_cython_ver})" + ) + raise RuntimeError("Cannot cythonize without Cython installed.") + numpy_incl = pkg_resources.resource_filename("numpy", "core/include") # TODO: Is this really necessary here? 
for ext in extensions: if hasattr(ext, "include_dirs") and numpy_incl not in ext.include_dirs: ext.include_dirs.append(numpy_incl) - # reuse any parallel arguments provided for compliation to cythonize + # reuse any parallel arguments provided for compilation to cythonize parser = argparse.ArgumentParser() parser.add_argument("-j", type=int) parser.add_argument("--parallel", type=int) @@ -522,6 +531,11 @@ def maybe_cythonize(extensions, *args, **kwargs): elif parsed.j: nthreads = parsed.j + # GH#30356 Cythonize doesn't support parallel on Windows + if is_platform_windows() and nthreads > 0: + print("Parallel build for cythonize ignored on Windows") + nthreads = 0 + kwargs["nthreads"] = nthreads build_ext.render_templates(_pxifiles) return cythonize(extensions, *args, **kwargs) @@ -587,6 +601,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): }, "_libs.reduction": {"pyxfile": "_libs/reduction"}, "_libs.ops": {"pyxfile": "_libs/ops"}, + "_libs.ops_dispatch": {"pyxfile": "_libs/ops_dispatch"}, "_libs.properties": {"pyxfile": "_libs/properties"}, "_libs.reshape": {"pyxfile": "_libs/reshape", "depends": []}, "_libs.sparse": {"pyxfile": "_libs/sparse", "depends": _pxi_dep["sparse"]}, diff --git a/web/pandas/_templates/layout.html b/web/pandas/_templates/layout.html index 120058afd1190..92126a7b5a2f2 100644 --- a/web/pandas/_templates/layout.html +++ b/web/pandas/_templates/layout.html @@ -84,11 +84,6 @@ -
  • - - - -
  • pandas is a fiscally sponsored project of NumFOCUS diff --git a/web/pandas/about/roadmap.md b/web/pandas/about/roadmap.md index 8a5c2735b3d93..35a6b3361f32e 100644 --- a/web/pandas/about/roadmap.md +++ b/web/pandas/about/roadmap.md @@ -134,19 +134,6 @@ pandas documentation. Some specific goals include subsections of the documentation to make navigation and finding content easier. -## Package docstring validation - -To improve the quality and consistency of pandas docstrings, we've -developed tooling to check docstrings in a variety of ways. - -contains the checks. - -Like many other projects, pandas uses the -[numpydoc](https://numpydoc.readthedocs.io/en/latest/) style for writing -docstrings. With the collaboration of the numpydoc maintainers, we'd -like to move the checks to a package other than pandas so that other -projects can easily use them as well. - ## Performance monitoring Pandas uses [airspeed velocity](https://asv.readthedocs.io/en/stable/) diff --git a/web/pandas/about/sponsors.md b/web/pandas/about/sponsors.md index dcc6e367e5d64..4473a16cfd590 100644 --- a/web/pandas/about/sponsors.md +++ b/web/pandas/about/sponsors.md @@ -11,31 +11,50 @@ health and sustainability of the project. Visit numfocus.org for more informatio Donations to _pandas_ are managed by NumFOCUS. For donors in the United States, your gift is tax-deductible to the extent provided by law. As with any donation, you should consult with your tax adviser about your particular tax situation. -## Tidelift +## Become a sponsor -_pandas_ is part of the [Tidelift subscription](https://tidelift.com/subscription/pkg/pypi-pandas?utm_source=pypi-pandas&utm_medium=referral&utm_campaign=readme). -You can support pandas by becoming a Tidelift subscriber. +As a free and open source project, _pandas_ relies on the support of the community of users for its development. +If you work for an organization that uses and benefits from _pandas_, please consider supporting pandas. There +are different ways, such as employing people to work on pandas, funding the project, or becoming a +[NumFOCUS sponsor](https://numfocus.org/sponsors) to support the broader ecosystem. Please contact us at +[admin@numfocus.org](mailto:admin@numfocus.org) to discuss. ## Institutional partners -Institutional Partners are companies and universities that support the project by employing contributors. -Current Institutional Partners include: +Institutional partners are companies and universities that support the project by employing contributors. +Current institutional partners include:

      - {% for company in partners.active if company.employs %} -
    • {{ company.name }} ({{ company.employs }})
    • + {% for company in sponsors.active if company.kind == "partner" %} +
    • {{ company.name }}: {{ company.description }}
    • + {% endfor %} +
    + +## Sponsors + +Sponsors are organizations that provide funding for pandas. Current sponsors include: + +
      + {% for company in sponsors.active if company.kind == "regular" %} +
    • {{ company.name }}: {{ company.description }}
    • {% endfor %}
    ## In-kind sponsors -- [OVH](https://us.ovhcloud.com/): Hosting -- [Indeed](https://opensource.indeedeng.io/): Logo and website design +In-kind sponsors are organizations that support pandas development with goods or services. +Current in-kind sponsors include: + +
      + {% for company in sponsors.inkind %} +
    • {{ company.name }}: {{ company.description }}
    • + {% endfor %} +
    ## Past institutional partners
      - {% for company in partners.past %} + {% for company in sponsors.past if company.kind == "partner" %}
    • {{ company.name }}
    • {% endfor %}
    diff --git a/web/pandas/community/blog.html b/web/pandas/community/blog.html index ffe6f97d679e4..627aaa450893b 100644 --- a/web/pandas/community/blog.html +++ b/web/pandas/community/blog.html @@ -4,10 +4,10 @@ {% for post in blog.posts %}
    -

    {{ post.title }}

    -
    Source: {{ post.feed }} | Author: {{ post.author }} | Published: {{ post.published.strftime("%b %d, %Y") }}
    -
    {{ post.summary }}
    - Read +
    {{ post.title }}
    +
    Source: {{ post.feed }} | Author: {{ post.author }} | Published: {{ post.published.strftime("%b %d, %Y") }}
    +
    {{ post.summary }}
    + Read more
    {% endfor %} diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index af6fd1ac77605..a707854c6ed2c 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -84,19 +84,16 @@ pandas with the option to perform statistical estimation while plotting, aggregating across observations and visualizing the fit of statistical models to emphasize patterns in a dataset. -### [yhat/ggpy](https://github.com/yhat/ggpy) +### [plotnine](https://github.com/has2k1/plotnine/) Hadley Wickham's [ggplot2](https://ggplot2.tidyverse.org/) is a foundational exploratory visualization package for the R language. Based on ["The Grammar of Graphics"](https://www.cs.uic.edu/~wilkinson/TheGrammarOfGraphics/GOG.html) it provides a powerful, declarative and extremely general way to -generate bespoke plots of any kind of data. It's really quite -incredible. Various implementations to other languages are available, -but a faithful implementation for Python users has long been missing. -Although still young (as of Jan-2014), the -[yhat/ggpy](https://github.com/yhat/ggpy) project has been progressing -quickly in that direction. +generate bespoke plots of any kind of data. +Various implementations to other languages are available. +A good implementation for Python users is [has2k1/plotnine](https://github.com/has2k1/plotnine/). ### [IPython Vega](https://github.com/vega/ipyvega) diff --git a/web/pandas/config.yml b/web/pandas/config.yml index e2a95a5039884..d041d6dd2ac95 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -50,8 +50,6 @@ navbar: target: /community/blog.html - name: "Ask a question (StackOverflow)" target: https://stackoverflow.com/questions/tagged/pandas - - name: "Discuss" - target: https://pandas.discourse.group - name: "Code of conduct" target: /community/coc.html - name: "Ecosystem" @@ -101,30 +99,50 @@ maintainers: - Wes McKinney - Jeff Reback - Joris Van den Bossche -partners: +sponsors: active: - name: "NumFOCUS" url: https://numfocus.org/ logo: /static/img/partners/numfocus.svg + kind: numfocus - name: "Anaconda" url: https://www.anaconda.com/ logo: /static/img/partners/anaconda.svg - employs: "Tom Augspurger, Brock Mendel" + kind: partner + description: "Tom Augspurger, Brock Mendel" - name: "Two Sigma" url: https://www.twosigma.com/ logo: /static/img/partners/two_sigma.svg - employs: "Phillip Cloud, Jeff Reback" + kind: partner + description: "Phillip Cloud, Jeff Reback" - name: "RStudio" url: https://www.rstudio.com/ logo: /static/img/partners/r_studio.svg - employs: "Wes McKinney" + kind: partner + description: "Wes McKinney" - name: "Ursa Labs" url: https://ursalabs.org/ logo: /static/img/partners/ursa_labs.svg - employs: "Wes McKinney, Joris Van den Bossche" + kind: partner + description: "Wes McKinney, Joris Van den Bossche" - name: "Tidelift" url: https://tidelift.com logo: /static/img/partners/tidelift.svg + kind: regular + description: "pandas is part of the Tidelift subscription. You can support pandas by becoming a Tidelift subscriber." + - name: "Chan Zuckerberg Initiative" + url: https://chanzuckerberg.com/ + logo: /static/img/partners/czi.svg + kind: regular + description: "pandas is funded by the Essential Open Source Software for Science program of the Chan Zuckerberg Initiative. The funding is used for general maintainance, improve extension types, and a efficient string type." 
+ inkind: # not included in active so they don't appear in the home page + - name: "OVH" + url: https://us.ovhcloud.com/ + description: "Website and documentation hosting." + - name: "Indeed" + url: https://opensource.indeedeng.io/ + description: "pandas logo design" past: - name: "Paris-Saclay Center for Data Science" url: https://www.datascience-paris-saclay.fr/ + kind: partner diff --git a/web/pandas/index.html b/web/pandas/index.html index df6e5ab9a330b..0f4598add4efc 100644 --- a/web/pandas/index.html +++ b/web/pandas/index.html @@ -35,7 +35,7 @@
    Documentation
    Community
    @@ -43,15 +43,20 @@
    Community
    With the support of:
    -
    - {% for company in partners.active %} -
    - - {{ company.name }} - -
    - {% endfor %} -
    + {% for row in sponsors.active | batch(6, "") %} +
    + {% for company in row %} +
    + {% if company %} + + {{ company.name }} + + {% endif %} +
    + {% endfor %} +
    + {% endfor %} +

    The full list of companies supporting pandas is available in the sponsors page.

    diff --git a/web/pandas/static/img/partners/czi.svg b/web/pandas/static/img/partners/czi.svg new file mode 100644 index 0000000000000..b0ad9eb80580b --- /dev/null +++ b/web/pandas/static/img/partners/czi.svg @@ -0,0 +1,38 @@ + + + + Group + Created with Sketch. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/web/pandas_web.py b/web/pandas_web.py old mode 100644 new mode 100755 index d515d8a0e1cd7..a34a31feabce0 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Simple static site generator for the pandas web. @@ -28,14 +28,15 @@ import importlib import operator import os +import re import shutil import sys import time import typing import feedparser -import markdown import jinja2 +import markdown import requests import yaml @@ -74,6 +75,7 @@ def blog_add_posts(context): preprocessor fetches the posts in the feeds, and returns the relevant information for them (sorted from newest to oldest). """ + tag_expr = re.compile("<.*?>") posts = [] for feed_url in context["blog"]["feed"]: feed_data = feedparser.parse(feed_url) @@ -81,6 +83,7 @@ def blog_add_posts(context): published = datetime.datetime.fromtimestamp( time.mktime(entry.published_parsed) ) + summary = re.sub(tag_expr, "", entry.summary) posts.append( { "title": entry.title, @@ -89,7 +92,7 @@ def blog_add_posts(context): "feed": feed_data["feed"]["title"], "link": entry.link, "description": entry.description, - "summary": entry.summary, + "summary": summary, } ) posts.sort(key=operator.itemgetter("published"), reverse=True)
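For reference, the tag stripping that the pandas_web.py hunk above adds to blog_add_posts behaves roughly as follows (a minimal doctest-style sketch; the sample summary string is made up for illustration):

>>> import re
>>> tag_expr = re.compile("<.*?>")
>>> re.sub(tag_expr, "", '<p>pandas 1.0 adds a <a href="#">new string dtype</a>.</p>')
'pandas 1.0 adds a new string dtype.'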
[rendered-HTML residue from a truncated hunk; the only legible change is a link updated from http://pandas.pydata.org/?q1=a&q2=b to https://pandas.pydata.org/?q1=a&q2=b]
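As a closing illustration of the validate_docstrings.py refactor earlier in this patch, a minimal sketch of how the new pandas_validate entry point might be exercised; running it from the root of a pandas development checkout and the choice of pandas.DataFrame.head are assumptions made for the example:

import sys

sys.path.insert(0, "scripts")  # assumption: executed from a pandas checkout

import validate_docstrings

# pandas_validate() calls numpydoc's validate() and appends the pandas-specific
# checks kept in the script (private classes, `pandas.` prefixes in See Also,
# doctest and flake8 issues in the Examples section).
result = validate_docstrings.pandas_validate("pandas.DataFrame.head")
for err_code, err_desc in result["errors"]:
    print(f"{err_code}: {err_desc}")
if result["examples_errs"]:
    print(result["examples_errs"])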