diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000000000..e947f30d285cdb --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,147 @@ +version: 2 +jobs: + + # -------------------------------------------------------------------------- + # 0. py27_compat + # -------------------------------------------------------------------------- + py27_compat: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + environment: + JOB: "2.7_COMPAT" + ENV_FILE: "ci/circle-27-compat.yaml" + LOCALE_OVERRIDE: "it_IT.UTF-8" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --skip-slow --skip-network + + # -------------------------------------------------------------------------- + # 1. py36_locale + # -------------------------------------------------------------------------- + py36_locale: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + + environment: + JOB: "3.6_LOCALE" + ENV_FILE: "ci/circle-36-locale.yaml" + LOCALE_OVERRIDE: "zh_CN.UTF-8" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --skip-slow --skip-network + + # -------------------------------------------------------------------------- + # 2. py36_locale_slow + # -------------------------------------------------------------------------- + py36_locale_slow: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + + environment: + JOB: "3.6_LOCALE_SLOW" + ENV_FILE: "ci/circle-36-locale_slow.yaml" + LOCALE_OVERRIDE: "zh_CN.UTF-8" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --only-slow --skip-network + + # -------------------------------------------------------------------------- + # 3. 
py35_ascii + # -------------------------------------------------------------------------- + py35_ascii: + docker: + - image: continuumio/miniconda:latest + # databases configuration + - image: circleci/postgres:9.6.5-alpine-ram + environment: + POSTGRES_USER: postgres + POSTGRES_DB: pandas_nosetest + - image: circleci/mysql:8-ram + environment: + MYSQL_USER: "root" + MYSQL_HOST: "localhost" + MYSQL_ALLOW_EMPTY_PASSWORD: "true" + MYSQL_DATABASE: "pandas_nosetest" + + environment: + JOB: "3.5_ASCII" + ENV_FILE: "ci/circle-35-ascii.yaml" + LOCALE_OVERRIDE: "C" + MINICONDA_DIR: /home/ubuntu/miniconda3 + steps: + - checkout + - run: + name: build + command: | + ./ci/install_circle.sh + ./ci/show_circle.sh + - run: + name: test + command: ./ci/run_circle.sh --skip-slow --skip-network + + +workflows: + version: 2 + build_and_test: + jobs: + - py27_compat + - py36_locale + - py36_locale_slow + - py35_ascii diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index 3f630aa6cf8f56..00000000000000 --- a/.coveragerc +++ /dev/null @@ -1,27 +0,0 @@ -# .coveragerc to control coverage.py -[run] -branch = False -omit = */tests/* - -[report] -# Regexes for lines to exclude from consideration -exclude_lines = - # Have to re-enable the standard pragma - pragma: no cover - - # Don't complain about missing debug-only code: - def __repr__ - if self\.debug - - # Don't complain if tests don't hit defensive assertion code: - raise AssertionError - raise NotImplementedError - - # Don't complain if non-runnable code isn't run: - if 0: - if __name__ == .__main__.: - -ignore_errors = False - -[html] -directory = coverage_html_report diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md new file mode 100644 index 00000000000000..a1fbece3284ece --- /dev/null +++ b/.github/CODE_OF_CONDUCT.md @@ -0,0 +1,63 @@ +# Contributor Code of Conduct + +As contributors and maintainers of this project, and in the interest of +fostering an open and welcoming community, we pledge to respect all people who +contribute through reporting issues, posting feature requests, updating +documentation, submitting pull requests or patches, and other activities. + +We are committed to making participation in this project a harassment-free +experience for everyone, regardless of level of experience, gender, gender +identity and expression, sexual orientation, disability, personal appearance, +body size, race, ethnicity, age, religion, or nationality. + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery +* Personal attacks +* Trolling or insulting/derogatory comments +* Public or private harassment +* Publishing other's private information, such as physical or electronic + addresses, without explicit permission +* Other unethical or unprofessional conduct + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +By adopting this Code of Conduct, project maintainers commit themselves to +fairly and consistently applying these principles to every aspect of managing +this project. Project maintainers who do not follow or enforce the Code of +Conduct may be permanently removed from the project team. 
+ +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. + +A working group of community members is committed to promptly addressing any +reported issues. The working group is made up of pandas contributors and users. +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the working group by e-mail (pandas-coc@googlegroups.com). +Messages sent to this e-mail address will not be publicly visible but only to +the working group members. The working group currently includes + +- Safia Abdalla +- Tom Augspurger +- Joris Van den Bossche +- Camille Scott +- Nathaniel Smith + +All complaints will be reviewed and investigated and will result in a response +that is deemed necessary and appropriate to the circumstances. Maintainers are +obligated to maintain confidentiality with regard to the reporter of an +incident. + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 1.3.0, available at +[http://contributor-covenant.org/version/1/3/0/][version], +and the [Swift Code of Conduct][swift]. + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/3/0/ +[swift]: https://swift.org/community/#code-of-conduct + diff --git a/.gitignore b/.gitignore index ff0a6aef47163f..a59f2843c365ae 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,7 @@ .ipynb_checkpoints .tags .cache/ +.vscode/ # Compiled source # ################### @@ -60,6 +61,9 @@ dist .coverage coverage.xml coverage_html_report +*.pytest_cache +# hypothesis test database +.hypothesis/ # OS generated files # ###################### @@ -87,8 +91,8 @@ scikits *.c *.cpp -# Performance Testing # -####################### +# Unit / Performance Testing # +############################## asv_bench/env/ asv_bench/html/ asv_bench/results/ @@ -106,3 +110,5 @@ doc/build/html/index.html doc/tmp.sv doc/source/styled.xlsx doc/source/templates/ +env/ +doc/source/savefig/ diff --git a/.pep8speaks.yml b/.pep8speaks.yml index 299b76c8922cce..fda26d87bf7f67 100644 --- a/.pep8speaks.yml +++ b/.pep8speaks.yml @@ -6,5 +6,7 @@ scanner: pycodestyle: max-line-length: 79 ignore: # Errors and warnings to ignore - - E731 - - E402 + - E402, # module level import not at top of file + - E731, # do not assign a lambda expression, use a def + - E741, # do not use variables named 'l', 'O', or 'I' + - W503 # line break before binary operator diff --git a/.travis.yml b/.travis.yml index fe1a2950dbf081..76f4715a4abb2e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,67 +34,65 @@ matrix: - os: osx language: generic env: - - JOB="3.5_OSX" TEST_ARGS="--skip-slow --skip-network" + - JOB="3.5, OSX" ENV_FILE="ci/travis-35-osx.yaml" TEST_ARGS="--skip-slow --skip-network" + - dist: trusty env: - - JOB="2.7_LOCALE" LOCALE_OVERRIDE="zh_CN.UTF-8" SLOW=true - addons: - apt: - packages: - - language-pack-zh-hans + - JOB="3.7" ENV_FILE="ci/travis-37.yaml" TEST_ARGS="--skip-slow --skip-network" + - dist: trusty env: - - JOB="2.7" TEST_ARGS="--skip-slow" LINT=true + - JOB="2.7, locale, slow, old NumPy" ENV_FILE="ci/travis-27-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" SLOW=true addons: apt: packages: - - python-gtk2 + - language-pack-zh-hans - dist: trusty env: - - JOB="3.5" TEST_ARGS="--skip-slow --skip-network" COVERAGE=true + - JOB="2.7, lint" ENV_FILE="ci/travis-27.yaml" TEST_ARGS="--skip-slow" LINT=true addons: apt: packages: - - xsel - - dist: trusty - env: - - JOB="3.6" 
TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" CONDA_FORGE=true - # In allow_failures - - dist: trusty - env: - - JOB="2.7_SLOW" SLOW=true - # In allow_failures + - python-gtk2 - dist: trusty env: - - JOB="2.7_BUILD_TEST" TEST_ARGS="--skip-slow" BUILD_TEST=true + - JOB="3.6, coverage" ENV_FILE="ci/travis-36.yaml" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" COVERAGE=true DOCTEST=true # In allow_failures - dist: trusty env: - - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" + - JOB="3.6, slow" ENV_FILE="ci/travis-36-slow.yaml" SLOW=true # In allow_failures - dist: trusty env: - - JOB="3.6_DOC" DOC=true + - JOB="3.7, NumPy dev" ENV_FILE="ci/travis-37-numpydev.yaml" TEST_ARGS="--skip-slow --skip-network -W error" PANDAS_TESTING_MODE="deprecate" addons: apt: packages: - xsel + # In allow_failures + - dist: trusty + env: + - JOB="3.6, doc" ENV_FILE="ci/travis-36-doc.yaml" DOC=true allow_failures: - dist: trusty env: - - JOB="2.7_SLOW" SLOW=true - - dist: trusty - env: - - JOB="2.7_BUILD_TEST" TEST_ARGS="--skip-slow" BUILD_TEST=true + - JOB="3.6, slow" ENV_FILE="ci/travis-36-slow.yaml" SLOW=true - dist: trusty env: - - JOB="3.6_NUMPY_DEV" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" + - JOB="3.7, NumPy dev" ENV_FILE="ci/travis-37-numpydev.yaml" TEST_ARGS="--skip-slow --skip-network -W error" PANDAS_TESTING_MODE="deprecate" + addons: + apt: + packages: + - xsel - dist: trusty env: - - JOB="3.6_DOC" DOC=true + - JOB="3.6, doc" ENV_FILE="ci/travis-36-doc.yaml" DOC=true before_install: - echo "before_install" + # set non-blocking IO on travis + # https://github.com/travis-ci/travis-ci/issues/8920#issuecomment-352661024 + - python -c 'import os,sys,fcntl; flags = fcntl.fcntl(sys.stdout, fcntl.F_GETFL); fcntl.fcntl(sys.stdout, fcntl.F_SETFL, flags&~os.O_NONBLOCK);' - source ci/travis_process_gbq_encryption.sh - export PATH="$HOME/miniconda3/bin:$PATH" - df -h @@ -102,8 +100,6 @@ before_install: - uname -a - git --version - git tag - - ci/before_install_travis.sh - - export DISPLAY=":99.0" install: - echo "install start" @@ -114,6 +110,8 @@ install: before_script: - ci/install_db_travis.sh + - export DISPLAY=":99.0" + - ci/before_script_travis.sh script: - echo "script start" @@ -121,6 +119,7 @@ script: - ci/script_single.sh - ci/script_multi.sh - ci/lint.sh + - ci/doctests.sh - echo "checking imports" - source activate pandas && python ci/check_imports.py - echo "script done" diff --git a/LICENSES/XARRAY_LICENSE b/LICENSES/XARRAY_LICENSE new file mode 100644 index 00000000000000..37ec93a14fdcd0 --- /dev/null +++ b/LICENSES/XARRAY_LICENSE @@ -0,0 +1,191 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. 
+For the purposes of this definition, "control" means (i) the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising +permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +"Object" form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +"submitted" means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +3. Grant of Patent License. 
+ +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. + +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of +this License; and +You must cause any modified files to carry prominent notices stating that You +changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +6. Trademarks. 
+ +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +8. Limitation of Liability. + +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets "[]" replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same "printed page" as the copyright notice for easier identification within +third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/MANIFEST.in b/MANIFEST.in index 9773019c6e6e08..b417b8890fa245 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,27 +3,39 @@ include LICENSE include RELEASE.md include README.md include setup.py -include pyproject.toml graft doc prune doc/build +graft LICENSES + graft pandas -global-exclude *.so -global-exclude *.pyd +global-exclude *.bz2 +global-exclude *.csv +global-exclude *.dta +global-exclude *.gz +global-exclude *.h5 +global-exclude *.html +global-exclude *.json +global-exclude *.msgpack +global-exclude *.pickle +global-exclude *.png global-exclude *.pyc +global-exclude *.pyd +global-exclude *.sas7bdat +global-exclude *.so +global-exclude *.xls +global-exclude *.xlsm +global-exclude *.xlsx +global-exclude *.xpt +global-exclude *.xz +global-exclude *.zip global-exclude *~ -global-exclude \#* -global-exclude .git* global-exclude .DS_Store -global-exclude *.png +global-exclude .git* +global-exclude \#* -# include examples/data/* -# recursive-include examples *.py -# recursive-include doc/source * -# recursive-include doc/sphinxext * -# recursive-include LICENSES * include versioneer.py include pandas/_version.py include pandas/io/formats/templates/*.tpl diff --git a/Makefile b/Makefile index c79175cd3c4017..4a4aca21e1b784 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ build: clean_pyc python setup.py build_ext --inplace lint-diff: - git diff master --name-only -- "*.py" | grep "pandas" | xargs flake8 + git diff master --name-only -- "*.py" | grep -E "pandas|scripts" | xargs flake8 develop: build -python setup.py develop @@ -23,3 +23,4 @@ doc: cd doc; \ python make.py clean; \ python make.py html + python make.py spellcheck diff --git a/README.md b/README.md index ac043f55864984..3dde5e5e2a76e8 100644 --- a/README.md +++ b/README.md @@ -9,18 +9,33 @@
[README badge table hunk: the HTML markup is not recoverable. The hunk rewrites the badge cells for the rows Latest Release, Package Status, License, Build Status, Coverage, Conda default-channel downloads, and Conda-forge downloads; a second hunk (@@ -67,18 +78,18 @@) rewrites the PyPI downloads cell and the Gitter cell.]
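For orientation, each badge in this README table is an HTML `<td>` cell wrapping a linked image. A minimal sketch of that shape follows; the shields.io URL and alt text are illustrative assumptions, not the hunk's actual markup (only the pypi.org project URL appears elsewhere in this patch):

```html
<td>
  <!-- one badge row cell: link to the project page, image served by a badge host -->
  <a href="https://pypi.org/project/pandas/">
    <img src="https://img.shields.io/pypi/v/pandas.svg" alt="latest release" />
  </a>
</td>
```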
-[![https://gitter.im/pydata/pandas](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/pydata/pandas?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -## What is it + +## What is it? **pandas** is a Python package providing fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both @@ -147,7 +158,7 @@ The source code is currently hosted on GitHub at: https://github.com/pandas-dev/pandas Binary installers for the latest released version are available at the [Python -package index](https://pypi.python.org/pypi/pandas) and on conda. +package index](https://pypi.org/project/pandas) and on conda. ```sh # conda @@ -160,10 +171,9 @@ pip install pandas ``` ## Dependencies -- [NumPy](http://www.numpy.org): 1.7.0 or higher -- [python-dateutil](https://labix.org/python-dateutil): 1.5 or higher -- [pytz](https://pythonhosted.org/pytz) - - Needed for time zone support with ``pandas.date_range`` +- [NumPy](https://www.numpy.org): 1.9.0 or higher +- [python-dateutil](https://labix.org/python-dateutil): 2.5.0 or higher +- [pytz](https://pythonhosted.org/pytz): 2011k or higher See the [full installation instructions](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies) for recommended and optional dependencies. @@ -205,9 +215,6 @@ See the full instructions for [installing from source](https://pandas.pydata.org ## Documentation The official documentation is hosted on PyData.org: https://pandas.pydata.org/pandas-docs/stable -The Sphinx documentation should provide a good starting point for learning how -to use the library. Expect the docs to continue to expand as time goes on. - ## Background Work on ``pandas`` started at AQR (a quantitative hedge fund) in 2008 and has been under active development since then. @@ -220,12 +227,15 @@ Further, general questions and discussions can also take place on the [pydata ma ## Discussion and Development Most development discussion is taking place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions. -## Contributing to pandas +## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas) + All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome. A detailed overview on how to contribute can be found in the **[contributing guide.](https://pandas.pydata.org/pandas-docs/stable/contributing.html)** -If you are simply looking to start working with the pandas codebase, navigate to the [GitHub “issues” tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [Difficulty Novice](https://github.com/pandas-dev/pandas/issues?q=is%3Aopen+is%3Aissue+label%3A%22Difficulty+Novice%22) where you could start out. +If you are simply looking to start working with the pandas codebase, navigate to the [GitHub “issues” tab](https://github.com/pandas-dev/pandas/issues) and start looking through interesting issues. 
There are a number of issues listed under [Docs](https://github.com/pandas-dev/pandas/issues?labels=Docs&sort=updated&state=open) and [good first issue](https://github.com/pandas-dev/pandas/issues?labels=good+first+issue&sort=updated&state=open) where you could start out. + +You can also triage issues which may include reproducing bug reports, or asking for vital information such as version numbers or reproduction instructions. If you would like to start triaging issues, one easy way to get started is to [subscribe to pandas on CodeTriage](https://www.codetriage.com/pandas-dev/pandas). Or maybe through using pandas you have an idea of your own or are looking for something in the documentation and thinking ‘this can be improved’...you can do something about it! diff --git a/appveyor.yml b/appveyor.yml index a1f8886f6d068f..c6199c1493f22a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -11,20 +11,23 @@ matrix: environment: global: # SDK v7.0 MSVC Express 2008's SetEnv.cmd script will fail if the - # /E:ON and /V:ON options are not enabled in the batch script intepreter + # /E:ON and /V:ON options are not enabled in the batch script interpreter # See: http://stackoverflow.com/a/13751649/163740 CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\ci\\run_with_env.cmd" clone_folder: C:\projects\pandas + PANDAS_TESTING_MODE: "deprecate" matrix: - CONDA_ROOT: "C:\\Miniconda3_64" + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" CONDA_PY: "36" - CONDA_NPY: "112" + CONDA_NPY: "113" - CONDA_ROOT: "C:\\Miniconda3_64" + APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 PYTHON_VERSION: "2.7" PYTHON_ARCH: "64" CONDA_PY: "27" @@ -72,19 +75,12 @@ install: - cmd: conda info -a # create our env - - cmd: conda create -n pandas python=%PYTHON_VERSION% cython pytest>=3.1.0 pytest-xdist + - cmd: conda env create -q -n pandas --file=ci\appveyor-%CONDA_PY%.yaml - cmd: activate pandas - - cmd: pip install moto - - SET REQ=ci\requirements-%PYTHON_VERSION%_WIN.run - - cmd: echo "installing requirements from %REQ%" - - cmd: conda install -n pandas --file=%REQ% - cmd: conda list -n pandas - - cmd: echo "installing requirements from %REQ% - done" - - # add some pip only reqs to the env - - SET REQ=ci\requirements-%PYTHON_VERSION%_WIN.pip - - cmd: echo "installing requirements from %REQ%" - - cmd: pip install -Ur %REQ% + # uninstall pandas if it's present + - cmd: conda remove pandas -y --force & exit 0 + - cmd: pip uninstall -y pandas & exit 0 # build em using the local source checkout in the correct windows env - cmd: '%CMD_IN_ENV% python setup.py build_ext --inplace' diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 40cfec1bcd4c74..cccd38ef112519 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -1,7 +1,7 @@ +import warnings from importlib import import_module import numpy as np - import pandas as pd from pandas.util import testing as tm @@ -12,113 +12,117 @@ except: pass -class Algorithms(object): +from .pandas_vb_common import setup # noqa + + +class Factorize(object): + goal_time = 0.2 - def setup(self): - N = 100000 - np.random.seed(1234) + params = [True, False] + param_names = ['sort'] - self.int_unique = pd.Int64Index(np.arange(N * 5)) - # cache is_unique - self.int_unique.is_unique + def setup(self, sort): + N = 10**5 + self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) + self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + self.string_idx = tm.makeStringIndex(N) - self.int = 
pd.Int64Index(np.arange(N).repeat(5)) - self.float = pd.Float64Index(np.random.randn(N).repeat(5)) + def time_factorize_int(self, sort): + self.int_idx.factorize(sort=sort) - # Convenience naming. - self.checked_add = pd.core.algorithms.checked_add_with_arr + def time_factorize_float(self, sort): + self.float_idx.factorize(sort=sort) - self.arr = np.arange(1000000) - self.arrpos = np.arange(1000000) - self.arrneg = np.arange(-1000000, 0) - self.arrmixed = np.array([1, -1]).repeat(500000) - self.strings = tm.makeStringIndex(100000) + def time_factorize_string(self, sort): + self.string_idx.factorize(sort=sort) - self.arr_nan = np.random.choice([True, False], size=1000000) - self.arrmixed_nan = np.random.choice([True, False], size=1000000) - # match - self.uniques = tm.makeStringIndex(1000).values - self.all = self.uniques.repeat(10) +class Duplicated(object): - def time_factorize_string(self): - self.strings.factorize() + goal_time = 0.2 - def time_factorize_int(self): - self.int.factorize() + params = ['first', 'last', False] + param_names = ['keep'] - def time_factorize_float(self): - self.int.factorize() + def setup(self, keep): + N = 10**5 + self.int_idx = pd.Int64Index(np.arange(N).repeat(5)) + self.float_idx = pd.Float64Index(np.random.randn(N).repeat(5)) + self.string_idx = tm.makeStringIndex(N) - def time_duplicated_int_unique(self): - self.int_unique.duplicated() + def time_duplicated_int(self, keep): + self.int_idx.duplicated(keep=keep) - def time_duplicated_int(self): - self.int.duplicated() + def time_duplicated_float(self, keep): + self.float_idx.duplicated(keep=keep) - def time_duplicated_float(self): - self.float.duplicated() + def time_duplicated_string(self, keep): + self.string_idx.duplicated(keep=keep) - def time_match_strings(self): - pd.match(self.all, self.uniques) - def time_add_overflow_pos_scalar(self): - self.checked_add(self.arr, 1) +class DuplicatedUniqueIndex(object): - def time_add_overflow_neg_scalar(self): - self.checked_add(self.arr, -1) + goal_time = 0.2 - def time_add_overflow_zero_scalar(self): - self.checked_add(self.arr, 0) + def setup(self): + N = 10**5 + self.idx_int_dup = pd.Int64Index(np.arange(N * 5)) + # cache is_unique + self.idx_int_dup.is_unique - def time_add_overflow_pos_arr(self): - self.checked_add(self.arr, self.arrpos) + def time_duplicated_unique_int(self): + self.idx_int_dup.duplicated() - def time_add_overflow_neg_arr(self): - self.checked_add(self.arr, self.arrneg) - def time_add_overflow_mixed_arr(self): - self.checked_add(self.arr, self.arrmixed) +class Match(object): - def time_add_overflow_first_arg_nan(self): - self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan) + goal_time = 0.2 - def time_add_overflow_second_arg_nan(self): - self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan) + def setup(self): + self.uniques = tm.makeStringIndex(1000).values + self.all = self.uniques.repeat(10) - def time_add_overflow_both_arg_nan(self): - self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan, - b_mask=self.arrmixed_nan) + def time_match_string(self): + with warnings.catch_warnings(record=True): + pd.match(self.all, self.uniques) class Hashing(object): + goal_time = 0.2 - def setup(self): - N = 100000 - - self.df = pd.DataFrame( - {'A': pd.Series(tm.makeStringIndex(100).take( - np.random.randint(0, 100, size=N))), - 'B': pd.Series(tm.makeStringIndex(10000).take( - np.random.randint(0, 10000, size=N))), - 'D': np.random.randn(N), - 'E': np.arange(N), - 'F': pd.date_range('20110101', freq='s', periods=N), - 
'G': pd.timedelta_range('1 day', freq='s', periods=N), - }) - self.df['C'] = self.df['B'].astype('category') - self.df.iloc[10:20] = np.nan - - def time_frame(self): - hashing.hash_pandas_object(self.df) - - def time_series_int(self): - hashing.hash_pandas_object(self.df.E) - - def time_series_string(self): - hashing.hash_pandas_object(self.df.B) - - def time_series_categorical(self): - hashing.hash_pandas_object(self.df.C) + def setup_cache(self): + N = 10**5 + + df = pd.DataFrame( + {'strings': pd.Series(tm.makeStringIndex(10000).take( + np.random.randint(0, 10000, size=N))), + 'floats': np.random.randn(N), + 'ints': np.arange(N), + 'dates': pd.date_range('20110101', freq='s', periods=N), + 'timedeltas': pd.timedelta_range('1 day', freq='s', periods=N)}) + df['categories'] = df['strings'].astype('category') + df.iloc[10:20] = np.nan + return df + + def time_frame(self, df): + hashing.hash_pandas_object(df) + + def time_series_int(self, df): + hashing.hash_pandas_object(df['ints']) + + def time_series_string(self, df): + hashing.hash_pandas_object(df['strings']) + + def time_series_float(self, df): + hashing.hash_pandas_object(df['floats']) + + def time_series_categorical(self, df): + hashing.hash_pandas_object(df['categories']) + + def time_series_timedeltas(self, df): + hashing.hash_pandas_object(df['timedeltas']) + + def time_series_dates(self, df): + hashing.hash_pandas_object(df['dates']) diff --git a/asv_bench/benchmarks/attrs_caching.py b/asv_bench/benchmarks/attrs_caching.py index b7610037bed4d9..48f0b7d71144c1 100644 --- a/asv_bench/benchmarks/attrs_caching.py +++ b/asv_bench/benchmarks/attrs_caching.py @@ -1,12 +1,15 @@ -from .pandas_vb_common import * - +import numpy as np +from pandas import DataFrame try: from pandas.util import cache_readonly except ImportError: from pandas.util.decorators import cache_readonly +from .pandas_vb_common import setup # noqa + class DataFrameAttributes(object): + goal_time = 0.2 def setup(self): @@ -21,6 +24,7 @@ def time_set_index(self): class CacheReadonly(object): + goal_time = 0.2 def setup(self): diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index 0ca21b929ea17c..cc8766e1fa39cc 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -1,11 +1,16 @@ -from .pandas_vb_common import * +import numpy as np +from pandas import DataFrame, Series, date_range +from pandas.core.algorithms import checked_add_with_arr try: import pandas.core.computation.expressions as expr except ImportError: import pandas.computation.expressions as expr +from .pandas_vb_common import setup # noqa + class Ops(object): + goal_time = 0.2 params = [[True, False], ['default', 1]] @@ -20,18 +25,17 @@ def setup(self, use_numexpr, threads): if not use_numexpr: expr.set_use_numexpr(False) - def time_frame_add(self, use_numexpr, threads): - (self.df + self.df2) + self.df + self.df2 def time_frame_mult(self, use_numexpr, threads): - (self.df * self.df2) + self.df * self.df2 def time_frame_multi_and(self, use_numexpr, threads): - self.df[((self.df > 0) & (self.df2 > 0))] + self.df[(self.df > 0) & (self.df2 > 0)] def time_frame_comparison(self, use_numexpr, threads): - (self.df > self.df2) + self.df > self.df2 def teardown(self, use_numexpr, threads): expr.set_use_numexpr(True) @@ -39,75 +43,109 @@ def teardown(self, use_numexpr, threads): class Ops2(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(1000, 1000)) - self.df2 = DataFrame(np.random.randn(1000, 1000)) + N = 10**3 + 
self.df = DataFrame(np.random.randn(N, N)) + self.df2 = DataFrame(np.random.randn(N, N)) - self.df_int = DataFrame( - np.random.random_integers(np.iinfo(np.int16).min, - np.iinfo(np.int16).max, - size=(1000, 1000))) - self.df2_int = DataFrame( - np.random.random_integers(np.iinfo(np.int16).min, - np.iinfo(np.int16).max, - size=(1000, 1000))) + self.df_int = DataFrame(np.random.randint(np.iinfo(np.int16).min, + np.iinfo(np.int16).max, + size=(N, N))) + self.df2_int = DataFrame(np.random.randint(np.iinfo(np.int16).min, + np.iinfo(np.int16).max, + size=(N, N))) - ## Division + # Division def time_frame_float_div(self): - (self.df // self.df2) + self.df // self.df2 def time_frame_float_div_by_zero(self): - (self.df / 0) + self.df / 0 def time_frame_float_floor_by_zero(self): - (self.df // 0) + self.df // 0 def time_frame_int_div_by_zero(self): - (self.df_int / 0) + self.df_int / 0 - ## Modulo + # Modulo def time_frame_int_mod(self): - (self.df / self.df2) + self.df_int % self.df2_int def time_frame_float_mod(self): - (self.df / self.df2) + self.df % self.df2 class Timeseries(object): + goal_time = 0.2 - def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.s = Series(date_range('20010101', periods=self.N, freq='T')) - self.ts = self.s[self.halfway] + params = [None, 'US/Eastern'] + param_names = ['tz'] - self.s2 = Series(date_range('20010101', periods=self.N, freq='s')) + def setup(self, tz): + N = 10**6 + halfway = (N // 2) - 1 + self.s = Series(date_range('20010101', periods=N, freq='T', tz=tz)) + self.ts = self.s[halfway] - def time_series_timestamp_compare(self): - (self.s <= self.ts) + self.s2 = Series(date_range('20010101', periods=N, freq='s', tz=tz)) - def time_timestamp_series_compare(self): - (self.ts >= self.s) + def time_series_timestamp_compare(self, tz): + self.s <= self.ts - def time_timestamp_ops_diff1(self): + def time_timestamp_series_compare(self, tz): + self.ts >= self.s + + def time_timestamp_ops_diff(self, tz): self.s2.diff() - def time_timestamp_ops_diff2(self): - (self.s - self.s.shift()) + def time_timestamp_ops_diff_with_shift(self, tz): + self.s - self.s.shift() + +class AddOverflowScalar(object): + goal_time = 0.2 -class TimeseriesTZ(Timeseries): + params = [1, -1, 0] + param_names = ['scalar'] + + def setup(self, scalar): + N = 10**6 + self.arr = np.arange(N) + + def time_add_overflow_scalar(self, scalar): + checked_add_with_arr(self.arr, scalar) - def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.s = Series(date_range('20010101', periods=self.N, freq='T', tz='US/Eastern')) - self.ts = self.s[self.halfway] - self.s2 = Series(date_range('20010101', periods=self.N, freq='s', tz='US/Eastern')) +class AddOverflowArray(object): + + goal_time = 0.2 + + def setup(self): + N = 10**6 + self.arr = np.arange(N) + self.arr_rev = np.arange(-N, 0) + self.arr_mixed = np.array([1, -1]).repeat(N / 2) + self.arr_nan_1 = np.random.choice([True, False], size=N) + self.arr_nan_2 = np.random.choice([True, False], size=N) + + def time_add_overflow_arr_rev(self): + checked_add_with_arr(self.arr, self.arr_rev) + + def time_add_overflow_arr_mask_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1) + + def time_add_overflow_b_mask_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, + b_mask=self.arr_nan_1) + + def time_add_overflow_both_arg_nan(self): + checked_add_with_arr(self.arr, self.arr_mixed, arr_mask=self.arr_nan_1, + b_mask=self.arr_nan_2) diff --git a/asv_bench/benchmarks/categoricals.py 
b/asv_bench/benchmarks/categoricals.py index a5bb5e790dec13..2a7717378c280b 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,4 +1,8 @@ -from .pandas_vb_common import * +import warnings + +import numpy as np +import pandas as pd +import pandas.util.testing as tm try: from pandas.api.types import union_categoricals except ImportError: @@ -7,108 +11,237 @@ except ImportError: pass +from .pandas_vb_common import setup # noqa + + +class Concat(object): -class Categoricals(object): goal_time = 0.2 def setup(self): - N = 100000 - self.s = pd.Series((list('aabbcd') * N)).astype('category') + N = 10**5 + self.s = pd.Series(list('aabbcd') * N).astype('category') + + self.a = pd.Categorical(list('aabbcd') * N) + self.b = pd.Categorical(list('bbcdjk') * N) + + def time_concat(self): + pd.concat([self.s, self.s]) + + def time_union(self): + union_categoricals([self.a, self.b]) + - self.a = pd.Categorical((list('aabbcd') * N)) - self.b = pd.Categorical((list('bbcdjk') * N)) +class Constructor(object): + + goal_time = 0.2 + def setup(self): + N = 10**5 self.categories = list('abcde') - self.cat_idx = Index(self.categories) + self.cat_idx = pd.Index(self.categories) self.values = np.tile(self.categories, N) self.codes = np.tile(range(len(self.categories)), N) - self.datetimes = pd.Series(pd.date_range( - '1995-01-01 00:00:00', periods=10000, freq='s')) + self.datetimes = pd.Series(pd.date_range('1995-01-01 00:00:00', + periods=N / 10, + freq='s')) + self.datetimes_with_nat = self.datetimes.copy() + self.datetimes_with_nat.iloc[-1] = pd.NaT self.values_some_nan = list(np.tile(self.categories + [np.nan], N)) self.values_all_nan = [np.nan] * len(self.values) + self.values_all_int8 = np.ones(N, 'int8') - def time_concat(self): - concat([self.s, self.s]) + def time_regular(self): + pd.Categorical(self.values, self.categories) - def time_union(self): - union_categoricals([self.a, self.b]) + def time_fastpath(self): + pd.Categorical(self.codes, self.cat_idx, fastpath=True) - def time_constructor_regular(self): - Categorical(self.values, self.categories) + def time_datetimes(self): + pd.Categorical(self.datetimes) - def time_constructor_fastpath(self): - Categorical(self.codes, self.cat_idx, fastpath=True) + def time_datetimes_with_nat(self): + pd.Categorical(self.datetimes_with_nat) - def time_constructor_datetimes(self): - Categorical(self.datetimes) + def time_with_nan(self): + pd.Categorical(self.values_some_nan) - def time_constructor_datetimes_with_nat(self): - t = self.datetimes - t.iloc[-1] = pd.NaT - Categorical(t) + def time_all_nan(self): + pd.Categorical(self.values_all_nan) - def time_constructor_with_nan(self): - Categorical(self.values_some_nan) + def time_from_codes_all_int8(self): + pd.Categorical.from_codes(self.values_all_int8, self.categories) - def time_constructor_all_nan(self): - Categorical(self.values_all_nan) +class ValueCounts(object): -class Categoricals2(object): goal_time = 0.2 - def setup(self): - n = 500000 - np.random.seed(2718281) + params = [True, False] + param_names = ['dropna'] + + def setup(self, dropna): + n = 5 * 10**5 arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] - self.ts = Series(arr).astype('category') + self.ts = pd.Series(arr).astype('category') + + def time_value_counts(self, dropna): + self.ts.value_counts(dropna=dropna) + - self.sel = self.ts.loc[[0]] +class Repr(object): - def time_value_counts(self): - self.ts.value_counts(dropna=False) + goal_time = 0.2 - def time_value_counts_dropna(self): 
- self.ts.value_counts(dropna=True) + def setup(self): + self.sel = pd.Series(['s1234']).astype('category') def time_rendering(self): str(self.sel) + +class SetCategories(object): + + goal_time = 0.2 + + def setup(self): + n = 5 * 10**5 + arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] + self.ts = pd.Series(arr).astype('category') + def time_set_categories(self): self.ts.cat.set_categories(self.ts.cat.categories[::2]) -class Categoricals3(object): +class Rank(object): + goal_time = 0.2 def setup(self): - N = 100000 + N = 10**5 ncats = 100 - self.s1 = Series(np.array(tm.makeCategoricalIndex(N, ncats))) - self.s1_cat = self.s1.astype('category') - self.s1_cat_ordered = self.s1.astype('category', ordered=True) + self.s_str = pd.Series(tm.makeCategoricalIndex(N, ncats)).astype(str) + self.s_str_cat = self.s_str.astype('category') + with warnings.catch_warnings(record=True): + self.s_str_cat_ordered = self.s_str.astype('category', + ordered=True) - self.s2 = Series(np.random.randint(0, ncats, size=N)) - self.s2_cat = self.s2.astype('category') - self.s2_cat_ordered = self.s2.astype('category', ordered=True) + self.s_int = pd.Series(np.random.randint(0, ncats, size=N)) + self.s_int_cat = self.s_int.astype('category') + with warnings.catch_warnings(record=True): + self.s_int_cat_ordered = self.s_int.astype('category', + ordered=True) def time_rank_string(self): - self.s1.rank() + self.s_str.rank() def time_rank_string_cat(self): - self.s1_cat.rank() + self.s_str_cat.rank() def time_rank_string_cat_ordered(self): - self.s1_cat_ordered.rank() + self.s_str_cat_ordered.rank() def time_rank_int(self): - self.s2.rank() + self.s_int.rank() def time_rank_int_cat(self): - self.s2_cat.rank() + self.s_int_cat.rank() def time_rank_int_cat_ordered(self): - self.s2_cat_ordered.rank() + self.s_int_cat_ordered.rank() + + +class Isin(object): + + goal_time = 0.2 + + params = ['object', 'int64'] + param_names = ['dtype'] + + def setup(self, dtype): + np.random.seed(1234) + n = 5 * 10**5 + sample_size = 100 + arr = [i for i in np.random.randint(0, n // 10, size=n)] + if dtype == 'object': + arr = ['s%04d' % i for i in arr] + self.sample = np.random.choice(arr, sample_size) + self.series = pd.Series(arr).astype('category') + + def time_isin_categorical(self, dtype): + self.series.isin(self.sample) + + +class IsMonotonic(object): + + def setup(self): + N = 1000 + self.c = pd.CategoricalIndex(list('a' * N + 'b' * N + 'c' * N)) + self.s = pd.Series(self.c) + + def time_categorical_index_is_monotonic_increasing(self): + self.c.is_monotonic_increasing + + def time_categorical_index_is_monotonic_decreasing(self): + self.c.is_monotonic_decreasing + + def time_categorical_series_is_monotonic_increasing(self): + self.s.is_monotonic_increasing + + def time_categorical_series_is_monotonic_decreasing(self): + self.s.is_monotonic_decreasing + + +class Contains(object): + + goal_time = 0.2 + + def setup(self): + N = 10**5 + self.ci = tm.makeCategoricalIndex(N) + self.c = self.ci.values + self.key = self.ci.categories[0] + + def time_categorical_index_contains(self): + self.key in self.ci + + def time_categorical_contains(self): + self.key in self.c + + +class CategoricalSlicing(object): + + goal_time = 0.2 + params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic'] + param_names = ['index'] + + def setup(self, index): + N = 10**6 + values = list('a' * N + 'b' * N + 'c' * N) + indices = { + 'monotonic_incr': pd.Categorical(values), + 'monotonic_decr': pd.Categorical(reversed(values)), + 'non_monotonic': 
pd.Categorical(list('abc' * N))} + self.data = indices[index] + + self.scalar = 10000 + self.list = list(range(10000)) + self.cat_scalar = 'b' + + def time_getitem_scalar(self, index): + self.data[self.scalar] + + def time_getitem_slice(self, index): + self.data[:self.scalar] + + def time_getitem_list_like(self, index): + self.data[[self.scalar]] + + def time_getitem_list(self, index): + self.data[self.list] + + def time_getitem_bool_array(self, index): + self.data[self.data == self.cat_scalar] diff --git a/asv_bench/benchmarks/ctors.py b/asv_bench/benchmarks/ctors.py index b5694a3a21502b..3f9016787aab4b 100644 --- a/asv_bench/benchmarks/ctors.py +++ b/asv_bench/benchmarks/ctors.py @@ -1,30 +1,66 @@ -from .pandas_vb_common import * +import numpy as np +import pandas.util.testing as tm +from pandas import Series, Index, DatetimeIndex, Timestamp, MultiIndex +from .pandas_vb_common import setup # noqa + + +class SeriesConstructors(object): -class Constructors(object): goal_time = 0.2 - def setup(self): - self.arr = np.random.randn(100, 100) - self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object) + param_names = ["data_fmt", "with_index"] + params = [[lambda x: x, + list, + lambda arr: list(arr.astype(str)), + lambda arr: dict(zip(range(len(arr)), arr)), + lambda arr: [(i, -i) for i in arr], + lambda arr: [[i, -i] for i in arr], + lambda arr: ([(i, -i) for i in arr][:-1] + [None]), + lambda arr: ([[i, -i] for i in arr][:-1] + [None])], + [False, True]] + + def setup(self, data_fmt, with_index): + N = 10**4 + arr = np.random.randn(N) + self.data = data_fmt(arr) + self.index = np.arange(N) if with_index else None - self.data = np.random.randn(100) - self.index = Index(np.arange(100)) + def time_series_constructor(self, data_fmt, with_index): + Series(self.data, index=self.index) - self.s = Series(([Timestamp('20110101'), Timestamp('20120101'), - Timestamp('20130101')] * 1000)) - def time_frame_from_ndarray(self): - DataFrame(self.arr) +class SeriesDtypesConstructors(object): - def time_series_from_ndarray(self): - pd.Series(self.data, index=self.index) + goal_time = 0.2 + + def setup(self): + N = 10**4 + self.arr = np.random.randn(N, N) + self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object) + self.s = Series([Timestamp('20110101'), Timestamp('20120101'), + Timestamp('20130101')] * N * 10) def time_index_from_array_string(self): Index(self.arr_str) + def time_index_from_array_floats(self): + Index(self.arr) + def time_dtindex_from_series(self): DatetimeIndex(self.s) - def time_dtindex_from_series2(self): + def time_dtindex_from_index_with_series(self): Index(self.s) + + +class MultiIndexConstructor(object): + + goal_time = 0.2 + + def setup(self): + N = 10**4 + self.iterables = [tm.makeStringIndex(N), range(20)] + + def time_multiindex_from_iterables(self): + MultiIndex.from_product(self.iterables) diff --git a/asv_bench/benchmarks/eval.py b/asv_bench/benchmarks/eval.py index 6f33590ee9e33f..8e581dcf22b4c8 100644 --- a/asv_bench/benchmarks/eval.py +++ b/asv_bench/benchmarks/eval.py @@ -1,70 +1,67 @@ -from .pandas_vb_common import * +import numpy as np import pandas as pd try: import pandas.core.computation.expressions as expr except ImportError: import pandas.computation.expressions as expr +from .pandas_vb_common import setup # noqa + class Eval(object): + goal_time = 0.2 params = [['numexpr', 'python'], [1, 'all']] param_names = ['engine', 'threads'] def setup(self, engine, threads): - self.df = DataFrame(np.random.randn(20000, 100)) - self.df2 = DataFrame(np.random.randn(20000, 
100)) - self.df3 = DataFrame(np.random.randn(20000, 100)) - self.df4 = DataFrame(np.random.randn(20000, 100)) + self.df = pd.DataFrame(np.random.randn(20000, 100)) + self.df2 = pd.DataFrame(np.random.randn(20000, 100)) + self.df3 = pd.DataFrame(np.random.randn(20000, 100)) + self.df4 = pd.DataFrame(np.random.randn(20000, 100)) if threads == 1: expr.set_numexpr_threads(1) def time_add(self, engine, threads): - df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 - pd.eval('df + df2 + df3 + df4', engine=engine) + pd.eval('self.df + self.df2 + self.df3 + self.df4', engine=engine) def time_and(self, engine, threads): - df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 - pd.eval('(df > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)', engine=engine) + pd.eval('(self.df > 0) & (self.df2 > 0) & ' + '(self.df3 > 0) & (self.df4 > 0)', engine=engine) def time_chained_cmp(self, engine, threads): - df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 - pd.eval('df < df2 < df3 < df4', engine=engine) + pd.eval('self.df < self.df2 < self.df3 < self.df4', engine=engine) def time_mult(self, engine, threads): - df, df2, df3, df4 = self.df, self.df2, self.df3, self.df4 - pd.eval('df * df2 * df3 * df4', engine=engine) + pd.eval('self.df * self.df2 * self.df3 * self.df4', engine=engine) def teardown(self, engine, threads): expr.set_numexpr_threads() class Query(object): + goal_time = 0.2 def setup(self): - self.N = 1000000 - self.halfway = ((self.N // 2) - 1) - self.index = date_range('20010101', periods=self.N, freq='T') - self.s = Series(self.index) - self.ts = self.s.iloc[self.halfway] - self.df = DataFrame({'a': np.random.randn(self.N), }, index=self.index) - self.df2 = DataFrame({'dates': self.s.values,}) - - self.df3 = DataFrame({'a': np.random.randn(self.N),}) - self.min_val = self.df3['a'].min() - self.max_val = self.df3['a'].max() + N = 10**6 + halfway = (N // 2) - 1 + index = pd.date_range('20010101', periods=N, freq='T') + s = pd.Series(index) + self.ts = s.iloc[halfway] + self.df = pd.DataFrame({'a': np.random.randn(N), 'dates': s}, + index=index) + data = np.random.randn(N) + self.min_val = data.min() + self.max_val = data.max() def time_query_datetime_index(self): - ts = self.ts - self.df.query('index < @ts') + self.df.query('index < @self.ts') - def time_query_datetime_series(self): - ts = self.ts - self.df2.query('dates < @ts') + def time_query_datetime_column(self): + self.df.query('dates < @self.ts') def time_query_with_boolean_selection(self): - min_val, max_val = self.min_val, self.max_val - self.df.query('(a >= @min_val) & (a <= @max_val)') + self.df.query('(a >= @self.min_val) & (a <= @self.max_val)') diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index dec4fcba0eb5e6..9def910df0babd 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ b/asv_bench/benchmarks/frame_ctor.py @@ -1,138 +1,101 @@ -from .pandas_vb_common import * +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, Series, MultiIndex, Timestamp, date_range try: - from pandas.tseries.offsets import * -except: - from pandas.core.datetools import * + from pandas.tseries.offsets import Nano, Hour +except ImportError: + # For compatibility with older versions + from pandas.core.datetools import * # noqa +from .pandas_vb_common import setup # noqa -#---------------------------------------------------------------------- -# Creation from nested dict class FromDicts(object): + goal_time = 0.2 def setup(self): - (N, K) = (5000, 50) + N, K = 5000, 50 self.index 
= tm.makeStringIndex(N) self.columns = tm.makeStringIndex(K) - self.frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns) - try: - self.data = self.frame.to_dict() - except: - self.data = self.frame.toDict() - self.some_dict = list(self.data.values())[0] - self.dict_list = [dict(zip(self.columns, row)) for row in self.frame.values] - - self.data2 = dict( - ((i, dict(((j, float(j)) for j in range(100)))) for i in - range(2000))) - - def time_frame_ctor_list_of_dict(self): + frame = DataFrame(np.random.randn(N, K), index=self.index, + columns=self.columns) + self.data = frame.to_dict() + self.dict_list = frame.to_dict(orient='records') + self.data2 = {i: {j: float(j) for j in range(100)} + for i in range(2000)} + + def time_list_of_dict(self): DataFrame(self.dict_list) - def time_frame_ctor_nested_dict(self): + def time_nested_dict(self): DataFrame(self.data) - def time_series_ctor_from_dict(self): - Series(self.some_dict) + def time_nested_dict_index(self): + DataFrame(self.data, index=self.index) + + def time_nested_dict_columns(self): + DataFrame(self.data, columns=self.columns) + + def time_nested_dict_index_columns(self): + DataFrame(self.data, index=self.index, columns=self.columns) - def time_frame_ctor_nested_dict_int64(self): + def time_nested_dict_int64(self): # nested dict, integer indexes, regression described in #621 - DataFrame(self.data) + DataFrame(self.data2) -# from a mi-series +class FromSeries(object): -class frame_from_series(object): goal_time = 0.2 def setup(self): - self.mi = MultiIndex.from_tuples([(x, y) for x in range(100) for y in range(100)]) - self.s = Series(randn(10000), index=self.mi) + mi = MultiIndex.from_product([range(100), range(100)]) + self.s = Series(np.random.randn(10000), index=mi) - def time_frame_from_mi_series(self): + def time_mi_series(self): DataFrame(self.s) -#---------------------------------------------------------------------- -# get_numeric_data +class FromDictwithTimestamp(object): -class frame_get_numeric_data(object): goal_time = 0.2 + params = [Nano(1), Hour(1)] + param_names = ['offset'] - def setup(self): - self.df = DataFrame(randn(10000, 25)) - self.df['foo'] = 'bar' - self.df['bar'] = 'baz' - self.df = self.df.consolidate() - - def time_frame_get_numeric_data(self): - self.df._get_numeric_data() - - -# ---------------------------------------------------------------------- -# From dict with DatetimeIndex with all offsets - -# dynamically generate benchmarks for every offset -# -# get_period_count & get_index_for_offset are there because blindly taking each -# offset times 1000 can easily go out of Timestamp bounds and raise errors. 
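# ----------------------------------------------------------------------
# For context: the pattern these rewritten benchmarks rely on. This is
# an illustrative sketch, not part of the diff -- the class and
# parameter names below are invented -- showing how an asv benchmark
# declares a parameter grid with ``params``/``param_names``. asv runs
# ``setup`` and every ``time_*`` method once per parameter combination,
# and raising NotImplementedError inside ``setup`` skips a combination
# instead of failing it, which replaces the old dynamically generated
# one-class-per-offset benchmarks.

import numpy as np
from pandas import DataFrame


class ExampleParameterizedBenchmark(object):

    goal_time = 0.2
    # two parameter axes -> asv benchmarks every (nrows, dtype) pair
    params = ([10**3, 10**5], ['int64', 'float64'])
    param_names = ['nrows', 'dtype']

    def setup(self, nrows, dtype):
        if dtype == 'float64' and nrows == 10**5:
            # purely to demonstrate the skip convention used throughout
            raise NotImplementedError
        self.arr = np.arange(nrows, dtype=dtype)

    def time_frame_ctor(self, nrows, dtype):
        DataFrame(self.arr)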
- - -def get_period_count(start_date, off): - ten_offsets_in_days = ((start_date + (off * 10)) - start_date).days - if (ten_offsets_in_days == 0): - return 1000 - else: - return min((9 * ((Timestamp.max - start_date).days // ten_offsets_in_days)), 1000) - - -def get_index_for_offset(off): - start_date = Timestamp('1/1/1900') - return date_range(start_date, periods=min(1000, get_period_count( - start_date, off)), freq=off) - - -all_offsets = offsets.__all__ -# extra cases -for off in ['FY5253', 'FY5253Quarter']: - all_offsets.pop(all_offsets.index(off)) - all_offsets.extend([off + '_1', off + '_2']) + def setup(self, offset): + N = 10**3 + np.random.seed(1234) + idx = date_range(Timestamp('1/1/1900'), freq=offset, periods=N) + df = DataFrame(np.random.randn(N, 10), index=idx) + self.d = df.to_dict() + def time_dict_with_timestamp_offsets(self, offset): + DataFrame(self.d) -class FrameConstructorDTIndexFromOffsets(object): - params = [all_offsets, [1, 2]] - param_names = ['offset', 'n_steps'] +class FromRecords(object): - offset_kwargs = {'WeekOfMonth': {'weekday': 1, 'week': 1}, - 'LastWeekOfMonth': {'weekday': 1, 'week': 1}, - 'FY5253': {'startingMonth': 1, 'weekday': 1}, - 'FY5253Quarter': {'qtr_with_extra_week': 1, 'startingMonth': 1, 'weekday': 1}} + goal_time = 0.2 + params = [None, 1000] + param_names = ['nrows'] - offset_extra_cases = {'FY5253': {'variation': ['nearest', 'last']}, - 'FY5253Quarter': {'variation': ['nearest', 'last']}} + def setup(self, nrows): + N = 100000 + self.gen = ((x, (x * 20), (x * 100)) for x in range(N)) - def setup(self, offset, n_steps): + def time_frame_from_records_generator(self, nrows): + # issue-6700 + self.df = DataFrame.from_records(self.gen, nrows=nrows) - extra = False - if offset.endswith("_", None, -1): - extra = int(offset[-1]) - offset = offset[:-2] - kwargs = {} - if offset in self.offset_kwargs: - kwargs = self.offset_kwargs[offset] +class FromNDArray(object): - if extra: - extras = self.offset_extra_cases[offset] - for extra_arg in extras: - kwargs[extra_arg] = extras[extra_arg][extra -1] + goal_time = 0.2 - offset = getattr(offsets, offset) - self.idx = get_index_for_offset(offset(n_steps, **kwargs)) - self.df = DataFrame(np.random.randn(len(self.idx), 10), index=self.idx) - self.d = dict([(col, self.df[col]) for col in self.df.columns]) + def setup(self): + N = 100000 + self.data = np.random.randn(N) - def time_frame_ctor(self, offset, n_steps): - DataFrame(self.d) + def time_frame_from_ndarray(self): + self.df = DataFrame(self.data) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index af72ca1e9a6ab6..1819cfa2725dbf 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,20 +1,43 @@ -from .pandas_vb_common import * import string +import warnings +import numpy as np +import pandas.util.testing as tm +from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, + isnull, NaT) -#---------------------------------------------------------------------- -# lookup +from .pandas_vb_common import setup # noqa + + +class GetNumericData(object): + + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(10000, 25)) + self.df['foo'] = 'bar' + self.df['bar'] = 'baz' + with warnings.catch_warnings(record=True): + self.df = self.df.consolidate() + + def time_frame_get_numeric_data(self): + self.df._get_numeric_data() + + +class Lookup(object): -class frame_fancy_lookup(object): goal_time = 0.2 def setup(self): - self.df = 
DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) + self.df = DataFrame(np.random.randn(10000, 8), + columns=list('abcdefgh')) self.df['foo'] = 'bar' self.row_labels = list(self.df.index[::10])[:900] - self.col_labels = (list(self.df.columns) * 100) - self.row_labels_all = np.array((list(self.df.index) * len(self.df.columns)), dtype='object') - self.col_labels_all = np.array((list(self.df.columns) * len(self.df.index)), dtype='object') + self.col_labels = list(self.df.columns) * 100 + self.row_labels_all = np.array( + list(self.df.index) * len(self.df.columns), dtype='object') + self.col_labels_all = np.array( + list(self.df.columns) * len(self.df.index), dtype='object') def time_frame_fancy_lookup(self): self.df.lookup(self.row_labels, self.col_labels) @@ -23,25 +46,20 @@ def time_frame_fancy_lookup_all(self): self.df.lookup(self.row_labels_all, self.col_labels_all) -#---------------------------------------------------------------------- -# reindex - class Reindex(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.idx = np.arange(4000, 7000) - + N = 10**3 + self.df = DataFrame(np.random.randn(N * 10, N)) + self.idx = np.arange(4 * N, 7 * N) self.df2 = DataFrame( - dict([(c, {0: randint(0, 2, 1000).astype(np.bool_), - 1: randint(0, 1000, 1000).astype( - np.int16), - 2: randint(0, 1000, 1000).astype( - np.int32), - 3: randint(0, 1000, 1000).astype( - np.int64),}[randint(0, 4)]) for c in - range(1000)])) + {c: {0: np.random.randint(0, 2, N).astype(np.bool_), + 1: np.random.randint(0, N, N).astype(np.int16), + 2: np.random.randint(0, N, N).astype(np.int32), + 3: np.random.randint(0, N, N).astype(np.int64)} + [np.random.randint(0, 4)] for c in range(N)}) def time_reindex_axis0(self): self.df.reindex(self.idx) @@ -53,81 +71,86 @@ def time_reindex_both_axes(self): self.df.reindex(index=self.idx, columns=self.idx) def time_reindex_both_axes_ix(self): - self.df.ix[(self.idx, self.idx)] + self.df.ix[self.idx, self.idx] def time_reindex_upcast(self): self.df2.reindex(np.random.permutation(range(1200))) -#---------------------------------------------------------------------- -# iteritems (monitor no-copying behaviour) - class Iteration(object): + goal_time = 0.2 def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(np.random.randn(50000, 10)) - self.df3 = pd.DataFrame(np.random.randn(1000,5000), - columns=['C'+str(c) for c in range(5000)]) + N = 1000 + self.df = DataFrame(np.random.randn(N * 10, N)) + self.df2 = DataFrame(np.random.randn(N * 50, 10)) + self.df3 = DataFrame(np.random.randn(N, 5 * N), + columns=['C' + str(c) for c in range(N * 5)]) - def f(self): + def time_iteritems(self): + # (monitor no-copying behaviour) if hasattr(self.df, '_item_cache'): self.df._item_cache.clear() - for (name, col) in self.df.iteritems(): - pass - - def g(self): - for (name, col) in self.df.iteritems(): + for name, col in self.df.iteritems(): pass - def time_iteritems(self): - self.f() - def time_iteritems_cached(self): - self.g() + for name, col in self.df.iteritems(): + pass def time_iteritems_indexing(self): - df = self.df3 - for col in df: - df[col] + for col in self.df3: + self.df3[col] def time_itertuples(self): for row in self.df2.itertuples(): pass + def time_iterrows(self): + for row in self.df.iterrows(): + pass + -#---------------------------------------------------------------------- -# to_string, to_html, repr +class ToString(object): -class Formatting(object): goal_time = 0.2 def setup(self): - self.df = 
DataFrame(randn(100, 10)) + self.df = DataFrame(np.random.randn(100, 10)) - self.nrows = 500 - self.df2 = DataFrame(randn(self.nrows, 10)) - self.df2[0] = period_range('2000', '2010', self.nrows) - self.df2[1] = range(self.nrows) + def time_to_string_floats(self): + self.df.to_string() - self.nrows = 10000 - self.data = randn(self.nrows, 10) - self.idx = MultiIndex.from_arrays(np.tile(randn(3, int(self.nrows / 100)), 100)) - self.df3 = DataFrame(self.data, index=self.idx) - self.idx = randn(self.nrows) - self.df4 = DataFrame(self.data, index=self.idx) - self.df_tall = pandas.DataFrame(np.random.randn(10000, 10)) +class ToHTML(object): - self.df_wide = pandas.DataFrame(np.random.randn(10, 10000)) + goal_time = 0.2 - def time_to_string_floats(self): - self.df.to_string() + def setup(self): + nrows = 500 + self.df2 = DataFrame(np.random.randn(nrows, 10)) + self.df2[0] = period_range('2000', periods=nrows) + self.df2[1] = range(nrows) def time_to_html_mixed(self): self.df2.to_html() + +class Repr(object): + + goal_time = 0.2 + + def setup(self): + nrows = 10000 + data = np.random.randn(nrows, 10) + arrays = np.tile(np.random.randn(3, int(nrows / 100)), 100) + idx = MultiIndex.from_arrays(arrays) + self.df3 = DataFrame(data, index=idx) + self.df4 = DataFrame(data, index=np.random.randn(nrows)) + self.df_tall = DataFrame(np.random.randn(nrows, 10)) + self.df_wide = DataFrame(np.random.randn(10, nrows)) + def time_html_repr_trunc_mi(self): self.df3._repr_html_() @@ -141,21 +164,16 @@ def time_frame_repr_wide(self): repr(self.df_wide) -#---------------------------------------------------------------------- -# nulls/masking - +class MaskBool(object): -## masking - -class frame_mask_bools(object): goal_time = 0.2 def setup(self): - self.data = np.random.randn(1000, 500) - self.df = DataFrame(self.data) - self.df = self.df.where((self.df > 0)) - self.bools = (self.df > 0) - self.mask = isnull(self.df) + data = np.random.randn(1000, 500) + df = DataFrame(data) + df = df.where(df > 0) + self.bools = df > 0 + self.mask = isnull(df) def time_frame_mask_bools(self): self.bools.mask(self.mask) @@ -164,31 +182,26 @@ def time_frame_mask_floats(self): self.bools.astype(float).mask(self.mask) -## isnull +class Isnull(object): -class FrameIsnull(object): goal_time = 0.2 def setup(self): - self.df_no_null = DataFrame(np.random.randn(1000, 1000)) - - np.random.seed(1234) - self.sample = np.array([np.nan, 1.0]) - self.data = np.random.choice(self.sample, (1000, 1000)) - self.df = DataFrame(self.data) - - np.random.seed(1234) - self.sample = np.array(list(string.ascii_lowercase) + - list(string.ascii_uppercase) + - list(string.whitespace)) - self.data = np.random.choice(self.sample, (1000, 1000)) - self.df_strings= DataFrame(self.data) - - np.random.seed(1234) - self.sample = np.array([NaT, np.nan, None, np.datetime64('NaT'), - np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd']) - self.data = np.random.choice(self.sample, (1000, 1000)) - self.df_obj = DataFrame(self.data) + N = 10**3 + self.df_no_null = DataFrame(np.random.randn(N, N)) + + sample = np.array([np.nan, 1.0]) + data = np.random.choice(sample, (N, N)) + self.df = DataFrame(data) + + sample = np.array(list(string.ascii_letters + string.whitespace)) + data = np.random.choice(sample, (N, N)) + self.df_strings = DataFrame(data) + + sample = np.array([NaT, np.nan, None, np.datetime64('NaT'), + np.timedelta64('NaT'), 0, 1, 2.0, '', 'abcd']) + data = np.random.choice(sample, (N, N)) + self.df_obj = DataFrame(data) def time_isnull_floats_no_null(self): 
isnull(self.df_no_null) @@ -203,92 +216,74 @@ def time_isnull_obj(self): isnull(self.df_obj) -# ---------------------------------------------------------------------- -# fillna in place - -class frame_fillna_inplace(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.df.values[::2] = np.nan - - def time_frame_fillna_inplace(self): - self.df.fillna(0, inplace=True) - - +class Fillna(object): -class frame_fillna_many_columns_pad(object): goal_time = 0.2 + params = ([True, False], ['pad', 'bfill']) + param_names = ['inplace', 'method'] - def setup(self): - self.values = np.random.randn(1000, 1000) - self.values[::2] = np.nan - self.df = DataFrame(self.values) - - def time_frame_fillna_many_columns_pad(self): - self.df.fillna(method='pad') + def setup(self, inplace, method): + values = np.random.randn(10000, 100) + values[::2] = np.nan + self.df = DataFrame(values) + def time_frame_fillna(self, inplace, method): + self.df.fillna(inplace=inplace, method=method) class Dropna(object): + goal_time = 0.2 + params = (['all', 'any'], [0, 1]) + param_names = ['how', 'axis'] - def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) + def setup(self, how, axis): + self.df = DataFrame(np.random.randn(10000, 1000)) self.df.ix[50:1000, 20:50] = np.nan self.df.ix[2000:3000] = np.nan self.df.ix[:, 60:70] = np.nan self.df_mixed = self.df.copy() self.df_mixed['foo'] = 'bar' - self.df_mi = self.df.copy() - self.df_mi.index = MultiIndex.from_tuples(self.df_mi.index.map((lambda x: (x, x)))) - self.df_mi.columns = MultiIndex.from_tuples(self.df_mi.columns.map((lambda x: (x, x)))) - - self.df_mixed_mi = self.df_mixed.copy() - self.df_mixed_mi.index = MultiIndex.from_tuples(self.df_mixed_mi.index.map((lambda x: (x, x)))) - self.df_mixed_mi.columns = MultiIndex.from_tuples(self.df_mixed_mi.columns.map((lambda x: (x, x)))) - - def time_dropna_axis0_all(self): - self.df.dropna(how='all', axis=0) + def time_dropna(self, how, axis): + self.df.dropna(how=how, axis=axis) - def time_dropna_axis0_any(self): - self.df.dropna(how='any', axis=0) + def time_dropna_axis_mixed_dtypes(self, how, axis): + self.df_mixed.dropna(how=how, axis=axis) - def time_dropna_axis1_all(self): - self.df.dropna(how='all', axis=1) - def time_dropna_axis1_any(self): - self.df.dropna(how='any', axis=1) +class Count(object): - def time_dropna_axis0_all_mixed_dtypes(self): - self.df_mixed.dropna(how='all', axis=0) - - def time_dropna_axis0_any_mixed_dtypes(self): - self.df_mixed.dropna(how='any', axis=0) - - def time_dropna_axis1_all_mixed_dtypes(self): - self.df_mixed.dropna(how='all', axis=1) + goal_time = 0.2 - def time_dropna_axis1_any_mixed_dtypes(self): - self.df_mixed.dropna(how='any', axis=1) + params = [0, 1] + param_names = ['axis'] - def time_count_level_axis0_multi(self): - self.df_mi.count(axis=0, level=1) + def setup(self, axis): + self.df = DataFrame(np.random.randn(10000, 1000)) + self.df.ix[50:1000, 20:50] = np.nan + self.df.ix[2000:3000] = np.nan + self.df.ix[:, 60:70] = np.nan + self.df_mixed = self.df.copy() + self.df_mixed['foo'] = 'bar' - def time_count_level_axis1_multi(self): - self.df_mi.count(axis=1, level=1) + self.df.index = MultiIndex.from_arrays([self.df.index, self.df.index]) + self.df.columns = MultiIndex.from_arrays([self.df.columns, + self.df.columns]) + self.df_mixed.index = MultiIndex.from_arrays([self.df_mixed.index, + self.df_mixed.index]) + self.df_mixed.columns = MultiIndex.from_arrays([self.df_mixed.columns, + 
self.df_mixed.columns])

-    def time_count_level_axis0_mixed_dtypes_multi(self):
-        self.df_mixed_mi.count(axis=0, level=1)
+    def time_count_level_multi(self, axis):
+        self.df.count(axis=axis, level=1)

-    def time_count_level_axis1_mixed_dtypes_multi(self):
-        self.df_mixed_mi.count(axis=1, level=1)
+    def time_count_level_mixed_dtypes_multi(self, axis):
+        self.df_mixed.count(axis=axis, level=1)


 class Apply(object):
+
     goal_time = 0.2

     def setup(self):
@@ -296,32 +291,29 @@ def setup(self):
         self.s = Series(np.arange(1028.0))
         self.df2 = DataFrame({i: self.s for i in range(1028)})
         self.df3 = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))

     def time_apply_user_func(self):
-        self.df2.apply((lambda x: np.corrcoef(x, self.s)[(0, 1)]))
+        self.df2.apply(lambda x: np.corrcoef(x, self.s)[(0, 1)])

     def time_apply_axis_1(self):
-        self.df.apply((lambda x: (x + 1)), axis=1)
+        self.df.apply(lambda x: x + 1, axis=1)

     def time_apply_lambda_mean(self):
-        self.df.apply((lambda x: x.mean()))
+        self.df.apply(lambda x: x.mean())

     def time_apply_np_mean(self):
         self.df.apply(np.mean)

     def time_apply_pass_thru(self):
-        self.df.apply((lambda x: x))
+        self.df.apply(lambda x: x)

     def time_apply_ref_by_name(self):
-        self.df3.apply((lambda x: (x['A'] + x['B'])), axis=1)
+        self.df3.apply(lambda x: x['A'] + x['B'], axis=1)


-#----------------------------------------------------------------------
-# dtypes
+class Dtypes(object):

-class frame_dtypes(object):
     goal_time = 0.2

     def setup(self):
@@ -330,331 +322,211 @@ def setup(self):

     def time_frame_dtypes(self):
         self.df.dtypes

-#----------------------------------------------------------------------
-# equals

 class Equals(object):
+
     goal_time = 0.2

     def setup(self):
-        self.float_df = DataFrame(np.random.randn(1000, 1000))
-        self.object_df = DataFrame(([(['foo'] * 1000)] * 1000))
-        self.nonunique_cols = self.object_df.copy()
-        self.nonunique_cols.columns = (['A'] * len(self.nonunique_cols.columns))
-        self.pairs = dict([(name, self.make_pair(frame)) for (name, frame) in (
-            ('float_df', self.float_df), ('object_df', self.object_df),
-            ('nonunique_cols', self.nonunique_cols))])
+        N = 10**3
+        self.float_df = DataFrame(np.random.randn(N, N))
+        self.float_df_nan = self.float_df.copy()
+        self.float_df_nan.iloc[-1, -1] = np.nan

-    def make_pair(self, frame):
-        self.df = frame
-        self.df2 = self.df.copy()
-        self.df2.ix[((-1), (-1))] = np.nan
-        return (self.df, self.df2)
+        self.object_df = DataFrame('foo', index=range(N), columns=range(N))
+        self.object_df_nan = self.object_df.copy()
+        self.object_df_nan.iloc[-1, -1] = np.nan

-    def test_equal(self, name):
-        (self.df, self.df2) = self.pairs[name]
-        return self.df.equals(self.df)
-
-    def test_unequal(self, name):
-        (self.df, self.df2) = self.pairs[name]
-        return self.df.equals(self.df2)
+        self.nonunique_cols = self.object_df.copy()
+        self.nonunique_cols.columns = ['A'] * len(self.nonunique_cols.columns)
+        self.nonunique_cols_nan = self.nonunique_cols.copy()
+        self.nonunique_cols_nan.iloc[-1, -1] = np.nan

     def time_frame_float_equal(self):
-        self.test_equal('float_df')
+        self.float_df.equals(self.float_df)

     def time_frame_float_unequal(self):
-        self.test_unequal('float_df')
+        self.float_df.equals(self.float_df_nan)

     def time_frame_nonunique_equal(self):
-        self.test_equal('nonunique_cols')
+        self.nonunique_cols.equals(self.nonunique_cols)

     def time_frame_nonunique_unequal(self):
-        self.test_unequal('nonunique_cols')
+        self.nonunique_cols.equals(self.nonunique_cols_nan)

     def time_frame_object_equal(self):
-        self.test_equal('object_df')
+
self.object_df.equals(self.object_df) def time_frame_object_unequal(self): - self.test_unequal('object_df') + self.object_df.equals(self.object_df_nan) class Interpolate(object): + goal_time = 0.2 + params = [None, 'infer'] + param_names = ['downcast'] - def setup(self): + def setup(self, downcast): + N = 10000 # this is the worst case, where every column has NaNs. - self.df = DataFrame(randn(10000, 100)) + self.df = DataFrame(np.random.randn(N, 100)) self.df.values[::2] = np.nan - self.df2 = DataFrame( - {'A': np.arange(0, 10000), 'B': np.random.randint(0, 100, 10000), - 'C': randn(10000), 'D': randn(10000),}) + self.df2 = DataFrame({'A': np.arange(0, N), + 'B': np.random.randint(0, 100, N), + 'C': np.random.randn(N), + 'D': np.random.randn(N)}) self.df2.loc[1::5, 'A'] = np.nan self.df2.loc[1::5, 'C'] = np.nan - def time_interpolate(self): - self.df.interpolate() - - def time_interpolate_some_good(self): - self.df2.interpolate() + def time_interpolate(self, downcast): + self.df.interpolate(downcast=downcast) - def time_interpolate_some_good_infer(self): - self.df2.interpolate(downcast='infer') + def time_interpolate_some_good(self, downcast): + self.df2.interpolate(downcast=downcast) class Shift(object): # frame shift speedup issue-5609 goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): + def setup(self, axis): self.df = DataFrame(np.random.rand(10000, 500)) - def time_shift_axis0(self): - self.df.shift(1, axis=0) - - def time_shift_axis_1(self): - self.df.shift(1, axis=1) - - -#----------------------------------------------------------------------------- -# from_records issue-6700 - -class frame_from_records_generator(object): - goal_time = 0.2 - - def get_data(self, n=100000): - return ((x, (x * 20), (x * 100)) for x in range(n)) - - def time_frame_from_records_generator(self): - self.df = DataFrame.from_records(self.get_data()) - - def time_frame_from_records_generator_nrows(self): - self.df = DataFrame.from_records(self.get_data(), nrows=1000) + def time_shift(self, axis): + self.df.shift(1, axis=axis) - -#----------------------------------------------------------------------------- -# nunique - -class frame_nunique(object): +class Nunique(object): def setup(self): - self.data = np.random.randn(10000, 1000) - self.df = DataFrame(self.data) + self.df = DataFrame(np.random.randn(10000, 1000)) def time_frame_nunique(self): self.df.nunique() +class Duplicated(object): -#----------------------------------------------------------------------------- -# duplicated - -class frame_duplicated(object): goal_time = 0.2 def setup(self): - self.n = (1 << 20) - self.t = date_range('2015-01-01', freq='S', periods=(self.n // 64)) - self.xs = np.random.randn((self.n // 64)).round(2) - self.df = DataFrame({'a': np.random.randint(((-1) << 8), (1 << 8), self.n), 'b': np.random.choice(self.t, self.n), 'c': np.random.choice(self.xs, self.n), }) - - self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)) + n = (1 << 20) + t = date_range('2015-01-01', freq='S', periods=(n // 64)) + xs = np.random.randn(n // 64).round(2) + self.df = DataFrame({'a': np.random.randint(-1 << 8, 1 << 8, n), + 'b': np.random.choice(t, n), + 'c': np.random.choice(xs, n)}) + self.df2 = DataFrame(np.random.randn(1000, 100).astype(str)).T def time_frame_duplicated(self): self.df.duplicated() def time_frame_duplicated_wide(self): - self.df2.T.duplicated() - - - - - + self.df2.duplicated() +class XS(object): - - - - - - - - - -class frame_xs_col(object): goal_time = 0.2 + params = [0, 1] + param_names = 
['axis'] - def setup(self): - self.df = DataFrame(randn(1, 100000)) - - def time_frame_xs_col(self): - self.df.xs(50000, axis=1) + def setup(self, axis): + self.N = 10**4 + self.df = DataFrame(np.random.randn(self.N, self.N)) + def time_frame_xs(self, axis): + self.df.xs(self.N / 2, axis=axis) -class frame_xs_row(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(100000, 1)) - def time_frame_xs_row(self): - self.df.xs(50000) +class SortValues(object): - -class frame_sort_index(object): goal_time = 0.2 + params = [True, False] + param_names = ['ascending'] - def setup(self): - self.df = DataFrame(randn(1000000, 2), columns=list('AB')) - - def time_frame_sort_index(self): - self.df.sort_index() - + def setup(self, ascending): + self.df = DataFrame(np.random.randn(1000000, 2), columns=list('AB')) -class frame_sort_index_by_columns(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) + def time_frame_sort_values(self, ascending): + self.df.sort_values(by='A', ascending=ascending) - def time_frame_sort_index_by_columns(self): - self.df.sort_index(by=['key1', 'key2']) +class SortIndexByColumns(object): -class frame_quantile_axis1(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(1000, 3), - columns=list('ABC')) + N = 10000 + K = 10 + self.df = DataFrame({'key1': tm.makeStringIndex(N).values.repeat(K), + 'key2': tm.makeStringIndex(N).values.repeat(K), + 'value': np.random.randn(N * K)}) - def time_frame_quantile_axis1(self): - self.df.quantile([0.1, 0.5], axis=1) + def time_frame_sort_values_by_columns(self): + self.df.sort_values(by=['key1', 'key2']) -#---------------------------------------------------------------------- -# boolean indexing +class Quantile(object): -class frame_boolean_row_select(object): goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - def setup(self): - self.df = DataFrame(randn(10000, 100)) - self.bool_arr = np.zeros(10000, dtype=bool) - self.bool_arr[:1000] = True - - def time_frame_boolean_row_select(self): - self.df[self.bool_arr] - -class frame_getitem_single_column(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(randn(10000, 1000)) - self.df2 = DataFrame(randn(3000, 1), columns=['A']) - self.df3 = DataFrame(randn(3000, 1)) - - def h(self): - for i in range(10000): - self.df2['A'] - - def j(self): - for i in range(10000): - self.df3[0] - - def time_frame_getitem_single_column(self): - self.h() + def setup(self, axis): + self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) - def time_frame_getitem_single_column2(self): - self.j() + def time_frame_quantile(self, axis): + self.df.quantile([0.1, 0.5], axis=axis) -#---------------------------------------------------------------------- -# assignment - -class frame_assign_timeseries_index(object): +class GetDtypeCounts(object): + # 2807 goal_time = 0.2 def setup(self): - self.idx = date_range('1/1/2000', periods=100000, freq='H') - self.df = DataFrame(randn(100000, 1), columns=['A'], index=self.idx) - - def time_frame_assign_timeseries_index(self): - self.f(self.df) + self.df = DataFrame(np.random.randn(10, 10000)) - def f(self, df): - self.x = self.df.copy() - self.x['date'] = self.x.index + def 
time_frame_get_dtype_counts(self): + self.df.get_dtype_counts() + def time_info(self): + self.df.info() -# insert many columns +class NSort(object): -class frame_insert_100_columns_begin(object): goal_time = 0.2 + params = ['first', 'last', 'all'] + param_names = ['keep'] - def setup(self): - self.N = 1000 - - def f(self, K=100): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df.insert(0, i, self.new_col) - - def g(self, K=500): - self.df = DataFrame(index=range(self.N)) - self.new_col = np.random.randn(self.N) - for i in range(K): - self.df[i] = self.new_col - - def time_frame_insert_100_columns_begin(self): - self.f() - - def time_frame_insert_500_columns_end(self): - self.g() - - - -#---------------------------------------------------------------------- -# strings methods, #2602 - -class series_string_vector_slice(object): - goal_time = 0.2 + def setup(self, keep): + self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) - def setup(self): - self.s = Series((['abcdefg', np.nan] * 500000)) + def time_nlargest(self, keep): + self.df.nlargest(100, 'A', keep=keep) - def time_series_string_vector_slice(self): - self.s.str[:5] + def time_nsmallest(self, keep): + self.df.nsmallest(100, 'A', keep=keep) -#---------------------------------------------------------------------- -# df.info() and get_dtype_counts() # 2807 +class Describe(object): -class frame_get_dtype_counts(object): goal_time = 0.2 def setup(self): - self.df = DataFrame(np.random.randn(10, 10000)) - - def time_frame_get_dtype_counts(self): - self.df.get_dtype_counts() + self.df = DataFrame({ + 'a': np.random.randint(0, 100, int(1e6)), + 'b': np.random.randint(0, 100, int(1e6)), + 'c': np.random.randint(0, 100, int(1e6)) + }) + def time_series_describe(self): + self.df['a'].describe() -class frame_nlargest(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(1000, 3), - columns=list('ABC')) - - def time_frame_nlargest(self): - self.df.nlargest(100, 'A') + def time_dataframe_describe(self): + self.df.describe() diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index 78a94976e732d1..21c1ccf46e1c44 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -1,241 +1,139 @@ -from .pandas_vb_common import * - +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, Series, read_csv, factorize, date_range from pandas.core.algorithms import take_1d - try: - from cStringIO import StringIO + from pandas import (rolling_median, rolling_mean, rolling_min, rolling_max, + rolling_var, rolling_skew, rolling_kurt, rolling_std) + have_rolling_methods = True except ImportError: - from io import StringIO - + have_rolling_methods = False try: from pandas._libs import algos except ImportError: from pandas import algos - try: from pandas.util.testing import test_parallel - have_real_test_parallel = True except ImportError: have_real_test_parallel = False - def test_parallel(num_threads=1): - def wrapper(fname): return fname - return wrapper +from .pandas_vb_common import BaseIO, setup # noqa -class NoGilGroupby(object): - goal_time = 0.2 - def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) +class ParallelGroupbyMethods(object): - np.random.seed(1234) - self.size = 2 ** 22 - self.ngroups = 100 - self.data = Series(np.random.randint(0, 
self.ngroups, size=self.size)) + goal_time = 0.2 + params = ([2, 4, 8], ['count', 'last', 'max', 'mean', 'min', 'prod', + 'sum', 'var']) + param_names = ['threads', 'method'] - if (not have_real_test_parallel): + def setup(self, threads, method): + if not have_real_test_parallel: raise NotImplementedError + N = 10**6 + ngroups = 10**3 + df = DataFrame({'key': np.random.randint(0, ngroups, size=N), + 'data': np.random.randn(N)}) - @test_parallel(num_threads=2) - def _pg2_count(self): - self.df.groupby('key')['data'].count() - - def time_count_2(self): - self._pg2_count() - - @test_parallel(num_threads=2) - def _pg2_last(self): - self.df.groupby('key')['data'].last() - - def time_last_2(self): - self._pg2_last() - - @test_parallel(num_threads=2) - def _pg2_max(self): - self.df.groupby('key')['data'].max() - - def time_max_2(self): - self._pg2_max() - - @test_parallel(num_threads=2) - def _pg2_mean(self): - self.df.groupby('key')['data'].mean() - - def time_mean_2(self): - self._pg2_mean() - - @test_parallel(num_threads=2) - def _pg2_min(self): - self.df.groupby('key')['data'].min() - - def time_min_2(self): - self._pg2_min() + @test_parallel(num_threads=threads) + def parallel(): + getattr(df.groupby('key')['data'], method)() + self.parallel = parallel - @test_parallel(num_threads=2) - def _pg2_prod(self): - self.df.groupby('key')['data'].prod() + def loop(): + getattr(df.groupby('key')['data'], method)() + self.loop = loop - def time_prod_2(self): - self._pg2_prod() + def time_parallel(self, threads, method): + self.parallel() - @test_parallel(num_threads=2) - def _pg2_sum(self): - self.df.groupby('key')['data'].sum() + def time_loop(self, threads, method): + for i in range(threads): + self.loop() - def time_sum_2(self): - self._pg2_sum() - @test_parallel(num_threads=4) - def _pg4_sum(self): - self.df.groupby('key')['data'].sum() +class ParallelGroups(object): - def time_sum_4(self): - self._pg4_sum() - - def time_sum_4_notp(self): - for i in range(4): - self.df.groupby('key')['data'].sum() - - def _f_sum(self): - self.df.groupby('key')['data'].sum() - - @test_parallel(num_threads=8) - def _pg8_sum(self): - self._f_sum() - - def time_sum_8(self): - self._pg8_sum() - - def time_sum_8_notp(self): - for i in range(8): - self._f_sum() - - @test_parallel(num_threads=2) - def _pg2_var(self): - self.df.groupby('key')['data'].var() - - def time_var_2(self): - self._pg2_var() - - # get groups - - def _groups(self): - self.data.groupby(self.data).groups - - @test_parallel(num_threads=2) - def _pg2_groups(self): - self._groups() - - def time_groups_2(self): - self._pg2_groups() - - @test_parallel(num_threads=4) - def _pg4_groups(self): - self._groups() - - def time_groups_4(self): - self._pg4_groups() - - @test_parallel(num_threads=8) - def _pg8_groups(self): - self._groups() - - def time_groups_8(self): - self._pg8_groups() - - - -class nogil_take1d_float64(object): goal_time = 0.2 + params = [2, 4, 8] + param_names = ['threads'] - def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): + def setup(self, threads): + if not have_real_test_parallel: raise NotImplementedError - self.N = 10000000.0 - self.df = DataFrame({'int64': np.arange(self.N, dtype='int64'), 'float64': np.arange(self.N, dtype='float64'), }) - self.indexer = np.arange(100, (len(self.df) - 100)) + size = 2**22 + ngroups = 10**3 + data = 
Series(np.random.randint(0, ngroups, size=size)) - def time_nogil_take1d_float64(self): - self.take_1d_pg2_int64() + @test_parallel(num_threads=threads) + def get_groups(): + data.groupby(data).groups + self.get_groups = get_groups - @test_parallel(num_threads=2) - def take_1d_pg2_int64(self): - take_1d(self.df.int64.values, self.indexer) + def time_get_groups(self, threads): + self.get_groups() - @test_parallel(num_threads=2) - def take_1d_pg2_float64(self): - take_1d(self.df.float64.values, self.indexer) +class ParallelTake1D(object): -class nogil_take1d_int64(object): goal_time = 0.2 + params = ['int64', 'float64'] + param_names = ['dtype'] - def setup(self): - self.N = 1000000 - self.ngroups = 1000 - np.random.seed(1234) - self.df = DataFrame({'key': np.random.randint(0, self.ngroups, size=self.N), 'data': np.random.randn(self.N), }) - if (not have_real_test_parallel): + def setup(self, dtype): + if not have_real_test_parallel: raise NotImplementedError - self.N = 10000000.0 - self.df = DataFrame({'int64': np.arange(self.N, dtype='int64'), 'float64': np.arange(self.N, dtype='float64'), }) - self.indexer = np.arange(100, (len(self.df) - 100)) + N = 10**6 + df = DataFrame({'col': np.arange(N, dtype=dtype)}) + indexer = np.arange(100, len(df) - 100) - def time_nogil_take1d_int64(self): - self.take_1d_pg2_float64() + @test_parallel(num_threads=2) + def parallel_take1d(): + take_1d(df['col'].values, indexer) + self.parallel_take1d = parallel_take1d - @test_parallel(num_threads=2) - def take_1d_pg2_int64(self): - take_1d(self.df.int64.values, self.indexer) + def time_take1d(self, dtype): + self.parallel_take1d() - @test_parallel(num_threads=2) - def take_1d_pg2_float64(self): - take_1d(self.df.float64.values, self.indexer) +class ParallelKth(object): -class nogil_kth_smallest(object): number = 1 repeat = 5 def setup(self): - if (not have_real_test_parallel): + if not have_real_test_parallel: raise NotImplementedError - np.random.seed(1234) - self.N = 10000000 - self.k = 500000 - self.a = np.random.randn(self.N) - self.b = self.a.copy() - self.kwargs_list = [{'arr': self.a}, {'arr': self.b}] + N = 10**7 + k = 5 * 10**5 + kwargs_list = [{'arr': np.random.randn(N)}, + {'arr': np.random.randn(N)}] + + @test_parallel(num_threads=2, kwargs_list=kwargs_list) + def parallel_kth_smallest(arr): + algos.kth_smallest(arr, k) + self.parallel_kth_smallest = parallel_kth_smallest - def time_nogil_kth_smallest(self): - @test_parallel(num_threads=2, kwargs_list=self.kwargs_list) - def run(arr): - algos.kth_smallest(arr, self.k) - run() + def time_kth_smallest(self): + self.parallel_kth_smallest() -class nogil_datetime_fields(object): +class ParallelDatetimeFields(object): + goal_time = 0.2 def setup(self): - self.N = 100000000 - self.dti = pd.date_range('1900-01-01', periods=self.N, freq='T') - self.period = self.dti.to_period('D') - if (not have_real_test_parallel): + if not have_real_test_parallel: raise NotImplementedError + N = 10**6 + self.dti = date_range('1900-01-01', periods=N, freq='T') + self.period = self.dti.to_period('D') def time_datetime_field_year(self): @test_parallel(num_threads=2) @@ -274,149 +172,104 @@ def run(period): run(self.period) -class nogil_rolling_algos_slow(object): - goal_time = 0.2 - - def setup(self): - self.win = 100 - np.random.seed(1234) - self.arr = np.random.rand(100000) - if (not have_real_test_parallel): - raise NotImplementedError +class ParallelRolling(object): - def time_nogil_rolling_median(self): - @test_parallel(num_threads=2) - def run(arr, win): - 
rolling_median(arr, win)
-        run(self.arr, self.win)
-
-
-class nogil_rolling_algos_fast(object):
     goal_time = 0.2
+    params = ['median', 'mean', 'min', 'max', 'var', 'skew', 'kurt', 'std']
+    param_names = ['method']

-    def setup(self):
-        self.win = 100
-        np.random.seed(1234)
-        self.arr = np.random.rand(1000000)
-        if (not have_real_test_parallel):
+    def setup(self, method):
+        if not have_real_test_parallel:
+            raise NotImplementedError
+        win = 100
+        arr = np.random.rand(100000)
+        if hasattr(DataFrame, 'rolling'):
+            df = DataFrame(arr).rolling(win)
+
+            @test_parallel(num_threads=2)
+            def parallel_rolling():
+                getattr(df, method)()
+            self.parallel_rolling = parallel_rolling
+        elif have_rolling_methods:
+            rolling = {'median': rolling_median,
+                       'mean': rolling_mean,
+                       'min': rolling_min,
+                       'max': rolling_max,
+                       'var': rolling_var,
+                       'skew': rolling_skew,
+                       'kurt': rolling_kurt,
+                       'std': rolling_std}
+
+            @test_parallel(num_threads=2)
+            def parallel_rolling():
+                rolling[method](arr, win)
+            self.parallel_rolling = parallel_rolling
+        else:
             raise NotImplementedError

-    def time_nogil_rolling_mean(self):
-        @test_parallel(num_threads=2)
-        def run(arr, win):
-            rolling_mean(arr, win)
-        run(self.arr, self.win)
-
-    def time_nogil_rolling_min(self):
-        @test_parallel(num_threads=2)
-        def run(arr, win):
-            rolling_min(arr, win)
-        run(self.arr, self.win)
-
-    def time_nogil_rolling_max(self):
-        @test_parallel(num_threads=2)
-        def run(arr, win):
-            rolling_max(arr, win)
-        run(self.arr, self.win)
-
-    def time_nogil_rolling_var(self):
-        @test_parallel(num_threads=2)
-        def run(arr, win):
-            rolling_var(arr, win)
-        run(self.arr, self.win)
-
-    def time_nogil_rolling_skew(self):
-        @test_parallel(num_threads=2)
-        def run(arr, win):
-            rolling_skew(arr, win)
-        run(self.arr, self.win)
-
-    def time_nogil_rolling_kurt(self):
-        @test_parallel(num_threads=2)
-        def run(arr, win):
-            rolling_kurt(arr, win)
-        run(self.arr, self.win)
+    def time_rolling(self, method):
+        self.parallel_rolling()

-    def time_nogil_rolling_std(self):
-        @test_parallel(num_threads=2)
-        def run(arr, win):
-            rolling_std(arr, win)
-        run(self.arr, self.win)

+class ParallelReadCSV(BaseIO):

-class nogil_read_csv(object):
     number = 1
     repeat = 5
+    params = ['float', 'object', 'datetime']
+    param_names = ['dtype']

-    def setup(self):
-        if (not have_real_test_parallel):
+    def setup(self, dtype):
+        if not have_real_test_parallel:
             raise NotImplementedError
-        # Using the values
-        self.df = DataFrame(np.random.randn(10000, 50))
-        self.df.to_csv('__test__.csv')
-
-        self.rng = date_range('1/1/2000', periods=10000)
-        self.df_date_time = DataFrame(np.random.randn(10000, 50), index=self.rng)
-        self.df_date_time.to_csv('__test_datetime__.csv')
+        rows = 10000
+        cols = 50
+        data = {'float': DataFrame(np.random.randn(rows, cols)),
+                'datetime': DataFrame(np.random.randn(rows, cols),
+                                      index=date_range('1/1/2000',
+                                                       periods=rows)),
+                'object': DataFrame('foo',
+                                    index=range(rows),
+                                    columns=['object%03d' % i
+                                             for i in range(5)])}
+
+        self.fname = '__test_{}__.csv'.format(dtype)
+        df = data[dtype]
+        df.to_csv(self.fname)

-        self.df_object = DataFrame('foo', index=self.df.index, columns=self.create_cols('object'))
-        self.df_object.to_csv('__test_object__.csv')
-
-    def create_cols(self, name):
-        return [('%s%03d' % (name, i)) for i in range(5)]
-
-    @test_parallel(num_threads=2)
-    def pg_read_csv(self):
-        read_csv('__test__.csv', sep=',', header=None, float_precision=None)
-
-    def time_read_csv(self):
-        self.pg_read_csv()
-
-    @test_parallel(num_threads=2)
-    def pg_read_csv_object(self):
- read_csv('__test_object__.csv', sep=',') - - def time_read_csv_object(self): - self.pg_read_csv_object() + @test_parallel(num_threads=2) + def parallel_read_csv(): + read_csv(self.fname) + self.parallel_read_csv = parallel_read_csv - @test_parallel(num_threads=2) - def pg_read_csv_datetime(self): - read_csv('__test_datetime__.csv', sep=',', header=None) + def time_read_csv(self, dtype): + self.parallel_read_csv() - def time_read_csv_datetime(self): - self.pg_read_csv_datetime() +class ParallelFactorize(object): -class nogil_factorize(object): number = 1 repeat = 5 + params = [2, 4, 8] + param_names = ['threads'] - def setup(self): - if (not have_real_test_parallel): + def setup(self, threads): + if not have_real_test_parallel: raise NotImplementedError - np.random.seed(1234) - self.strings = tm.makeStringIndex(100000) - - def factorize_strings(self): - pd.factorize(self.strings) - - @test_parallel(num_threads=4) - def _pg_factorize_strings_4(self): - self.factorize_strings() + strings = tm.makeStringIndex(100000) - def time_factorize_strings_4(self): - for i in range(2): - self._pg_factorize_strings_4() + @test_parallel(num_threads=threads) + def parallel(): + factorize(strings) + self.parallel = parallel - @test_parallel(num_threads=2) - def _pg_factorize_strings_2(self): - self.factorize_strings() + def loop(): + factorize(strings) + self.loop = loop - def time_factorize_strings_2(self): - for i in range(4): - self._pg_factorize_strings_2() + def time_parallel(self, threads): + self.parallel() - def time_factorize_strings(self): - for i in range(8): - self.factorize_strings() + def time_loop(self, threads): + for i in range(threads): + self.loop() diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 13b5cd2b060322..b51b41614bc498 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -1,510 +1,410 @@ -from .pandas_vb_common import * -from string import ascii_letters, digits +import warnings +from string import ascii_letters from itertools import product +from functools import partial +import numpy as np +from pandas import (DataFrame, Series, MultiIndex, date_range, period_range, + TimeGrouper, Categorical, Timestamp) +import pandas.util.testing as tm -class groupby_agg_builtins(object): - goal_time = 0.2 - - def setup(self): - np.random.seed(27182) - self.n = 100000 - self.df = DataFrame(np.random.randint(1, (self.n / 100), (self.n, 3)), columns=['jim', 'joe', 'jolie']) +from .pandas_vb_common import setup # noqa - def time_groupby_agg_builtins1(self): - self.df.groupby('jim').agg([sum, min, max]) - def time_groupby_agg_builtins2(self): - self.df.groupby(['jim', 'joe']).agg([sum, min, max]) +method_blacklist = { + 'object': {'median', 'prod', 'sem', 'cumsum', 'sum', 'cummin', 'mean', + 'max', 'skew', 'cumprod', 'cummax', 'rank', 'pct_change', 'min', + 'var', 'mad', 'describe', 'std'}, + 'datetime': {'median', 'prod', 'sem', 'cumsum', 'sum', 'mean', 'skew', + 'cumprod', 'cummax', 'pct_change', 'var', 'mad', 'describe', + 'std'} +} -#---------------------------------------------------------------------- -# dict return values -class groupby_apply_dict_return(object): +class ApplyDictReturn(object): goal_time = 0.2 def setup(self): self.labels = np.arange(1000).repeat(10) - self.data = Series(randn(len(self.labels))) - self.f = (lambda x: {'first': x.values[0], 'last': x.values[(-1)], }) + self.data = Series(np.random.randn(len(self.labels))) def time_groupby_apply_dict_return(self): - 
self.data.groupby(self.labels).apply(self.f) - - -#---------------------------------------------------------------------- -# groups - -class Groups(object): - goal_time = 0.1 - - size = 2 ** 22 - data = { - 'int64_small': Series(np.random.randint(0, 100, size=size)), - 'int64_large' : Series(np.random.randint(0, 10000, size=size)), - 'object_small': Series(tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size))), - 'object_large': Series(tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size))) - } - - param_names = ['df'] - params = ['int64_small', 'int64_large', 'object_small', 'object_large'] + self.data.groupby(self.labels).apply(lambda x: {'first': x.values[0], + 'last': x.values[-1]}) - def setup(self, df): - self.df = self.data[df] - def time_groupby_groups(self, df): - self.df.groupby(self.df).groups +class Apply(object): - -#---------------------------------------------------------------------- -# First / last functions - -class FirstLast(object): goal_time = 0.2 - param_names = ['dtype'] - params = ['float32', 'float64', 'datetime', 'object'] + def setup_cache(self): + N = 10**4 + labels = np.random.randint(0, 2000, size=N) + labels2 = np.random.randint(0, 3, size=N) + df = DataFrame({'key': labels, + 'key2': labels2, + 'value1': np.random.randn(N), + 'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4) + }) + return df - # with datetimes (GH7555) + def time_scalar_function_multi_col(self, df): + df.groupby(['key', 'key2']).apply(lambda x: 1) - def setup(self, dtype): - - if dtype == 'datetime': - self.df = DataFrame( - {'values': date_range('1/1/2011', periods=100000, freq='s'), - 'key': range(100000),}) - elif dtype == 'object': - self.df = DataFrame( - {'values': (['foo'] * 100000), - 'key': range(100000)}) - else: - labels = np.arange(10000).repeat(10) - data = Series(randn(len(labels)), dtype=dtype) - data[::3] = np.nan - data[1::3] = np.nan - labels = labels.take(np.random.permutation(len(labels))) - self.df = DataFrame({'values': data, 'key': labels}) + def time_scalar_function_single_col(self, df): + df.groupby('key').apply(lambda x: 1) - def time_groupby_first(self, dtype): - self.df.groupby('key').first() - - def time_groupby_last(self, dtype): - self.df.groupby('key').last() + @staticmethod + def df_copy_function(g): + # ensure that the group name is available (see GH #15062) + g.name + return g.copy() - def time_groupby_nth_any(self, dtype): - self.df.groupby('key').nth(0, dropna='all') + def time_copy_function_multi_col(self, df): + df.groupby(['key', 'key2']).apply(self.df_copy_function) - def time_groupby_nth_none(self, dtype): - self.df.groupby('key').nth(0) + def time_copy_overhead_single_col(self, df): + df.groupby('key').apply(self.df_copy_function) -#---------------------------------------------------------------------- -# DataFrame Apply overhead +class Groups(object): -class groupby_frame_apply(object): goal_time = 0.2 - def setup(self): - self.N = 10000 - self.labels = np.random.randint(0, 2000, size=self.N) - self.labels2 = np.random.randint(0, 3, size=self.N) - self.df = DataFrame({ - 'key': self.labels, - 'key2': self.labels2, - 'value1': np.random.randn(self.N), - 'value2': (['foo', 'bar', 'baz', 'qux'] * (self.N // 4)), - }) - - @staticmethod - def scalar_function(g): - return 1 + param_names = ['key'] + params = ['int64_small', 'int64_large', 'object_small', 'object_large'] - def time_groupby_frame_apply_scalar_function(self): - self.df.groupby(['key', 'key2']).apply(self.scalar_function) + def setup_cache(self): + size = 10**6 + 
data = {'int64_small': Series(np.random.randint(0, 100, size=size)), + 'int64_large': Series(np.random.randint(0, 10000, size=size)), + 'object_small': Series( + tm.makeStringIndex(100).take( + np.random.randint(0, 100, size=size))), + 'object_large': Series( + tm.makeStringIndex(10000).take( + np.random.randint(0, 10000, size=size)))} + return data - def time_groupby_frame_apply_scalar_function_overhead(self): - self.df.groupby('key').apply(self.scalar_function) + def setup(self, data, key): + self.ser = data[key] - @staticmethod - def df_copy_function(g): - # ensure that the group name is available (see GH #15062) - g.name - return g.copy() + def time_series_groups(self, data, key): + self.ser.groupby(self.ser).groups - def time_groupby_frame_df_copy_function(self): - self.df.groupby(['key', 'key2']).apply(self.df_copy_function) - def time_groupby_frame_apply_df_copy_overhead(self): - self.df.groupby('key').apply(self.df_copy_function) +class GroupManyLabels(object): - -#---------------------------------------------------------------------- -# 2d grouping, aggregate many columns - -class groupby_frame_cython_many_columns(object): goal_time = 0.2 + params = [1, 1000] + param_names = ['ncols'] - def setup(self): - self.labels = np.random.randint(0, 100, size=1000) - self.df = DataFrame(randn(1000, 1000)) + def setup(self, ncols): + N = 1000 + data = np.random.randn(N, ncols) + self.labels = np.random.randint(0, 100, size=N) + self.df = DataFrame(data) - def time_sum(self): + def time_sum(self, ncols): self.df.groupby(self.labels).sum() -#---------------------------------------------------------------------- -# single key, long, integer key +class Nth(object): -class groupby_frame_singlekey_integer(object): goal_time = 0.2 - def setup(self): - self.data = np.random.randn(100000, 1) - self.labels = np.random.randint(0, 1000, size=100000) - self.df = DataFrame(self.data) - - def time_sum(self): - self.df.groupby(self.labels).sum() + param_names = ['dtype'] + params = ['float32', 'float64', 'datetime', 'object'] + def setup(self, dtype): + N = 10**5 + # with datetimes (GH7555) + if dtype == 'datetime': + values = date_range('1/1/2011', periods=N, freq='s') + elif dtype == 'object': + values = ['foo'] * N + else: + values = np.arange(N).astype(dtype) -#---------------------------------------------------------------------- -# DataFrame nth + key = np.arange(N) + self.df = DataFrame({'key': key, 'values': values}) + self.df.iloc[1, 1] = np.nan # insert missing data -class groupby_nth(object): - goal_time = 0.2 + def time_frame_nth_any(self, dtype): + self.df.groupby('key').nth(0, dropna='any') - def setup(self): - self.df = DataFrame(np.random.randint(1, 100, (10000, 2))) + def time_groupby_nth_all(self, dtype): + self.df.groupby('key').nth(0, dropna='all') - def time_groupby_frame_nth_any(self): - self.df.groupby(0).nth(0, dropna='any') + def time_frame_nth(self, dtype): + self.df.groupby('key').nth(0) - def time_groupby_frame_nth_none(self): - self.df.groupby(0).nth(0) + def time_series_nth_any(self, dtype): + self.df['values'].groupby(self.df['key']).nth(0, dropna='any') - def time_groupby_series_nth_any(self): - self.df[1].groupby(self.df[0]).nth(0, dropna='any') + def time_series_nth_all(self, dtype): + self.df['values'].groupby(self.df['key']).nth(0, dropna='all') - def time_groupby_series_nth_none(self): - self.df[1].groupby(self.df[0]).nth(0) + def time_series_nth(self, dtype): + self.df['values'].groupby(self.df['key']).nth(0) 
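# ----------------------------------------------------------------------
# For context: several classes here (Apply, Groups, the Count*
# benchmarks) use asv's ``setup_cache`` hook. A hedged, illustrative
# sketch follows -- the class and names below are invented, not part of
# the diff. ``setup_cache`` runs once per benchmark environment, asv
# caches (pickles) its return value, and the cached object is then
# passed as the first positional argument to ``setup`` and to every
# ``time_*`` method, so expensive data construction is neither timed
# nor repeated for each parameter combination or repeat.

import numpy as np
from pandas import DataFrame


class ExampleCachedBenchmark(object):

    goal_time = 0.2

    def setup_cache(self):
        # built once; asv hands the frame to each timing method as ``df``
        N = 10**5
        return DataFrame({'key': np.random.randint(0, 100, size=N),
                          'values': np.random.randn(N)})

    def time_groupby_sum(self, df):
        df.groupby('key')['values'].sum()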
-#---------------------------------------------------------------------- -# groupby_indices replacement, chop up Series +class DateAttributes(object): -class groupby_indices(object): goal_time = 0.2 def setup(self): - try: - self.rng = date_range('1/1/2000', '12/31/2005', freq='H') - (self.year, self.month, self.day) = (self.rng.year, self.rng.month, self.rng.day) - except: - self.rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour()) - self.year = self.rng.map((lambda x: x.year)) - self.month = self.rng.map((lambda x: x.month)) - self.day = self.rng.map((lambda x: x.day)) - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - - def time_groupby_indices(self): + rng = date_range('1/1/2000', '12/31/2005', freq='H') + self.year, self.month, self.day = rng.year, rng.month, rng.day + self.ts = Series(np.random.randn(len(rng)), index=rng) + + def time_len_groupby_object(self): len(self.ts.groupby([self.year, self.month, self.day])) -class groupby_int64_overflow(object): +class Int64(object): + goal_time = 0.2 def setup(self): - self.arr = np.random.randint(((-1) << 12), (1 << 12), ((1 << 17), 5)) - self.i = np.random.choice(len(self.arr), (len(self.arr) * 5)) - self.arr = np.vstack((self.arr, self.arr[self.i])) - self.i = np.random.permutation(len(self.arr)) - self.arr = self.arr[self.i] - self.df = DataFrame(self.arr, columns=list('abcde')) - (self.df['jim'], self.df['joe']) = (np.random.randn(2, len(self.df)) * 10) + arr = np.random.randint(-1 << 12, 1 << 12, (1 << 17, 5)) + i = np.random.choice(len(arr), len(arr) * 5) + arr = np.vstack((arr, arr[i])) + i = np.random.permutation(len(arr)) + arr = arr[i] + self.cols = list('abcde') + self.df = DataFrame(arr, columns=self.cols) + self.df['jim'], self.df['joe'] = np.random.randn(2, len(self.df)) * 10 - def time_groupby_int64_overflow(self): - self.df.groupby(list('abcde')).max() + def time_overflow(self): + self.df.groupby(self.cols).max() -#---------------------------------------------------------------------- -# count() speed +class CountMultiDtype(object): -class groupby_multi_count(object): goal_time = 0.2 - def setup(self): - self.n = 10000 - self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]') - self.dates = (np.datetime64('now') + self.offsets) - self.dates[(np.random.rand(self.n) > 0.5)] = np.datetime64('nat') - self.offsets[(np.random.rand(self.n) > 0.5)] = np.timedelta64('nat') - self.value2 = np.random.randn(self.n) - self.value2[(np.random.rand(self.n) > 0.5)] = np.nan - self.obj = np.random.choice(list('ab'), size=self.n).astype(object) - self.obj[(np.random.randn(self.n) > 0.5)] = np.nan - self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), - 'key2': np.random.randint(0, 100, size=self.n), - 'dates': self.dates, - 'value2': self.value2, - 'value3': np.random.randn(self.n), - 'ints': np.random.randint(0, 1000, size=self.n), - 'obj': self.obj, - 'offsets': self.offsets, }) - - def time_groupby_multi_count(self): - self.df.groupby(['key1', 'key2']).count() - - -class groupby_int_count(object): - goal_time = 0.2 + def setup_cache(self): + n = 10000 + offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') + dates = np.datetime64('now') + offsets + dates[np.random.rand(n) > 0.5] = np.datetime64('nat') + offsets[np.random.rand(n) > 0.5] = np.timedelta64('nat') + value2 = np.random.randn(n) + value2[np.random.rand(n) > 0.5] = np.nan + obj = np.random.choice(list('ab'), size=n).astype(object) + obj[np.random.randn(n) > 0.5] = np.nan + df = DataFrame({'key1': 
np.random.randint(0, 500, size=n),
+                        'key2': np.random.randint(0, 100, size=n),
+                        'dates': dates,
+                        'value2': value2,
+                        'value3': np.random.randn(n),
+                        'ints': np.random.randint(0, 1000, size=n),
+                        'obj': obj,
+                        'offsets': offsets})
+        return df
+
+    def time_multi_count(self, df):
+        df.groupby(['key1', 'key2']).count()
+
+
+class CountMultiInt(object):

-    def setup(self):
-        self.n = 10000
-        self.df = DataFrame({'key1': randint(0, 500, size=self.n),
-                             'key2': randint(0, 100, size=self.n),
-                             'ints': randint(0, 1000, size=self.n),
-                             'ints2': randint(0, 1000, size=self.n), })
+    goal_time = 0.2

-    def time_groupby_int_count(self):
-        self.df.groupby(['key1', 'key2']).count()
+    def setup_cache(self):
+        n = 10000
+        df = DataFrame({'key1': np.random.randint(0, 500, size=n),
+                        'key2': np.random.randint(0, 100, size=n),
+                        'ints': np.random.randint(0, 1000, size=n),
+                        'ints2': np.random.randint(0, 1000, size=n)})
+        return df

+    def time_multi_int_count(self, df):
+        df.groupby(['key1', 'key2']).count()

-#----------------------------------------------------------------------
-# nunique() speed
+    def time_multi_int_nunique(self, df):
+        df.groupby(['key1', 'key2']).nunique()

-class groupby_nunique(object):

-    def setup(self):
-        self.n = 10000
-        self.df = DataFrame({'key1': randint(0, 500, size=self.n),
-                             'key2': randint(0, 100, size=self.n),
-                             'ints': randint(0, 1000, size=self.n),
-                             'ints2': randint(0, 1000, size=self.n), })
+class AggFunctions(object):

-    def time_groupby_nunique(self):
-        self.df.groupby(['key1', 'key2']).nunique()
+    goal_time = 0.2

+    def setup_cache(self):
+        N = 10**5
+        fac1 = np.array(['A', 'B', 'C'], dtype='O')
+        fac2 = np.array(['one', 'two'], dtype='O')
+        df = DataFrame({'key1': fac1.take(np.random.randint(0, 3, size=N)),
+                        'key2': fac2.take(np.random.randint(0, 2, size=N)),
+                        'value1': np.random.randn(N),
+                        'value2': np.random.randn(N),
+                        'value3': np.random.randn(N)})
+        return df

-#----------------------------------------------------------------------
-# group with different functions per column
+    def time_different_str_functions(self, df):
+        df.groupby(['key1', 'key2']).agg({'value1': 'mean',
+                                          'value2': 'var',
+                                          'value3': 'sum'})

-class groupby_agg_multi(object):
-    goal_time = 0.2
+    def time_different_numpy_functions(self, df):
+        df.groupby(['key1', 'key2']).agg({'value1': np.mean,
+                                          'value2': np.var,
+                                          'value3': np.sum})

-    def setup(self):
-        self.fac1 = np.array(['A', 'B', 'C'], dtype='O')
-        self.fac2 = np.array(['one', 'two'], dtype='O')
-        self.df = DataFrame({'key1': self.fac1.take(np.random.randint(0, 3, size=100000)), 'key2': self.fac2.take(np.random.randint(0, 2, size=100000)), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), })
+    def time_different_python_functions_multicol(self, df):
+        df.groupby(['key1', 'key2']).agg([sum, min, max])

-    def time_groupby_multi_different_functions(self):
-        self.df.groupby(['key1', 'key2']).agg({'value1': 'mean', 'value2': 'var', 'value3': 'sum'})
+    def time_different_python_functions_singlecol(self, df):
+        df.groupby('key1').agg([sum, min, max])

-    def time_groupby_multi_different_numpy_functions(self):
-        self.df.groupby(['key1', 'key2']).agg({'value1': np.mean, 'value2': np.var, 'value3': np.sum})

+class GroupStrings(object):

-class groupby_multi_index(object):
     goal_time = 0.2

     def setup(self):
-        self.n = (((5 * 7) * 11) * (1 << 9))
-        self.alpha = list(map(''.join, product((ascii_letters + digits), repeat=4)))
-        self.f = (lambda k: np.repeat(np.random.choice(self.alpha, (self.n // k)), k))
-
self.df = DataFrame({'a': self.f(11), 'b': self.f(7), 'c': self.f(5), 'd': self.f(1), }) + n = 2 * 10**5 + alpha = list(map(''.join, product(ascii_letters, repeat=4))) + data = np.random.choice(alpha, (n // 5, 4), replace=False) + data = np.repeat(data, 5, axis=0) + self.df = DataFrame(data, columns=list('abcd')) self.df['joe'] = (np.random.randn(len(self.df)) * 10).round(3) - self.i = np.random.permutation(len(self.df)) - self.df = self.df.iloc[self.i].reset_index(drop=True).copy() + self.df = self.df.sample(frac=1).reset_index(drop=True) - def time_groupby_multi_index(self): + def time_multi_columns(self): self.df.groupby(list('abcd')).max() -class groupby_multi(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.ngroups = 100 - self.df = DataFrame({'key1': self.get_test_data(ngroups=self.ngroups), 'key2': self.get_test_data(ngroups=self.ngroups), 'data1': np.random.randn(self.N), 'data2': np.random.randn(self.N), }) - self.simple_series = Series(np.random.randn(self.N)) - self.key1 = self.df['key1'] - - def get_test_data(self, ngroups=100, n=100000): - self.unique_groups = range(self.ngroups) - self.arr = np.asarray(np.tile(self.unique_groups, int(n / self.ngroups)), dtype=object) - if (len(self.arr) < n): - self.arr = np.asarray((list(self.arr) + self.unique_groups[:(n - len(self.arr))]), dtype=object) - random.shuffle(self.arr) - return self.arr +class MultiColumn(object): - def f(self): - self.df.groupby(['key1', 'key2']).agg((lambda x: x.values.sum())) + goal_time = 0.2 - def time_groupby_multi_cython(self): - self.df.groupby(['key1', 'key2']).sum() + def setup_cache(self): + N = 10**5 + key1 = np.tile(np.arange(100, dtype=object), 1000) + key2 = key1.copy() + np.random.shuffle(key1) + np.random.shuffle(key2) + df = DataFrame({'key1': key1, + 'key2': key2, + 'data1': np.random.randn(N), + 'data2': np.random.randn(N)}) + return df - def time_groupby_multi_python(self): - self.df.groupby(['key1', 'key2'])['data1'].agg((lambda x: x.values.sum())) + def time_lambda_sum(self, df): + df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum()) - def time_groupby_multi_series_op(self): - self.df.groupby(['key1', 'key2'])['data1'].agg(np.std) + def time_cython_sum(self, df): + df.groupby(['key1', 'key2']).sum() - def time_groupby_series_simple_cython(self): - self.simple_series.groupby(self.key1).sum() + def time_col_select_lambda_sum(self, df): + df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum()) - def time_groupby_series_simple_rank(self): - self.df.groupby('key1').rank(pct=True) + def time_col_select_numpy_sum(self, df): + df.groupby(['key1', 'key2'])['data1'].agg(np.sum) -#---------------------------------------------------------------------- -# size() speed +class Size(object): -class groupby_size(object): goal_time = 0.2 def setup(self): - self.n = 100000 - self.offsets = np.random.randint(self.n, size=self.n).astype('timedelta64[ns]') - self.dates = (np.datetime64('now') + self.offsets) - self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, }) - - N = 1000000 - self.draws = pd.Series(np.random.randn(N)) - labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4)) + n = 10**5 + offsets = np.random.randint(n, size=n).astype('timedelta64[ns]') + dates = np.datetime64('now') + offsets + self.df = DataFrame({'key1': np.random.randint(0, 500, size=n), + 'key2': 
np.random.randint(0, 100, size=n), + 'value1': np.random.randn(n), + 'value2': np.random.randn(n), + 'value3': np.random.randn(n), + 'dates': dates}) + self.draws = Series(np.random.randn(n)) + labels = Series(['foo', 'bar', 'baz', 'qux'] * (n // 4)) self.cats = labels.astype('category') - def time_groupby_multi_size(self): + def time_multi_size(self): self.df.groupby(['key1', 'key2']).size() - def time_groupby_dt_size(self): - self.df.groupby(['dates']).size() - - def time_groupby_dt_timegrouper_size(self): - self.df.groupby(TimeGrouper(key='dates', freq='M')).size() + def time_dt_timegrouper_size(self): + with warnings.catch_warnings(record=True): + self.df.groupby(TimeGrouper(key='dates', freq='M')).size() - def time_groupby_size(self): + def time_category_size(self): self.draws.groupby(self.cats).size() +class GroupByMethods(object): -#---------------------------------------------------------------------- -# groupby with a variable value for ngroups - -class GroupBySuite(object): goal_time = 0.2 - param_names = ['dtype', 'ngroups'] - params = [['int', 'float'], [100, 10000]] - - def setup(self, dtype, ngroups): - np.random.seed(1234) + param_names = ['dtype', 'method', 'application'] + params = [['int', 'float', 'object', 'datetime'], + ['all', 'any', 'bfill', 'count', 'cumcount', 'cummax', 'cummin', + 'cumprod', 'cumsum', 'describe', 'ffill', 'first', 'head', + 'last', 'mad', 'max', 'min', 'median', 'mean', 'nunique', + 'pct_change', 'prod', 'rank', 'sem', 'shift', 'size', 'skew', + 'std', 'sum', 'tail', 'unique', 'value_counts', 'var'], + ['direct', 'transformation']] + + def setup(self, dtype, method, application): + if method in method_blacklist.get(dtype, {}): + raise NotImplementedError # skip benchmark + ngroups = 1000 size = ngroups * 2 rng = np.arange(ngroups) values = rng.take(np.random.randint(0, ngroups, size=size)) if dtype == 'int': key = np.random.randint(0, size, size=size) - else: + elif dtype == 'float': key = np.concatenate([np.random.random(ngroups) * 0.1, np.random.random(ngroups) * 10.0]) + elif dtype == 'object': + key = ['foo'] * size + elif dtype == 'datetime': + key = date_range('1/1/2011', periods=size, freq='s') - self.df = DataFrame({'values': values, - 'key': key}) - - def time_all(self, dtype, ngroups): - self.df.groupby('key')['values'].all() - - def time_any(self, dtype, ngroups): - self.df.groupby('key')['values'].any() - - def time_count(self, dtype, ngroups): - self.df.groupby('key')['values'].count() - - def time_cumcount(self, dtype, ngroups): - self.df.groupby('key')['values'].cumcount() - - def time_cummax(self, dtype, ngroups): - self.df.groupby('key')['values'].cummax() - - def time_cummin(self, dtype, ngroups): - self.df.groupby('key')['values'].cummin() - - def time_cumprod(self, dtype, ngroups): - self.df.groupby('key')['values'].cumprod() - - def time_cumsum(self, dtype, ngroups): - self.df.groupby('key')['values'].cumsum() - - def time_describe(self, dtype, ngroups): - self.df.groupby('key')['values'].describe() - - def time_diff(self, dtype, ngroups): - self.df.groupby('key')['values'].diff() - - def time_first(self, dtype, ngroups): - self.df.groupby('key')['values'].first() - - def time_head(self, dtype, ngroups): - self.df.groupby('key')['values'].head() - - def time_last(self, dtype, ngroups): - self.df.groupby('key')['values'].last() - - def time_mad(self, dtype, ngroups): - self.df.groupby('key')['values'].mad() - - def time_max(self, dtype, ngroups): - self.df.groupby('key')['values'].max() - - def time_mean(self, dtype, ngroups): 
- self.df.groupby('key')['values'].mean()
-
- def time_median(self, dtype, ngroups):
- self.df.groupby('key')['values'].median()
-
- def time_min(self, dtype, ngroups):
- self.df.groupby('key')['values'].min()
-
- def time_nunique(self, dtype, ngroups):
- self.df.groupby('key')['values'].nunique()
-
- def time_pct_change(self, dtype, ngroups):
- self.df.groupby('key')['values'].pct_change()
-
- def time_prod(self, dtype, ngroups):
- self.df.groupby('key')['values'].prod()
-
- def time_rank(self, dtype, ngroups):
- self.df.groupby('key')['values'].rank()
-
- def time_sem(self, dtype, ngroups):
- self.df.groupby('key')['values'].sem()
-
- def time_size(self, dtype, ngroups):
- self.df.groupby('key')['values'].size()
+ df = DataFrame({'values': values, 'key': key})
- def time_skew(self, dtype, ngroups):
- self.df.groupby('key')['values'].skew()
+ if application == 'transformation':
+ if method in ['describe', 'head', 'tail', 'unique',
+ 'value_counts', 'size']:
+ raise NotImplementedError # no transform() equivalent
- def time_std(self, dtype, ngroups):
- self.df.groupby('key')['values'].std()
+ self.as_group_method = lambda: df.groupby(
+ 'key')['values'].transform(method)
+ self.as_field_method = lambda: df.groupby(
+ 'values')['key'].transform(method)
+ else:
+ self.as_group_method = getattr(df.groupby('key')['values'], method)
+ self.as_field_method = getattr(df.groupby('values')['key'], method)
- def time_sum(self, dtype, ngroups):
- self.df.groupby('key')['values'].sum()
+ def time_dtype_as_group(self, dtype, method, application):
+ self.as_group_method()
- def time_tail(self, dtype, ngroups):
- self.df.groupby('key')['values'].tail()
+ def time_dtype_as_field(self, dtype, method, application):
+ self.as_field_method()
- def time_unique(self, dtype, ngroups):
- self.df.groupby('key')['values'].unique()
- def time_value_counts(self, dtype, ngroups):
- self.df.groupby('key')['values'].value_counts()
+class RankWithTies(object):
+ # GH 21237
+ goal_time = 0.2
+ param_names = ['dtype', 'tie_method']
+ params = [['float64', 'float32', 'int64', 'datetime64'],
+ ['first', 'average', 'dense', 'min', 'max']]
+
+ def setup(self, dtype, tie_method):
+ N = 10**4
+ if dtype == 'datetime64':
+ data = np.array([Timestamp("2011/01/01")] * N, dtype=dtype)
+ else:
+ data = np.array([1] * N, dtype=dtype)
+ self.df = DataFrame({'values': data, 'key': ['foo'] * N})
- def time_var(self, dtype, ngroups):
- self.df.groupby('key')['values'].var()
+ def time_rank_ties(self, dtype, tie_method):
+ self.df.groupby('key').rank(method=tie_method)
-class groupby_float32(object):
+class Float32(object):
# GH 13335
goal_time = 0.2
@@ -515,27 +415,28 @@ def setup(self):
arr = np.repeat(tmp, 10)
self.df = DataFrame(dict(a=arr, b=arr))
- def time_groupby_sum(self):
+ def time_sum(self):
self.df.groupby(['a'])['b'].sum()
-class groupby_categorical(object):
+class Categories(object):
+
goal_time = 0.2
def setup(self):
- N = 100000
+ N = 10**5
arr = np.random.random(N)
-
- self.df = DataFrame(dict(
- a=Categorical(np.random.randint(10000, size=N)),
- b=arr))
- self.df_ordered = DataFrame(dict(
- a=Categorical(np.random.randint(10000, size=N), ordered=True),
- b=arr))
- self.df_extra_cat = DataFrame(dict(
- a=Categorical(np.random.randint(100, size=N),
- categories=np.arange(10000)),
- b=arr))
+ data = {'a': Categorical(np.random.randint(10000, size=N)),
+ 'b': arr}
+ self.df = DataFrame(data)
+ data = {'a': Categorical(np.random.randint(10000, size=N),
+ ordered=True),
+ 'b': arr}
+ self.df_ordered = DataFrame(data)
+ data = {'a': Categorical(np.random.randint(100, size=N),
+ categories=np.arange(10000)),
+ 
'b': arr} + self.df_extra_cat = DataFrame(data) def time_groupby_sort(self): self.df.groupby('a')['b'].count() @@ -556,130 +457,70 @@ def time_groupby_extra_cat_nosort(self): self.df_extra_cat.groupby('a', sort=False)['b'].count() -class groupby_period(object): +class Datelike(object): # GH 14338 goal_time = 0.2 - - def make_grouper(self, N): - return pd.period_range('1900-01-01', freq='D', periods=N) - - def setup(self): - N = 10000 - self.grouper = self.make_grouper(N) - self.df = pd.DataFrame(np.random.randn(N, 2)) - - def time_groupby_sum(self): + params = ['period_range', 'date_range', 'date_range_tz'] + param_names = ['grouper'] + + def setup(self, grouper): + N = 10**4 + rng_map = {'period_range': period_range, + 'date_range': date_range, + 'date_range_tz': partial(date_range, tz='US/Central')} + self.grouper = rng_map[grouper]('1900-01-01', freq='D', periods=N) + self.df = DataFrame(np.random.randn(10**4, 2)) + + def time_sum(self, grouper): self.df.groupby(self.grouper).sum() -class groupby_datetime(groupby_period): - def make_grouper(self, N): - return pd.date_range('1900-01-01', freq='D', periods=N) - - -class groupby_datetimetz(groupby_period): - def make_grouper(self, N): - return pd.date_range('1900-01-01', freq='D', periods=N, - tz='US/Central') - -#---------------------------------------------------------------------- -# Series.value_counts - -class series_value_counts(object): - goal_time = 0.2 - - def setup(self): - self.s = Series(np.random.randint(0, 1000, size=100000)) - self.s2 = self.s.astype(float) - - self.K = 1000 - self.N = 100000 - self.uniques = tm.makeStringIndex(self.K).values - self.s3 = Series(np.tile(self.uniques, (self.N // self.K))) - - def time_value_counts_int64(self): - self.s.value_counts() - - def time_value_counts_float64(self): - self.s2.value_counts() - - def time_value_counts_strings(self): - self.s.value_counts() - - -#---------------------------------------------------------------------- -# pivot_table - -class groupby_pivot_table(object): - goal_time = 0.2 - - def setup(self): - self.fac1 = np.array(['A', 'B', 'C'], dtype='O') - self.fac2 = np.array(['one', 'two'], dtype='O') - self.ind1 = np.random.randint(0, 3, size=100000) - self.ind2 = np.random.randint(0, 2, size=100000) - self.df = DataFrame({'key1': self.fac1.take(self.ind1), 'key2': self.fac2.take(self.ind2), 'key3': self.fac2.take(self.ind2), 'value1': np.random.randn(100000), 'value2': np.random.randn(100000), 'value3': np.random.randn(100000), }) - - def time_groupby_pivot_table(self): - self.df.pivot_table(index='key1', columns=['key2', 'key3']) - - -#---------------------------------------------------------------------- -# Sum booleans #2692 - -class groupby_sum_booleans(object): +class SumBools(object): + # GH 2692 goal_time = 0.2 def setup(self): - self.N = 500 - self.df = DataFrame({'ii': range(self.N), 'bb': [True for x in range(self.N)], }) + N = 500 + self.df = DataFrame({'ii': range(N), + 'bb': [True] * N}) def time_groupby_sum_booleans(self): self.df.groupby('ii').sum() -#---------------------------------------------------------------------- -# multi-indexed group sum #9049 - -class groupby_sum_multiindex(object): +class SumMultiLevel(object): + # GH 9049 goal_time = 0.2 + timeout = 120.0 def setup(self): - self.N = 50 - self.df = DataFrame({'A': (list(range(self.N)) * 2), 'B': list(range((self.N * 2))), 'C': 1, }).set_index(['A', 'B']) + N = 50 + self.df = DataFrame({'A': list(range(N)) * 2, + 'B': range(N * 2), + 'C': 1}).set_index(['A', 'B']) def 
time_groupby_sum_multiindex(self): self.df.groupby(level=[0, 1]).sum() -#------------------------------------------------------------------------------- -# Transform testing - class Transform(object): + goal_time = 0.2 def setup(self): n1 = 400 n2 = 250 - - index = MultiIndex( - levels=[np.arange(n1), pd.util.testing.makeStringIndex(n2)], - labels=[[i for i in range(n1) for _ in range(n2)], - (list(range(n2)) * n1)], - names=['lev1', 'lev2']) - - data = DataFrame(np.random.randn(n1 * n2, 3), - index=index, columns=['col1', 'col20', 'col3']) - step = int((n1 * n2 * 0.1)) - for col in range(len(data.columns)): - idx = col - while (idx < len(data)): - data.set_value(data.index[idx], data.columns[col], np.nan) - idx += step + index = MultiIndex(levels=[np.arange(n1), tm.makeStringIndex(n2)], + labels=[np.repeat(range(n1), n2).tolist(), + list(range(n2)) * n1], + names=['lev1', 'lev2']) + arr = np.random.randn(n1 * n2, 3) + arr[::10000, 0] = np.nan + arr[1::10000, 1] = np.nan + arr[2::10000, 2] = np.nan + data = DataFrame(arr, index=index, columns=['col1', 'col20', 'col3']) self.df = data - self.f_fillna = (lambda x: x.fillna(method='pad')) - np.random.seed(2718281) n = 20000 self.df1 = DataFrame(np.random.randint(1, n, (n, 3)), columns=['jim', 'joe', 'jolie']) @@ -691,10 +532,10 @@ def setup(self): self.df4 = self.df3.copy() self.df4['jim'] = self.df4['joe'] - def time_transform_func(self): - self.df.groupby(level='lev2').transform(self.f_fillna) + def time_transform_lambda_max(self): + self.df.groupby(level='lev1').transform(lambda x: max(x)) - def time_transform_ufunc(self): + def time_transform_ufunc_max(self): self.df.groupby(level='lev1').transform(np.max) def time_transform_multi_key1(self): @@ -710,63 +551,31 @@ def time_transform_multi_key4(self): self.df4.groupby(['jim', 'joe'])['jolie'].transform('max') +class TransformBools(object): - -np.random.seed(0) -N = 120000 -N_TRANSITIONS = 1400 -transition_points = np.random.permutation(np.arange(N))[:N_TRANSITIONS] -transition_points.sort() -transitions = np.zeros((N,), dtype=np.bool) -transitions[transition_points] = True -g = transitions.cumsum() -df = DataFrame({'signal': np.random.rand(N), }) - - - - - -class groupby_transform_series(object): goal_time = 0.2 def setup(self): - np.random.seed(0) N = 120000 transition_points = np.sort(np.random.choice(np.arange(N), 1400)) - transitions = np.zeros((N,), dtype=np.bool) + transitions = np.zeros(N, dtype=np.bool) transitions[transition_points] = True self.g = transitions.cumsum() self.df = DataFrame({'signal': np.random.rand(N)}) - def time_groupby_transform_series(self): + def time_transform_mean(self): self.df['signal'].groupby(self.g).transform(np.mean) -class groupby_transform_series2(object): +class TransformNaN(object): + # GH 12737 goal_time = 0.2 def setup(self): - np.random.seed(0) - self.df = DataFrame({'key': (np.arange(100000) // 3), - 'val': np.random.randn(100000)}) - - self.df_nans = pd.DataFrame({'key': np.repeat(np.arange(1000), 10), - 'B': np.nan, - 'C': np.nan}) - self.df_nans.ix[4::10, 'B':'C'] = 5 - - def time_transform_series2(self): - self.df.groupby('key')['val'].transform(np.mean) - - def time_cumprod(self): - self.df.groupby('key').cumprod() - - def time_cumsum(self): - self.df.groupby('key').cumsum() - - def time_shift(self): - self.df.groupby('key').shift() + self.df_nans = DataFrame({'key': np.repeat(np.arange(1000), 10), + 'B': np.nan, + 'C': np.nan}) + self.df_nans.loc[4::10, 'B':'C'] = 5 - def time_transform_dataframe(self): - # GH 12737 + def 
time_first(self): self.df_nans.groupby('key').transform('first') diff --git a/asv_bench/benchmarks/hdfstore_bench.py b/asv_bench/benchmarks/hdfstore_bench.py deleted file mode 100644 index 7d490180e8af6e..00000000000000 --- a/asv_bench/benchmarks/hdfstore_bench.py +++ /dev/null @@ -1,129 +0,0 @@ -from .pandas_vb_common import * -import os - - -class HDF5(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(25000) - self.df = DataFrame({'float1': randn(25000), 'float2': randn(25000),}, - index=self.index) - - self.df_mixed = DataFrame( - {'float1': randn(25000), 'float2': randn(25000), - 'string1': (['foo'] * 25000), - 'bool1': ([True] * 25000), - 'int1': np.random.randint(0, 250000, size=25000),}, - index=self.index) - - self.df_wide = DataFrame(np.random.randn(25000, 100)) - - self.df2 = DataFrame({'float1': randn(25000), 'float2': randn(25000)}, - index=date_range('1/1/2000', periods=25000)) - self.df_wide2 = DataFrame(np.random.randn(25000, 100), - index=date_range('1/1/2000', periods=25000)) - - self.df_dc = DataFrame(np.random.randn(10000, 10), - columns=[('C%03d' % i) for i in range(10)]) - - self.f = '__test__.h5' - self.remove(self.f) - - self.store = HDFStore(self.f) - self.store.put('fixed', self.df) - self.store.put('fixed_mixed', self.df_mixed) - self.store.append('table', self.df2) - self.store.append('table_mixed', self.df_mixed) - self.store.append('table_wide', self.df_wide) - self.store.append('table_wide2', self.df_wide2) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - def time_read_store(self): - self.store.get('fixed') - - def time_read_store_mixed(self): - self.store.get('fixed_mixed') - - def time_write_store(self): - self.store.put('fixed_write', self.df) - - def time_write_store_mixed(self): - self.store.put('fixed_mixed_write', self.df_mixed) - - def time_read_store_table_mixed(self): - self.store.select('table_mixed') - - def time_write_store_table_mixed(self): - self.store.append('table_mixed_write', self.df_mixed) - - def time_read_store_table(self): - self.store.select('table') - - def time_write_store_table(self): - self.store.append('table_write', self.df) - - def time_read_store_table_wide(self): - self.store.select('table_wide') - - def time_write_store_table_wide(self): - self.store.append('table_wide_write', self.df_wide) - - def time_write_store_table_dc(self): - self.store.append('table_dc_write', self.df_dc, data_columns=True) - - def time_query_store_table_wide(self): - start = self.df_wide2.index[10000] - stop = self.df_wide2.index[15000] - self.store.select('table_wide', where="index > start and index < stop") - - def time_query_store_table(self): - start = self.df2.index[10000] - stop = self.df2.index[15000] - self.store.select('table', where="index > start and index < stop") - - def time_store_repr(self): - repr(self.store) - - def time_store_str(self): - str(self.store) - - def time_store_info(self): - self.store.info() - - -class HDF5Panel(object): - goal_time = 0.2 - - def setup(self): - self.f = '__test__.h5' - self.p = Panel(randn(20, 1000, 25), - items=[('Item%03d' % i) for i in range(20)], - major_axis=date_range('1/1/2000', periods=1000), - minor_axis=[('E%03d' % i) for i in range(25)]) - self.remove(self.f) - self.store = HDFStore(self.f) - self.store.append('p1', self.p) - - def teardown(self): - self.store.close() - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - - def time_read_store_table_panel(self): - 
self.store.select('p1') - - def time_write_store_table_panel(self): - self.store.append('p2', self.p) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index 7697c3b9d3840c..f1703e163917ac 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -1,240 +1,194 @@ -from .pandas_vb_common import * +import numpy as np +import pandas.util.testing as tm +from pandas import (Series, date_range, DatetimeIndex, Index, RangeIndex, + Float64Index) + +from .pandas_vb_common import setup # noqa class SetOperations(object): - goal_time = 0.2 - def setup(self): - self.rng = date_range('1/1/2000', periods=10000, freq='T') - self.rng2 = self.rng[:(-1)] + goal_time = 0.2 + params = (['datetime', 'date_string', 'int', 'strings'], + ['intersection', 'union', 'symmetric_difference']) + param_names = ['dtype', 'method'] + + def setup(self, dtype, method): + N = 10**5 + dates_left = date_range('1/1/2000', periods=N, freq='T') + fmt = '%Y-%m-%d %H:%M:%S' + date_str_left = Index(dates_left.strftime(fmt)) + int_left = Index(np.arange(N)) + str_left = tm.makeStringIndex(N) + data = {'datetime': {'left': dates_left, 'right': dates_left[:-1]}, + 'date_string': {'left': date_str_left, + 'right': date_str_left[:-1]}, + 'int': {'left': int_left, 'right': int_left[:-1]}, + 'strings': {'left': str_left, 'right': str_left[:-1]}} + self.left = data[dtype]['left'] + self.right = data[dtype]['right'] + + def time_operation(self, dtype, method): + getattr(self.left, method)(self.right) + + +class SetDisjoint(object): - # object index with datetime values - if (self.rng.dtype == object): - self.idx_rng = self.rng.view(Index) - else: - self.idx_rng = self.rng.asobject - self.idx_rng2 = self.idx_rng[:(-1)] + goal_time = 0.2 - # other datetime - N = 100000 - A = N - 20000 + def setup(self): + N = 10**5 B = N + 20000 - self.dtidx1 = DatetimeIndex(range(N)) - self.dtidx2 = DatetimeIndex(range(A, B)) - self.dtidx3 = DatetimeIndex(range(N, B)) - - # integer - self.N = 1000000 - self.options = np.arange(self.N) - self.left = Index( - self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - self.right = Index( - self.options.take(np.random.permutation(self.N)[:(self.N // 2)])) - - # strings - N = 10000 - strs = tm.rands_array(10, N) - self.leftstr = Index(strs[:N * 2 // 3]) - self.rightstr = Index(strs[N // 3:]) - - def time_datetime_intersection(self): - self.rng.intersection(self.rng2) - - def time_datetime_union(self): - self.rng.union(self.rng2) - - def time_datetime_difference(self): - self.dtidx1.difference(self.dtidx2) + self.datetime_left = DatetimeIndex(range(N)) + self.datetime_right = DatetimeIndex(range(N, B)) def time_datetime_difference_disjoint(self): - self.dtidx1.difference(self.dtidx3) - - def time_datetime_symmetric_difference(self): - self.dtidx1.symmetric_difference(self.dtidx2) - - def time_index_datetime_intersection(self): - self.idx_rng.intersection(self.idx_rng2) - - def time_index_datetime_union(self): - self.idx_rng.union(self.idx_rng2) - - def time_int64_intersection(self): - self.left.intersection(self.right) - - def time_int64_union(self): - self.left.union(self.right) - - def time_int64_difference(self): - self.left.difference(self.right) - - def time_int64_symmetric_difference(self): - self.left.symmetric_difference(self.right) - - def time_str_difference(self): - self.leftstr.difference(self.rightstr) - - def time_str_symmetric_difference(self): - self.leftstr.symmetric_difference(self.rightstr) + 
self.datetime_left.difference(self.datetime_right) class Datetime(object): + goal_time = 0.2 def setup(self): - self.dr = pd.date_range('20000101', freq='D', periods=10000) + self.dr = date_range('20000101', freq='D', periods=10000) def time_is_dates_only(self): self.dr._is_dates_only -class Float64(object): - goal_time = 0.2 +class Ops(object): - def setup(self): - self.idx = tm.makeFloatIndex(1000000) - self.mask = ((np.arange(self.idx.size) % 3) == 0) - self.series_mask = Series(self.mask) + sample_time = 0.2 + params = ['float', 'int'] + param_names = ['dtype'] - self.baseidx = np.arange(1000000.0) + def setup(self, dtype): + N = 10**6 + indexes = {'int': 'makeIntIndex', 'float': 'makeFloatIndex'} + self.index = getattr(tm, indexes[dtype])(N) - def time_boolean_indexer(self): - self.idx[self.mask] + def time_add(self, dtype): + self.index + 2 - def time_boolean_series_indexer(self): - self.idx[self.series_mask] + def time_subtract(self, dtype): + self.index - 2 - def time_construct(self): - Index(self.baseidx) + def time_multiply(self, dtype): + self.index * 2 - def time_div(self): - (self.idx / 2) + def time_divide(self, dtype): + self.index / 2 - def time_get(self): - self.idx[1] - - def time_mul(self): - (self.idx * 2) + def time_modulo(self, dtype): + self.index % 2 - def time_slice_indexer_basic(self): - self.idx[:(-1)] - - def time_slice_indexer_even(self): - self.idx[::2] +class Range(object): -class StringIndex(object): goal_time = 0.2 def setup(self): - self.idx = tm.makeStringIndex(1000000) - self.mask = ((np.arange(1000000) % 3) == 0) - self.series_mask = Series(self.mask) - - def time_boolean_indexer(self): - self.idx[self.mask] - - def time_boolean_series_indexer(self): - self.idx[self.series_mask] - - def time_slice_indexer_basic(self): - self.idx[:(-1)] - - def time_slice_indexer_even(self): - self.idx[::2] - + self.idx_inc = RangeIndex(start=0, stop=10**7, step=3) + self.idx_dec = RangeIndex(start=10**7, stop=-1, step=-3) -class Multi1(object): - goal_time = 0.2 + def time_max(self): + self.idx_inc.max() - def setup(self): - (n, k) = (200, 5000) - self.levels = [np.arange(n), tm.makeStringIndex(n).values, (1000 + np.arange(n))] - self.labels = [np.random.choice(n, (k * n)) for lev in self.levels] - self.mi = MultiIndex(levels=self.levels, labels=self.labels) + def time_max_trivial(self): + self.idx_dec.max() - self.iterables = [tm.makeStringIndex(10000), range(20)] + def time_min(self): + self.idx_dec.min() - def time_duplicated(self): - self.mi.duplicated() + def time_min_trivial(self): + self.idx_inc.min() - def time_from_product(self): - MultiIndex.from_product(self.iterables) +class IndexAppend(object): -class Multi2(object): goal_time = 0.2 def setup(self): - self.n = ((((3 * 5) * 7) * 11) * (1 << 10)) - (low, high) = (((-1) << 12), (1 << 12)) - self.f = (lambda k: np.repeat(np.random.randint(low, high, (self.n // k)), k)) - self.i = np.random.permutation(self.n) - self.mi = MultiIndex.from_arrays([self.f(11), self.f(7), self.f(5), self.f(3), self.f(1)])[self.i] - self.a = np.repeat(np.arange(100), 1000) - self.b = np.tile(np.arange(1000), 100) - self.midx2 = MultiIndex.from_arrays([self.a, self.b]) - self.midx2 = self.midx2.take(np.random.permutation(np.arange(100000))) + N = 10000 + self.range_idx = RangeIndex(0, 100) + self.int_idx = self.range_idx.astype(int) + self.obj_idx = self.int_idx.astype(str) + self.range_idxs = [] + self.int_idxs = [] + self.object_idxs = [] + for i in range(1, N): + r_idx = RangeIndex(i * 100, (i + 1) * 100) + 
self.range_idxs.append(r_idx) + i_idx = r_idx.astype(int) + self.int_idxs.append(i_idx) + o_idx = i_idx.astype(str) + self.object_idxs.append(o_idx) - def time_sortlevel_int64(self): - self.mi.sortlevel() + def time_append_range_list(self): + self.range_idx.append(self.range_idxs) - def time_sortlevel_zero(self): - self.midx2.sortlevel(0) + def time_append_int_list(self): + self.int_idx.append(self.int_idxs) - def time_sortlevel_one(self): - self.midx2.sortlevel(1) + def time_append_obj_list(self): + self.obj_idx.append(self.object_idxs) -class Multi3(object): - goal_time = 0.2 - - def setup(self): - self.level1 = range(1000) - self.level2 = date_range(start='1/1/2012', periods=100) - self.mi = MultiIndex.from_product([self.level1, self.level2]) +class Indexing(object): - def time_datetime_level_values_full(self): - self.mi.copy().values + goal_time = 0.2 + params = ['String', 'Float', 'Int'] + param_names = ['dtype'] + + def setup(self, dtype): + N = 10**6 + self.idx = getattr(tm, 'make{}Index'.format(dtype))(N) + self.array_mask = (np.arange(N) % 3) == 0 + self.series_mask = Series(self.array_mask) + self.sorted = self.idx.sort_values() + half = N // 2 + self.non_unique = self.idx[:half].append(self.idx[:half]) + self.non_unique_sorted = self.sorted[:half].append(self.sorted[:half]) + self.key = self.sorted[N // 4] + + def time_boolean_array(self, dtype): + self.idx[self.array_mask] + + def time_boolean_series(self, dtype): + self.idx[self.series_mask] - def time_datetime_level_values_sliced(self): - self.mi[:10].values + def time_get(self, dtype): + self.idx[1] + def time_slice(self, dtype): + self.idx[:-1] -class Range(object): - goal_time = 0.2 - - def setup(self): - self.idx_inc = RangeIndex(start=0, stop=10**7, step=3) - self.idx_dec = RangeIndex(start=10**7, stop=-1, step=-3) + def time_slice_step(self, dtype): + self.idx[::2] - def time_max(self): - self.idx_inc.max() + def time_get_loc(self, dtype): + self.idx.get_loc(self.key) - def time_max_trivial(self): - self.idx_dec.max() + def time_get_loc_sorted(self, dtype): + self.sorted.get_loc(self.key) - def time_min(self): - self.idx_dec.min() + def time_get_loc_non_unique(self, dtype): + self.non_unique.get_loc(self.key) - def time_min_trivial(self): - self.idx_inc.min() + def time_get_loc_non_unique_sorted(self, dtype): + self.non_unique_sorted.get_loc(self.key) -class IndexOps(object): +class Float64IndexMethod(object): + # GH 13166 goal_time = 0.2 def setup(self): - N = 10000 - self.ridx = [RangeIndex(i * 100, (i + 1) * 100) for i in range(N)] - self.iidx = [idx.astype(int) for idx in self.ridx] - self.oidx = [idx.astype(str) for idx in self.iidx] - - def time_concat_range(self): - self.ridx[0].append(self.ridx[1:]) - - def time_concat_int(self): - self.iidx[0].append(self.iidx[1:]) + N = 100000 + a = np.arange(N) + self.ind = Float64Index(a * 4.8000000418824129e-08) - def time_concat_obj(self): - self.oidx[0].append(self.oidx[1:]) + def time_get_loc(self): + self.ind.get_loc(0) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index f3e7ebbbd33e8c..739ad6a3d278b7 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -1,305 +1,354 @@ -from .pandas_vb_common import * +import warnings +import numpy as np +import pandas.util.testing as tm +from pandas import (Series, DataFrame, MultiIndex, Int64Index, Float64Index, + IntervalIndex, CategoricalIndex, + IndexSlice, concat, date_range) +from .pandas_vb_common import setup, Panel # noqa -class Int64Indexing(object): - 
goal_time = 0.2
- def setup(self):
- self.s = Series(np.random.rand(1000000))
+class NumericSeriesIndexing(object):
- def time_getitem_scalar(self):
- self.s[800000]
+ goal_time = 0.2
+ params = [Int64Index, Float64Index]
+ param_names = ['index']
- def time_getitem_slice(self):
- self.s[:800000]
+ def setup(self, index):
+ N = 10**6
+ idx = index(range(N))
+ self.data = Series(np.random.rand(N), index=idx)
+ self.array = np.arange(10000)
+ self.array_list = self.array.tolist()
- def time_getitem_list_like(self):
- self.s[[800000]]
+ def time_getitem_scalar(self, index):
+ self.data[800000]
- def time_getitem_array(self):
- self.s[np.arange(10000)]
+ def time_getitem_slice(self, index):
+ self.data[:800000]
- def time_getitem_lists(self):
- self.s[np.arange(10000).tolist()]
+ def time_getitem_list_like(self, index):
+ self.data[[800000]]
- def time_iloc_array(self):
- self.s.iloc[np.arange(10000)]
+ def time_getitem_array(self, index):
+ self.data[self.array]
- def time_iloc_list_like(self):
- self.s.iloc[[800000]]
+ def time_getitem_lists(self, index):
+ self.data[self.array_list]
- def time_iloc_scalar(self):
- self.s.iloc[800000]
+ def time_iloc_array(self, index):
+ self.data.iloc[self.array]
- def time_iloc_slice(self):
- self.s.iloc[:800000]
+ def time_iloc_list_like(self, index):
+ self.data.iloc[[800000]]
- def time_ix_array(self):
- self.s.ix[np.arange(10000)]
+ def time_iloc_scalar(self, index):
+ self.data.iloc[800000]
- def time_ix_list_like(self):
- self.s.ix[[800000]]
+ def time_iloc_slice(self, index):
+ self.data.iloc[:800000]
- def time_ix_scalar(self):
- self.s.ix[800000]
+ def time_ix_array(self, index):
+ self.data.ix[self.array]
- def time_ix_slice(self):
- self.s.ix[:800000]
+ def time_ix_list_like(self, index):
+ self.data.ix[[800000]]
- def time_loc_array(self):
- self.s.loc[np.arange(10000)]
+ def time_ix_scalar(self, index):
+ self.data.ix[800000]
- def time_loc_list_like(self):
- self.s.loc[[800000]]
+ def time_ix_slice(self, index):
+ self.data.ix[:800000]
- def time_loc_scalar(self):
- self.s.loc[800000]
+ def time_loc_array(self, index):
+ self.data.loc[self.array]
- def time_loc_slice(self):
- self.s.loc[:800000]
+ def time_loc_list_like(self, index):
+ self.data.loc[[800000]]
+ def time_loc_scalar(self, index):
+ self.data.loc[800000]
-class StringIndexing(object):
- goal_time = 0.2
+ def time_loc_slice(self, index):
+ self.data.loc[:800000]
- def setup(self):
- self.index = tm.makeStringIndex(1000000)
- self.s = Series(np.random.rand(1000000), index=self.index)
- self.lbl = self.s.index[800000]
-
- def time_getitem_label_slice(self):
- self.s[:self.lbl]
- def time_getitem_pos_slice(self):
- self.s[:800000]
+class NonNumericSeriesIndexing(object):
- def time_get_value(self):
- self.s.get_value(self.lbl)
+ goal_time = 0.2
+ params = ['string', 'datetime']
+ param_names = ['index']
+
+ def setup(self, index):
+ N = 10**5
+ indexes = {'string': tm.makeStringIndex(N),
+ 'datetime': date_range('1900', periods=N, freq='s')}
+ index = indexes[index]
+ self.s = Series(np.random.rand(N), index=index)
+ self.lbl = index[80000]
+
+ def time_getitem_label_slice(self, index):
+ self.s[:self.lbl]
+ def time_getitem_pos_slice(self, index):
+ self.s[:80000]
-class DatetimeIndexing(object):
- goal_time = 0.2
+ def time_get_value(self, index):
+ with warnings.catch_warnings(record=True):
+ self.s.get_value(self.lbl)
- def setup(self):
- tm.N = 1000
- self.ts = tm.makeTimeSeries()
- self.dt = self.ts.index[500]
+ def time_getitem_scalar(self, index):
+ self.s[self.lbl]
- def 
time_getitem_scalar(self): - self.ts[self.dt] +class DataFrameStringIndexing(object): -class DataFrameIndexing(object): goal_time = 0.2 def setup(self): - self.index = tm.makeStringIndex(1000) - self.columns = tm.makeStringIndex(30) - self.df = DataFrame(np.random.randn(1000, 30), index=self.index, - columns=self.columns) - self.idx = self.index[100] - self.col = self.columns[10] - - self.df2 = DataFrame(np.random.randn(10000, 4), - columns=['A', 'B', 'C', 'D']) - self.indexer = (self.df2['B'] > 0) - self.obj_indexer = self.indexer.astype('O') - - # duptes - self.idx_dupe = (np.array(range(30)) * 99) - self.df3 = DataFrame({'A': ([0.1] * 1000), 'B': ([1] * 1000),}) - self.df3 = concat([self.df3, (2 * self.df3), (3 * self.df3)]) - - self.df_big = DataFrame(dict(A=(['foo'] * 1000000))) + index = tm.makeStringIndex(1000) + columns = tm.makeStringIndex(30) + self.df = DataFrame(np.random.randn(1000, 30), index=index, + columns=columns) + self.idx_scalar = index[100] + self.col_scalar = columns[10] + self.bool_indexer = self.df[self.col_scalar] > 0 + self.bool_obj_indexer = self.bool_indexer.astype(object) def time_get_value(self): - self.df.get_value(self.idx, self.col) + with warnings.catch_warnings(record=True): + self.df.get_value(self.idx_scalar, self.col_scalar) - def time_get_value_ix(self): - self.df.ix[(self.idx, self.col)] + def time_ix(self): + self.df.ix[self.idx_scalar, self.col_scalar] + + def time_loc(self): + self.df.loc[self.idx_scalar, self.col_scalar] def time_getitem_scalar(self): - self.df[self.col][self.idx] + self.df[self.col_scalar][self.idx_scalar] def time_boolean_rows(self): - self.df2[self.indexer] + self.df[self.bool_indexer] def time_boolean_rows_object(self): - self.df2[self.obj_indexer] - - def time_iloc_dups(self): - self.df3.iloc[self.idx_dupe] - - def time_loc_dups(self): - self.df3.loc[self.idx_dupe] + self.df[self.bool_obj_indexer] - def time_iloc_big(self): - self.df_big.iloc[:100, 0] +class DataFrameNumericIndexing(object): -class IndexingMethods(object): - # GH 13166 goal_time = 0.2 def setup(self): - a = np.arange(100000) - self.ind = pd.Float64Index(a * 4.8000000418824129e-08) + self.idx_dupe = np.array(range(30)) * 99 + self.df = DataFrame(np.random.randn(10000, 5)) + self.df_dup = concat([self.df, 2 * self.df, 3 * self.df]) + self.bool_indexer = [True] * 5000 + [False] * 5000 + + def time_iloc_dups(self): + self.df_dup.iloc[self.idx_dupe] + + def time_loc_dups(self): + self.df_dup.loc[self.idx_dupe] - self.s = Series(np.random.rand(100000)) - self.ts = Series(np.random.rand(100000), - index=date_range('2011-01-01', freq='S', periods=100000)) - self.indexer = ([True, False, True, True, False] * 20000) + def time_iloc(self): + self.df.iloc[:100, 0] - def time_get_loc_float(self): - self.ind.get_loc(0) + def time_loc(self): + self.df.loc[:100, 0] - def time_take_dtindex(self): - self.ts.take(self.indexer) + def time_bool_indexer(self): + self.df[self.bool_indexer] - def time_take_intindex(self): + +class Take(object): + + goal_time = 0.2 + params = ['int', 'datetime'] + param_names = ['index'] + + def setup(self, index): + N = 100000 + indexes = {'int': Int64Index(np.arange(N)), + 'datetime': date_range('2011-01-01', freq='S', periods=N)} + index = indexes[index] + self.s = Series(np.random.rand(N), index=index) + self.indexer = [True, False, True, True, False] * 20000 + + def time_take(self, index): self.s.take(self.indexer) class MultiIndexing(object): + goal_time = 0.2 def setup(self): - self.mi = MultiIndex.from_tuples([(x, y) for x in range(1000) 
for y in range(1000)]) - self.s = Series(np.random.randn(1000000), index=self.mi) + mi = MultiIndex.from_product([range(1000), range(1000)]) + self.s = Series(np.random.randn(1000000), index=mi) self.df = DataFrame(self.s) - # slicers - np.random.seed(1234) - self.idx = pd.IndexSlice - self.n = 100000 - self.mdt = pandas.DataFrame() - self.mdt['A'] = np.random.choice(range(10000, 45000, 1000), self.n) - self.mdt['B'] = np.random.choice(range(10, 400), self.n) - self.mdt['C'] = np.random.choice(range(1, 150), self.n) - self.mdt['D'] = np.random.choice(range(10000, 45000), self.n) - self.mdt['x'] = np.random.choice(range(400), self.n) - self.mdt['y'] = np.random.choice(range(25), self.n) - self.test_A = 25000 - self.test_B = 25 - self.test_C = 40 - self.test_D = 35000 - self.eps_A = 5000 - self.eps_B = 5 - self.eps_C = 5 - self.eps_D = 5000 - self.mdt2 = self.mdt.set_index(['A', 'B', 'C', 'D']).sortlevel() - self.miint = MultiIndex.from_product( - [np.arange(1000), - np.arange(1000)], names=['one', 'two']) - - import string - - self.mi_large = MultiIndex.from_product( - [np.arange(1000), np.arange(20), list(string.ascii_letters)], - names=['one', 'two', 'three']) - self.mi_med = MultiIndex.from_product( - [np.arange(1000), np.arange(10), list('A')], - names=['one', 'two', 'three']) - self.mi_small = MultiIndex.from_product( - [np.arange(100), list('A'), list('A')], - names=['one', 'two', 'three']) - - rng = np.random.RandomState(4) - size = 1 << 16 - self.mi_unused_levels = pd.MultiIndex.from_arrays([ - rng.randint(0, 1 << 13, size), - rng.randint(0, 1 << 10, size)])[rng.rand(size) < 0.1] - - def time_series_xs_mi_ix(self): + n = 100000 + self.mdt = DataFrame({'A': np.random.choice(range(10000, 45000, 1000), + n), + 'B': np.random.choice(range(10, 400), n), + 'C': np.random.choice(range(1, 150), n), + 'D': np.random.choice(range(10000, 45000), n), + 'x': np.random.choice(range(400), n), + 'y': np.random.choice(range(25), n)}) + self.idx = IndexSlice[20000:30000, 20:30, 35:45, 30000:40000] + self.mdt = self.mdt.set_index(['A', 'B', 'C', 'D']).sort_index() + + def time_series_ix(self): self.s.ix[999] - def time_frame_xs_mi_ix(self): + def time_frame_ix(self): self.df.ix[999] - def time_multiindex_slicers(self): - self.mdt2.loc[self.idx[ - (self.test_A - self.eps_A):(self.test_A + self.eps_A), - (self.test_B - self.eps_B):(self.test_B + self.eps_B), - (self.test_C - self.eps_C):(self.test_C + self.eps_C), - (self.test_D - self.eps_D):(self.test_D + self.eps_D)], :] + def time_index_slice(self): + self.mdt.loc[self.idx, :] - def time_multiindex_get_indexer(self): - self.miint.get_indexer( - np.array([(0, 10), (0, 11), (0, 12), - (0, 13), (0, 14), (0, 15), - (0, 16), (0, 17), (0, 18), - (0, 19)], dtype=object)) - def time_multiindex_large_get_loc(self): - self.mi_large.get_loc((999, 19, 'Z')) +class IntervalIndexing(object): - def time_multiindex_large_get_loc_warm(self): - for _ in range(1000): - self.mi_large.get_loc((999, 19, 'Z')) + goal_time = 0.2 - def time_multiindex_med_get_loc(self): - self.mi_med.get_loc((999, 9, 'A')) + def setup_cache(self): + idx = IntervalIndex.from_breaks(np.arange(1000001)) + monotonic = Series(np.arange(1000000), index=idx) + return monotonic - def time_multiindex_med_get_loc_warm(self): - for _ in range(1000): - self.mi_med.get_loc((999, 9, 'A')) + def time_getitem_scalar(self, monotonic): + monotonic[80000] - def time_multiindex_string_get_loc(self): - self.mi_small.get_loc((99, 'A', 'A')) + def time_loc_scalar(self, monotonic): + monotonic.loc[80000] - def 
time_multiindex_small_get_loc_warm(self): - for _ in range(1000): - self.mi_small.get_loc((99, 'A', 'A')) + def time_getitem_list(self, monotonic): + monotonic[80000:] - def time_is_monotonic(self): - self.miint.is_monotonic + def time_loc_list(self, monotonic): + monotonic.loc[80000:] - def time_remove_unused_levels(self): - self.mi_unused_levels.remove_unused_levels() +class CategoricalIndexIndexing(object): -class IntervalIndexing(object): goal_time = 0.2 + params = ['monotonic_incr', 'monotonic_decr', 'non_monotonic'] + param_names = ['index'] - def setup(self): - self.monotonic = Series(np.arange(1000000), - index=IntervalIndex.from_breaks(np.arange(1000001))) + def setup(self, index): + N = 10**5 + values = list('a' * N + 'b' * N + 'c' * N) + indices = { + 'monotonic_incr': CategoricalIndex(values), + 'monotonic_decr': CategoricalIndex(reversed(values)), + 'non_monotonic': CategoricalIndex(list('abc' * N))} + self.data = indices[index] - def time_getitem_scalar(self): - self.monotonic[80000] + self.int_scalar = 10000 + self.int_list = list(range(10000)) + + self.cat_scalar = 'b' + self.cat_list = ['a', 'c'] + + def time_getitem_scalar(self, index): + self.data[self.int_scalar] + + def time_getitem_slice(self, index): + self.data[:self.int_scalar] - def time_loc_scalar(self): - self.monotonic.loc[80000] + def time_getitem_list_like(self, index): + self.data[[self.int_scalar]] - def time_getitem_list(self): - self.monotonic[80000:] + def time_getitem_list(self, index): + self.data[self.int_list] - def time_loc_list(self): - self.monotonic.loc[80000:] + def time_getitem_bool_array(self, index): + self.data[self.data == self.cat_scalar] + + def time_get_loc_scalar(self, index): + self.data.get_loc(self.cat_scalar) + + def time_get_indexer_list(self, index): + self.data.get_indexer(self.cat_list) class PanelIndexing(object): + goal_time = 0.2 def setup(self): - self.p = Panel(np.random.randn(100, 100, 100)) - self.inds = range(0, 100, 10) + with warnings.catch_warnings(record=True): + self.p = Panel(np.random.randn(100, 100, 100)) + self.inds = range(0, 100, 10) def time_subset(self): - self.p.ix[(self.inds, self.inds, self.inds)] + with warnings.catch_warnings(record=True): + self.p.ix[(self.inds, self.inds, self.inds)] + + +class MethodLookup(object): + + goal_time = 0.2 + + def setup_cache(self): + s = Series() + return s + + def time_lookup_iloc(self, s): + s.iloc + + def time_lookup_ix(self, s): + s.ix + + def time_lookup_loc(self, s): + s.loc + +class GetItemSingleColumn(object): -class IndexerLookup(object): goal_time = 0.2 def setup(self): - self.s = Series(range(10)) + self.df_string_col = DataFrame(np.random.randn(3000, 1), columns=['A']) + self.df_int_col = DataFrame(np.random.randn(3000, 1)) - def time_lookup_iloc(self): - self.s.iloc + def time_frame_getitem_single_column_label(self): + self.df_string_col['A'] - def time_lookup_ix(self): - self.s.ix + def time_frame_getitem_single_column_int(self): + self.df_int_col[0] - def time_lookup_loc(self): - self.s.loc + +class AssignTimeseriesIndex(object): + + goal_time = 0.2 + + def setup(self): + N = 100000 + idx = date_range('1/1/2000', periods=N, freq='H') + self.df = DataFrame(np.random.randn(N, 1), columns=['A'], index=idx) + + def time_frame_assign_timeseries_index(self): + self.df['date'] = self.df.index + + +class InsertColumns(object): + + goal_time = 0.2 + + def setup(self): + self.N = 10**3 + self.df = DataFrame(index=range(self.N)) + + def time_insert(self): + np.random.seed(1234) + for i in range(100): + 
self.df.insert(0, i, np.random.randn(self.N), + allow_duplicates=True) + + def time_assign_with_setitem(self): + np.random.seed(1234) + for i in range(100): + self.df[i] = np.random.randn(self.N) diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index dc1d6de73f8ae3..16d9e7cd73cbb1 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -1,77 +1,80 @@ -from .pandas_vb_common import * -import pandas as pd +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, Series, to_numeric +from .pandas_vb_common import numeric_dtypes, lib, setup # noqa -class DtypeInfer(object): - goal_time = 0.2 +class NumericInferOps(object): # from GH 7332 + goal_time = 0.2 + params = numeric_dtypes + param_names = ['dtype'] + + def setup(self, dtype): + N = 5 * 10**5 + self.df = DataFrame({'A': np.arange(N).astype(dtype), + 'B': np.arange(N).astype(dtype)}) + + def time_add(self, dtype): + self.df['A'] + self.df['B'] + + def time_subtract(self, dtype): + self.df['A'] - self.df['B'] - def setup(self): - self.N = 500000 - self.df_int64 = DataFrame(dict(A=np.arange(self.N, dtype='int64'), - B=np.arange(self.N, dtype='int64'))) - self.df_int32 = DataFrame(dict(A=np.arange(self.N, dtype='int32'), - B=np.arange(self.N, dtype='int32'))) - self.df_uint32 = DataFrame(dict(A=np.arange(self.N, dtype='uint32'), - B=np.arange(self.N, dtype='uint32'))) - self.df_float64 = DataFrame(dict(A=np.arange(self.N, dtype='float64'), - B=np.arange(self.N, dtype='float64'))) - self.df_float32 = DataFrame(dict(A=np.arange(self.N, dtype='float32'), - B=np.arange(self.N, dtype='float32'))) - self.df_datetime64 = DataFrame(dict(A=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'), - B=pd.to_datetime(np.arange(self.N, dtype='int64'), unit='ms'))) - self.df_timedelta64 = DataFrame(dict(A=(self.df_datetime64['A'] - self.df_datetime64['B']), - B=self.df_datetime64['B'])) + def time_multiply(self, dtype): + self.df['A'] * self.df['B'] - def time_int64(self): - (self.df_int64['A'] + self.df_int64['B']) + def time_divide(self, dtype): + self.df['A'] / self.df['B'] - def time_int32(self): - (self.df_int32['A'] + self.df_int32['B']) + def time_modulo(self, dtype): + self.df['A'] % self.df['B'] - def time_uint32(self): - (self.df_uint32['A'] + self.df_uint32['B']) - def time_float64(self): - (self.df_float64['A'] + self.df_float64['B']) +class DateInferOps(object): + # from GH 7332 + goal_time = 0.2 + + def setup_cache(self): + N = 5 * 10**5 + df = DataFrame({'datetime64': np.arange(N).astype('datetime64[ms]')}) + df['timedelta'] = df['datetime64'] - df['datetime64'] + return df - def time_float32(self): - (self.df_float32['A'] + self.df_float32['B']) + def time_subtract_datetimes(self, df): + df['datetime64'] - df['datetime64'] - def time_datetime64(self): - (self.df_datetime64['A'] - self.df_datetime64['B']) + def time_timedelta_plus_datetime(self, df): + df['timedelta'] + df['datetime64'] - def time_timedelta64_1(self): - (self.df_timedelta64['A'] + self.df_timedelta64['B']) + def time_add_timedeltas(self, df): + df['timedelta'] + df['timedelta'] - def time_timedelta64_2(self): - (self.df_timedelta64['A'] + self.df_timedelta64['A']) +class ToNumeric(object): -class to_numeric(object): goal_time = 0.2 + params = ['ignore', 'coerce'] + param_names = ['errors'] - def setup(self): - self.n = 10000 - self.float = Series(np.random.randn(self.n * 100)) + def setup(self, errors): + N = 10000 + self.float = Series(np.random.randn(N)) self.numstr = 
self.float.astype('str') - self.str = Series(tm.makeStringIndex(self.n)) + self.str = Series(tm.makeStringIndex(N)) - def time_from_float(self): - pd.to_numeric(self.float) + def time_from_float(self, errors): + to_numeric(self.float, errors=errors) - def time_from_numeric_str(self): - pd.to_numeric(self.numstr) + def time_from_numeric_str(self, errors): + to_numeric(self.numstr, errors=errors) - def time_from_str_ignore(self): - pd.to_numeric(self.str, errors='ignore') + def time_from_str(self, errors): + to_numeric(self.str, errors=errors) - def time_from_str_coerce(self): - pd.to_numeric(self.str, errors='coerce') - -class to_numeric_downcast(object): +class ToNumericDowncast(object): param_names = ['dtype', 'downcast'] params = [['string-float', 'string-int', 'string-nint', 'datetime64', @@ -81,37 +84,30 @@ class to_numeric_downcast(object): N = 500000 N2 = int(N / 2) - data_dict = { - 'string-int': (['1'] * N2) + ([2] * N2), - 'string-nint': (['-1'] * N2) + ([2] * N2), - 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], - dtype='datetime64[D]'), N), - 'string-float': (['1.1'] * N2) + ([2] * N2), - 'int-list': ([1] * N2) + ([2] * N2), - 'int32': np.repeat(np.int32(1), N) - } + data_dict = {'string-int': ['1'] * N2 + [2] * N2, + 'string-nint': ['-1'] * N2 + [2] * N2, + 'datetime64': np.repeat(np.array(['1970-01-01', '1970-01-02'], + dtype='datetime64[D]'), N), + 'string-float': ['1.1'] * N2 + [2] * N2, + 'int-list': [1] * N2 + [2] * N2, + 'int32': np.repeat(np.int32(1), N)} def setup(self, dtype, downcast): self.data = self.data_dict[dtype] def time_downcast(self, dtype, downcast): - pd.to_numeric(self.data, downcast=downcast) + to_numeric(self.data, downcast=downcast) class MaybeConvertNumeric(object): - def setup(self): - n = 1000000 - arr = np.repeat([2**63], n) - arr = arr + np.arange(n).astype('uint64') - arr = np.array([arr[i] if i%2 == 0 else - str(arr[i]) for i in range(n)], - dtype=object) - - arr[-1] = -1 - self.data = arr - self.na_values = set() - - def time_convert(self): - lib.maybe_convert_numeric(self.data, self.na_values, - coerce_numeric=False) + def setup_cache(self): + N = 10**6 + arr = np.repeat([2**63], N) + np.arange(N).astype('uint64') + data = arr.astype(object) + data[1::2] = arr[1::2].astype(str) + data[-1] = -1 + return data + + def time_convert(self, data): + lib.maybe_convert_numeric(data, set(), coerce_numeric=False) diff --git a/doc/sphinxext/ipython_sphinxext/__init__.py b/asv_bench/benchmarks/io/__init__.py similarity index 100% rename from doc/sphinxext/ipython_sphinxext/__init__.py rename to asv_bench/benchmarks/io/__init__.py diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py new file mode 100644 index 00000000000000..2d4bdc7ae812a0 --- /dev/null +++ b/asv_bench/benchmarks/io/csv.py @@ -0,0 +1,229 @@ +import random +import timeit +import string + +import numpy as np +import pandas.util.testing as tm +from pandas import DataFrame, Categorical, date_range, read_csv +from pandas.compat import PY2 +from pandas.compat import cStringIO as StringIO + +from ..pandas_vb_common import setup, BaseIO # noqa + + +class ToCSV(BaseIO): + + goal_time = 0.2 + fname = '__test__.csv' + params = ['wide', 'long', 'mixed'] + param_names = ['kind'] + + def setup(self, kind): + wide_frame = DataFrame(np.random.randn(3000, 30)) + long_frame = DataFrame({'A': np.arange(50000), + 'B': np.arange(50000) + 1., + 'C': np.arange(50000) + 2., + 'D': np.arange(50000) + 3.}) + mixed_frame = DataFrame({'float': np.random.randn(5000), + 'int': 
np.random.randn(5000).astype(int), + 'bool': (np.arange(5000) % 2) == 0, + 'datetime': date_range('2001', + freq='s', + periods=5000), + 'object': ['foo'] * 5000}) + mixed_frame.loc[30:500, 'float'] = np.nan + data = {'wide': wide_frame, + 'long': long_frame, + 'mixed': mixed_frame} + self.df = data[kind] + + def time_frame(self, kind): + self.df.to_csv(self.fname) + + +class ToCSVDatetime(BaseIO): + + goal_time = 0.2 + fname = '__test__.csv' + + def setup(self): + rng = date_range('1/1/2000', periods=1000) + self.data = DataFrame(rng, index=rng) + + def time_frame_date_formatting(self): + self.data.to_csv(self.fname, date_format='%Y%m%d') + + +class StringIORewind(object): + + def data(self, stringio_object): + stringio_object.seek(0) + return stringio_object + + +class ReadCSVDInferDatetimeFormat(StringIORewind): + + goal_time = 0.2 + params = ([True, False], ['custom', 'iso8601', 'ymd']) + param_names = ['infer_datetime_format', 'format'] + + def setup(self, infer_datetime_format, format): + rng = date_range('1/1/2000', periods=1000) + formats = {'custom': '%m/%d/%Y %H:%M:%S.%f', + 'iso8601': '%Y-%m-%d %H:%M:%S', + 'ymd': '%Y%m%d'} + dt_format = formats[format] + self.StringIO_input = StringIO('\n'.join( + rng.strftime(dt_format).tolist())) + + def time_read_csv(self, infer_datetime_format, format): + read_csv(self.data(self.StringIO_input), + header=None, names=['foo'], parse_dates=['foo'], + infer_datetime_format=infer_datetime_format) + + +class ReadCSVSkipRows(BaseIO): + + goal_time = 0.2 + fname = '__test__.csv' + params = [None, 10000] + param_names = ['skiprows'] + + def setup(self, skiprows): + N = 20000 + index = tm.makeStringIndex(N) + df = DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N), + 'string1': ['foo'] * N, + 'bool1': [True] * N, + 'int1': np.random.randint(0, N, size=N)}, + index=index) + df.to_csv(self.fname) + + def time_skipprows(self, skiprows): + read_csv(self.fname, skiprows=skiprows) + + +class ReadUint64Integers(StringIORewind): + + goal_time = 0.2 + + def setup(self): + self.na_values = [2**63 + 500] + arr = np.arange(10000).astype('uint64') + 2**63 + self.data1 = StringIO('\n'.join(arr.astype(str).tolist())) + arr = arr.astype(object) + arr[500] = -1 + self.data2 = StringIO('\n'.join(arr.astype(str).tolist())) + + def time_read_uint64(self): + read_csv(self.data(self.data1), header=None, names=['foo']) + + def time_read_uint64_neg_values(self): + read_csv(self.data(self.data2), header=None, names=['foo']) + + def time_read_uint64_na_values(self): + read_csv(self.data(self.data1), header=None, names=['foo'], + na_values=self.na_values) + + +class ReadCSVThousands(BaseIO): + + goal_time = 0.2 + fname = '__test__.csv' + params = ([',', '|'], [None, ',']) + param_names = ['sep', 'thousands'] + + def setup(self, sep, thousands): + N = 10000 + K = 8 + data = np.random.randn(N, K) * np.random.randint(100, 10000, (N, K)) + df = DataFrame(data) + if thousands is not None: + fmt = ':{}'.format(thousands) + fmt = '{' + fmt + '}' + df = df.applymap(lambda x: fmt.format(x)) + df.to_csv(self.fname, sep=sep) + + def time_thousands(self, sep, thousands): + read_csv(self.fname, sep=sep, thousands=thousands) + + +class ReadCSVComment(StringIORewind): + + goal_time = 0.2 + + def setup(self): + data = ['A,B,C'] + (['1,2,3 # comment'] * 100000) + self.StringIO_input = StringIO('\n'.join(data)) + + def time_comment(self): + read_csv(self.data(self.StringIO_input), comment='#', + header=None, names=list('abc')) + + +class 
ReadCSVFloatPrecision(StringIORewind): + + goal_time = 0.2 + params = ([',', ';'], ['.', '_'], [None, 'high', 'round_trip']) + param_names = ['sep', 'decimal', 'float_precision'] + + def setup(self, sep, decimal, float_precision): + floats = [''.join(random.choice(string.digits) for _ in range(28)) + for _ in range(15)] + rows = sep.join(['0{}'.format(decimal) + '{}'] * 3) + '\n' + data = rows * 5 + data = data.format(*floats) * 200 # 1000 x 3 strings csv + self.StringIO_input = StringIO(data) + + def time_read_csv(self, sep, decimal, float_precision): + read_csv(self.data(self.StringIO_input), sep=sep, header=None, + names=list('abc'), float_precision=float_precision) + + def time_read_csv_python_engine(self, sep, decimal, float_precision): + read_csv(self.data(self.StringIO_input), sep=sep, header=None, engine='python', + float_precision=None, names=list('abc')) + + +class ReadCSVCategorical(BaseIO): + + goal_time = 0.2 + fname = '__test__.csv' + + def setup(self): + N = 100000 + group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee'] + df = DataFrame(np.random.choice(group1, (N, 3)), columns=list('abc')) + df.to_csv(self.fname, index=False) + + def time_convert_post(self): + read_csv(self.fname).apply(Categorical) + + def time_convert_direct(self): + read_csv(self.fname, dtype='category') + + +class ReadCSVParseDates(StringIORewind): + + goal_time = 0.2 + + def setup(self): + data = """{},19:00:00,18:56:00,0.8100,2.8100,7.2000,0.0000,280.0000\n + {},20:00:00,19:56:00,0.0100,2.2100,7.2000,0.0000,260.0000\n + {},21:00:00,20:56:00,-0.5900,2.2100,5.7000,0.0000,280.0000\n + {},21:00:00,21:18:00,-0.9900,2.0100,3.6000,0.0000,270.0000\n + {},22:00:00,21:56:00,-0.5900,1.7100,5.1000,0.0000,290.0000\n + """ + two_cols = ['KORD,19990127'] * 5 + data = data.format(*two_cols) + self.StringIO_input = StringIO(data) + + def time_multiple_date(self): + read_csv(self.data(self.StringIO_input), sep=',', header=None, + names=list(string.digits[:9]), + parse_dates=[[1, 2], [1, 3]]) + + def time_baseline(self): + read_csv(self.data(self.StringIO_input), sep=',', header=None, + parse_dates=[1], + names=list(string.digits[:9])) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py new file mode 100644 index 00000000000000..58ab6bb8046c59 --- /dev/null +++ b/asv_bench/benchmarks/io/excel.py @@ -0,0 +1,36 @@ +import numpy as np +from pandas import DataFrame, date_range, ExcelWriter, read_excel +from pandas.compat import BytesIO +import pandas.util.testing as tm + +from ..pandas_vb_common import BaseIO, setup # noqa + + +class Excel(object): + + goal_time = 0.2 + params = ['openpyxl', 'xlsxwriter', 'xlwt'] + param_names = ['engine'] + + def setup(self, engine): + N = 2000 + C = 5 + self.df = DataFrame(np.random.randn(N, C), + columns=['float{}'.format(i) for i in range(C)], + index=date_range('20000101', periods=N, freq='H')) + self.df['object'] = tm.makeStringIndex(N) + self.bio_read = BytesIO() + self.writer_read = ExcelWriter(self.bio_read, engine=engine) + self.df.to_excel(self.writer_read, sheet_name='Sheet1') + self.writer_read.save() + self.bio_read.seek(0) + + def time_read_excel(self, engine): + read_excel(self.bio_read) + + def time_write_excel(self, engine): + bio_write = BytesIO() + bio_write.seek(0) + writer_write = ExcelWriter(bio_write, engine=engine) + self.df.to_excel(writer_write, sheet_name='Sheet1') + writer_write.save() diff --git a/asv_bench/benchmarks/io/hdf.py b/asv_bench/benchmarks/io/hdf.py new file mode 100644 index 00000000000000..4b6e1d69af92d7 
--- /dev/null +++ b/asv_bench/benchmarks/io/hdf.py @@ -0,0 +1,151 @@ +import warnings + +import numpy as np +from pandas import DataFrame, Panel, date_range, HDFStore, read_hdf +import pandas.util.testing as tm + +from ..pandas_vb_common import BaseIO, setup # noqa + + +class HDFStoreDataFrame(BaseIO): + + goal_time = 0.2 + + def setup(self): + N = 25000 + index = tm.makeStringIndex(N) + self.df = DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N)}, + index=index) + self.df_mixed = DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N), + 'string1': ['foo'] * N, + 'bool1': [True] * N, + 'int1': np.random.randint(0, N, size=N)}, + index=index) + self.df_wide = DataFrame(np.random.randn(N, 100)) + self.start_wide = self.df_wide.index[10000] + self.stop_wide = self.df_wide.index[15000] + self.df2 = DataFrame({'float1': np.random.randn(N), + 'float2': np.random.randn(N)}, + index=date_range('1/1/2000', periods=N)) + self.start = self.df2.index[10000] + self.stop = self.df2.index[15000] + self.df_wide2 = DataFrame(np.random.randn(N, 100), + index=date_range('1/1/2000', periods=N)) + self.df_dc = DataFrame(np.random.randn(N, 10), + columns=['C%03d' % i for i in range(10)]) + + self.fname = '__test__.h5' + + self.store = HDFStore(self.fname) + self.store.put('fixed', self.df) + self.store.put('fixed_mixed', self.df_mixed) + self.store.append('table', self.df2) + self.store.append('table_mixed', self.df_mixed) + self.store.append('table_wide', self.df_wide) + self.store.append('table_wide2', self.df_wide2) + + def teardown(self): + self.store.close() + self.remove(self.fname) + + def time_read_store(self): + self.store.get('fixed') + + def time_read_store_mixed(self): + self.store.get('fixed_mixed') + + def time_write_store(self): + self.store.put('fixed_write', self.df) + + def time_write_store_mixed(self): + self.store.put('fixed_mixed_write', self.df_mixed) + + def time_read_store_table_mixed(self): + self.store.select('table_mixed') + + def time_write_store_table_mixed(self): + self.store.append('table_mixed_write', self.df_mixed) + + def time_read_store_table(self): + self.store.select('table') + + def time_write_store_table(self): + self.store.append('table_write', self.df) + + def time_read_store_table_wide(self): + self.store.select('table_wide') + + def time_write_store_table_wide(self): + self.store.append('table_wide_write', self.df_wide) + + def time_write_store_table_dc(self): + self.store.append('table_dc_write', self.df_dc, data_columns=True) + + def time_query_store_table_wide(self): + self.store.select('table_wide', where="index > self.start_wide and " + "index < self.stop_wide") + + def time_query_store_table(self): + self.store.select('table', where="index > self.start and " + "index < self.stop") + + def time_store_repr(self): + repr(self.store) + + def time_store_str(self): + str(self.store) + + def time_store_info(self): + self.store.info() + + +class HDFStorePanel(BaseIO): + + goal_time = 0.2 + + def setup(self): + self.fname = '__test__.h5' + with warnings.catch_warnings(record=True): + self.p = Panel(np.random.randn(20, 1000, 25), + items=['Item%03d' % i for i in range(20)], + major_axis=date_range('1/1/2000', periods=1000), + minor_axis=['E%03d' % i for i in range(25)]) + self.store = HDFStore(self.fname) + self.store.append('p1', self.p) + + def teardown(self): + self.store.close() + self.remove(self.fname) + + def time_read_store_table_panel(self): + with warnings.catch_warnings(record=True): + self.store.select('p1') + + def 
time_write_store_table_panel(self):
+        with warnings.catch_warnings(record=True):
+            self.store.append('p2', self.p)
+
+
+class HDF(BaseIO):
+
+    goal_time = 0.2
+    params = ['table', 'fixed']
+    param_names = ['format']
+
+    def setup(self, format):
+        self.fname = '__test__.h5'
+        N = 100000
+        C = 5
+        self.df = DataFrame(np.random.randn(N, C),
+                            columns=['float{}'.format(i) for i in range(C)],
+                            index=date_range('20000101', periods=N, freq='H'))
+        self.df['object'] = tm.makeStringIndex(N)
+        self.df.to_hdf(self.fname, 'df', format=format)
+
+    def time_read_hdf(self, format):
+        read_hdf(self.fname, 'df')
+
+    def time_write_hdf(self, format):
+        self.df.to_hdf(self.fname, 'df', format=format)
diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
new file mode 100644
index 00000000000000..acfdd327c3b518
--- /dev/null
+++ b/asv_bench/benchmarks/io/json.py
@@ -0,0 +1,131 @@
+import numpy as np
+import pandas.util.testing as tm
+from pandas import DataFrame, date_range, timedelta_range, concat, read_json
+
+from ..pandas_vb_common import setup, BaseIO  # noqa
+
+
+class ReadJSON(BaseIO):
+
+    goal_time = 0.2
+    fname = "__test__.json"
+    params = (['split', 'index', 'records'], ['int', 'datetime'])
+    param_names = ['orient', 'index']
+
+    def setup(self, orient, index):
+        N = 100000
+        indexes = {'int': np.arange(N),
+                   'datetime': date_range('20000101', periods=N, freq='H')}
+        df = DataFrame(np.random.randn(N, 5),
+                       columns=['float_{}'.format(i) for i in range(5)],
+                       index=indexes[index])
+        df.to_json(self.fname, orient=orient)
+
+    def time_read_json(self, orient, index):
+        read_json(self.fname, orient=orient)
+
+
+class ReadJSONLines(BaseIO):
+
+    goal_time = 0.2
+    fname = "__test_lines__.json"
+    params = ['int', 'datetime']
+    param_names = ['index']
+
+    def setup(self, index):
+        N = 100000
+        indexes = {'int': np.arange(N),
+                   'datetime': date_range('20000101', periods=N, freq='H')}
+        df = DataFrame(np.random.randn(N, 5),
+                       columns=['float_{}'.format(i) for i in range(5)],
+                       index=indexes[index])
+        df.to_json(self.fname, orient='records', lines=True)
+
+    def time_read_json_lines(self, index):
+        read_json(self.fname, orient='records', lines=True)
+
+    def time_read_json_lines_concat(self, index):
+        concat(read_json(self.fname, orient='records', lines=True,
+                         chunksize=25000))
+
+    def peakmem_read_json_lines(self, index):
+        read_json(self.fname, orient='records', lines=True)
+
+    def peakmem_read_json_lines_concat(self, index):
+        concat(read_json(self.fname, orient='records', lines=True,
+                         chunksize=25000))
+
+
+class ToJSON(BaseIO):
+
+    goal_time = 0.2
+    fname = "__test__.json"
+    params = ['split', 'columns', 'index']
+    param_names = ['orient']
+
+    def setup(self, orient):
+        N = 10**5
+        ncols = 5
+        index = date_range('20000101', periods=N, freq='H')
+        timedeltas = timedelta_range(start=1, periods=N, freq='s')
+        datetimes = date_range(start=1, periods=N, freq='s')
+        ints = np.random.randint(100000000, size=N)
+        floats = np.random.randn(N)
+        strings = tm.makeStringIndex(N)
+        self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
+        self.df_date_idx = DataFrame(np.random.randn(N, ncols), index=index)
+        self.df_td_int_ts = DataFrame({'td_1': timedeltas,
+                                       'td_2': timedeltas,
+                                       'int_1': ints,
+                                       'int_2': ints,
+                                       'ts_1': datetimes,
+                                       'ts_2': datetimes},
+                                      index=index)
+        self.df_int_floats = DataFrame({'int_1': ints,
+                                        'int_2': ints,
+                                        'int_3': ints,
+                                        'float_1': floats,
+                                        'float_2': floats,
+                                        'float_3': floats},
+                                       index=index)
+        self.df_int_float_str = DataFrame({'int_1': ints,
+                                           'int_2': ints,
+                                           'float_1': floats,
+                                           'float_2': floats,
+                                           'str_1': strings,
+                                           'str_2': strings},
+                                          index=index)
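+
+    # Note: the ``*_lines`` methods below always serialize with
+    # ``orient='records', lines=True``, so the parametrized ``orient``
+    # only drives the plain ``to_json`` benchmarks above them.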
+
+    def time_floats_with_int_index(self, orient):
+        self.df.to_json(self.fname, orient=orient)
+
+    def time_floats_with_dt_index(self, orient):
+        self.df_date_idx.to_json(self.fname, orient=orient)
+
+    def time_delta_int_tstamp(self, orient):
+        self.df_td_int_ts.to_json(self.fname, orient=orient)
+
+    def time_float_int(self, orient):
+        self.df_int_floats.to_json(self.fname, orient=orient)
+
+    def time_float_int_str(self, orient):
+        self.df_int_float_str.to_json(self.fname, orient=orient)
+
+    def time_floats_with_int_index_lines(self, orient):
+        self.df.to_json(self.fname, orient='records', lines=True)
+
+    def time_floats_with_dt_index_lines(self, orient):
+        self.df_date_idx.to_json(self.fname, orient='records', lines=True)
+
+    def time_delta_int_tstamp_lines(self, orient):
+        self.df_td_int_ts.to_json(self.fname, orient='records', lines=True)
+
+    def time_float_int_lines(self, orient):
+        self.df_int_floats.to_json(self.fname, orient='records', lines=True)
+
+    def time_float_int_str_lines(self, orient):
+        self.df_int_float_str.to_json(self.fname, orient='records', lines=True)
diff --git a/asv_bench/benchmarks/io/msgpack.py b/asv_bench/benchmarks/io/msgpack.py
new file mode 100644
index 00000000000000..8ccce01117ca40
--- /dev/null
+++ b/asv_bench/benchmarks/io/msgpack.py
@@ -0,0 +1,26 @@
+import numpy as np
+from pandas import DataFrame, date_range, read_msgpack
+import pandas.util.testing as tm
+
+from ..pandas_vb_common import BaseIO, setup  # noqa
+
+
+class MSGPack(BaseIO):
+
+    goal_time = 0.2
+
+    def setup(self):
+        self.fname = '__test__.msg'
+        N = 100000
+        C = 5
+        self.df = DataFrame(np.random.randn(N, C),
+                            columns=['float{}'.format(i) for i in range(C)],
+                            index=date_range('20000101', periods=N, freq='H'))
+        self.df['object'] = tm.makeStringIndex(N)
+        self.df.to_msgpack(self.fname)
+
+    def time_read_msgpack(self):
+        read_msgpack(self.fname)
+
+    def time_write_msgpack(self):
+        self.df.to_msgpack(self.fname)
diff --git a/asv_bench/benchmarks/io/pickle.py b/asv_bench/benchmarks/io/pickle.py
new file mode 100644
index 00000000000000..2ad0fcca6eb26e
--- /dev/null
+++ b/asv_bench/benchmarks/io/pickle.py
@@ -0,0 +1,26 @@
+import numpy as np
+from pandas import DataFrame, date_range, read_pickle
+import pandas.util.testing as tm
+
+from ..pandas_vb_common import BaseIO, setup  # noqa
+
+
+class Pickle(BaseIO):
+
+    goal_time = 0.2
+
+    def setup(self):
+        self.fname = '__test__.pkl'
+        N = 100000
+        C = 5
+        self.df = DataFrame(np.random.randn(N, C),
+                            columns=['float{}'.format(i) for i in range(C)],
+                            index=date_range('20000101', periods=N, freq='H'))
+        self.df['object'] = tm.makeStringIndex(N)
+        self.df.to_pickle(self.fname)
+
+    def time_read_pickle(self):
+        read_pickle(self.fname)
+
+    def time_write_pickle(self):
+        self.df.to_pickle(self.fname)
diff --git a/asv_bench/benchmarks/io/sas.py b/asv_bench/benchmarks/io/sas.py
new file mode 100644
index 00000000000000..526c524de7fff7
--- /dev/null
+++ b/asv_bench/benchmarks/io/sas.py
@@ -0,0 +1,24 @@
+import os
+
+from pandas import read_sas
+
+
+class SAS(object):
+
+    goal_time = 0.2
+    params = ['sas7bdat', 'xport']
+    param_names = ['format']
+
+    def setup(self, format):
+        # Read files that are located in 'pandas/tests/io/sas/data'
+        files = {'sas7bdat': 'test1.sas7bdat', 'xport': 'paxraw_d_short.xpt'}
+        file = files[format]
+        paths = [os.path.dirname(__file__), '..', '..', '..', 'pandas',
+                 'tests', 'io', 'sas', 'data', file]
+        self.f = os.path.join(*paths)
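+
+    # ``format`` selects one of two sample files bundled with the pandas
+    # source tree, so the benchmark needs no network access.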
+
+    def time_read_sas(self, format):
+        read_sas(self.f, format=format)
diff --git a/asv_bench/benchmarks/io/sql.py b/asv_bench/benchmarks/io/sql.py
new file mode 100644
index 00000000000000..ef4e501e5f3b96
--- /dev/null
+++ b/asv_bench/benchmarks/io/sql.py
@@ -0,0 +1,132 @@
+import sqlite3
+
+import numpy as np
+import pandas.util.testing as tm
+from pandas import DataFrame, date_range, read_sql_query, read_sql_table
+from sqlalchemy import create_engine
+
+from ..pandas_vb_common import setup  # noqa
+
+
+class SQL(object):
+
+    goal_time = 0.2
+    params = ['sqlalchemy', 'sqlite']
+    param_names = ['connection']
+
+    def setup(self, connection):
+        N = 10000
+        con = {'sqlalchemy': create_engine('sqlite:///:memory:'),
+               'sqlite': sqlite3.connect(':memory:')}
+        self.table_name = 'test_type'
+        self.query_all = 'SELECT * FROM {}'.format(self.table_name)
+        self.con = con[connection]
+        self.df = DataFrame({'float': np.random.randn(N),
+                             'float_with_nan': np.random.randn(N),
+                             'string': ['foo'] * N,
+                             'bool': [True] * N,
+                             'int': np.random.randint(0, N, size=N),
+                             'datetime': date_range('2000-01-01',
+                                                    periods=N,
+                                                    freq='s')},
+                            index=tm.makeStringIndex(N))
+        self.df.loc[1000:3000, 'float_with_nan'] = np.nan
+        self.df['datetime_string'] = self.df['datetime'].astype(str)
+        self.df.to_sql(self.table_name, self.con, if_exists='replace')
+
+    def time_to_sql_dataframe(self, connection):
+        self.df.to_sql('test1', self.con, if_exists='replace')
+
+    def time_read_sql_query(self, connection):
+        read_sql_query(self.query_all, self.con)
+
+
+class WriteSQLDtypes(object):
+
+    goal_time = 0.2
+    params = (['sqlalchemy', 'sqlite'],
+              ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime'])
+    param_names = ['connection', 'dtype']
+
+    def setup(self, connection, dtype):
+        N = 10000
+        con = {'sqlalchemy': create_engine('sqlite:///:memory:'),
+               'sqlite': sqlite3.connect(':memory:')}
+        self.table_name = 'test_type'
+        self.query_col = 'SELECT {} FROM {}'.format(dtype, self.table_name)
+        self.con = con[connection]
+        self.df = DataFrame({'float': np.random.randn(N),
+                             'float_with_nan': np.random.randn(N),
+                             'string': ['foo'] * N,
+                             'bool': [True] * N,
+                             'int': np.random.randint(0, N, size=N),
+                             'datetime': date_range('2000-01-01',
+                                                    periods=N,
+                                                    freq='s')},
+                            index=tm.makeStringIndex(N))
+        self.df.loc[1000:3000, 'float_with_nan'] = np.nan
+        self.df['datetime_string'] = self.df['datetime'].astype(str)
+        self.df.to_sql(self.table_name, self.con, if_exists='replace')
+
+    def time_to_sql_dataframe_column(self, connection, dtype):
+        self.df[[dtype]].to_sql('test1', self.con, if_exists='replace')
+
+    def time_read_sql_query_select_column(self, connection, dtype):
+        read_sql_query(self.query_col, self.con)
+
+
+class ReadSQLTable(object):
+
+    goal_time = 0.2
+
+    def setup(self):
+        N = 10000
+        self.table_name = 'test'
+        self.con = create_engine('sqlite:///:memory:')
+        self.df = DataFrame({'float': np.random.randn(N),
+                             'float_with_nan': np.random.randn(N),
+                             'string': ['foo'] * N,
+                             'bool': [True] * N,
+                             'int': np.random.randint(0, N, size=N),
+                             'datetime': date_range('2000-01-01',
+                                                    periods=N,
+                                                    freq='s')},
+                            index=tm.makeStringIndex(N))
+        self.df.loc[1000:3000, 'float_with_nan'] = np.nan
+        self.df['datetime_string'] = self.df['datetime'].astype(str)
+        self.df.to_sql(self.table_name, self.con, if_exists='replace')
+
+    def time_read_sql_table_all(self):
+        read_sql_table(self.table_name, self.con)
+
+    def time_read_sql_table_parse_dates(self):
+        read_sql_table(self.table_name, self.con, 
columns=['datetime_string'], + parse_dates=['datetime_string']) + + +class ReadSQLTableDtypes(object): + + goal_time = 0.2 + + params = ['float', 'float_with_nan', 'string', 'bool', 'int', 'datetime'] + param_names = ['dtype'] + + def setup(self, dtype): + N = 10000 + self.table_name = 'test' + self.con = create_engine('sqlite:///:memory:') + self.df = DataFrame({'float': np.random.randn(N), + 'float_with_nan': np.random.randn(N), + 'string': ['foo'] * N, + 'bool': [True] * N, + 'int': np.random.randint(0, N, size=N), + 'datetime': date_range('2000-01-01', + periods=N, + freq='s')}, + index=tm.makeStringIndex(N)) + self.df.loc[1000:3000, 'float_with_nan'] = np.nan + self.df['datetime_string'] = self.df['datetime'].astype(str) + self.df.to_sql(self.table_name, self.con, if_exists='replace') + + def time_read_sql_table_column(self, dtype): + read_sql_table(self.table_name, self.con, columns=[dtype]) diff --git a/asv_bench/benchmarks/io/stata.py b/asv_bench/benchmarks/io/stata.py new file mode 100644 index 00000000000000..e0f5752ca930ff --- /dev/null +++ b/asv_bench/benchmarks/io/stata.py @@ -0,0 +1,37 @@ +import numpy as np +from pandas import DataFrame, date_range, read_stata +import pandas.util.testing as tm + +from ..pandas_vb_common import BaseIO, setup # noqa + + +class Stata(BaseIO): + + goal_time = 0.2 + params = ['tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'] + param_names = ['convert_dates'] + + def setup(self, convert_dates): + self.fname = '__test__.dta' + N = 100000 + C = 5 + self.df = DataFrame(np.random.randn(N, C), + columns=['float{}'.format(i) for i in range(C)], + index=date_range('20000101', periods=N, freq='H')) + self.df['object'] = tm.makeStringIndex(N) + self.df['int8_'] = np.random.randint(np.iinfo(np.int8).min, + np.iinfo(np.int8).max - 27, N) + self.df['int16_'] = np.random.randint(np.iinfo(np.int16).min, + np.iinfo(np.int16).max - 27, N) + self.df['int32_'] = np.random.randint(np.iinfo(np.int32).min, + np.iinfo(np.int32).max - 27, N) + self.df['float32_'] = np.array(np.random.randn(N), + dtype=np.float32) + self.convert_dates = {'index': convert_dates} + self.df.to_stata(self.fname, self.convert_dates) + + def time_read_stata(self, convert_dates): + read_stata(self.fname) + + def time_write_stata(self, convert_dates): + self.df.to_stata(self.fname, self.convert_dates) diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py deleted file mode 100644 index 93273955a29b9f..00000000000000 --- a/asv_bench/benchmarks/io_bench.py +++ /dev/null @@ -1,224 +0,0 @@ -import os -from .pandas_vb_common import * -from pandas import concat, Timestamp, compat -try: - from StringIO import StringIO -except ImportError: - from io import StringIO -import timeit - - -class frame_to_csv(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(3000, 30)) - - def time_frame_to_csv(self): - self.df.to_csv('__test__.csv') - - -class frame_to_csv2(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame({'A': range(50000), }) - self.df['B'] = (self.df.A + 1.0) - self.df['C'] = (self.df.A + 2.0) - self.df['D'] = (self.df.A + 3.0) - - def time_frame_to_csv2(self): - self.df.to_csv('__test__.csv') - - -class frame_to_csv_date_formatting(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=1000) - self.data = DataFrame(self.rng, index=self.rng) - - def time_frame_to_csv_date_formatting(self): - self.data.to_csv('__test__.csv', date_format='%Y%m%d') - - -class frame_to_csv_mixed(object): - goal_time 
= 0.2 - - def setup(self): - self.df_float = DataFrame(np.random.randn(5000, 5), dtype='float64', columns=self.create_cols('float')) - self.df_int = DataFrame(np.random.randn(5000, 5), dtype='int64', columns=self.create_cols('int')) - self.df_bool = DataFrame(True, index=self.df_float.index, columns=self.create_cols('bool')) - self.df_object = DataFrame('foo', index=self.df_float.index, columns=self.create_cols('object')) - self.df_dt = DataFrame(Timestamp('20010101'), index=self.df_float.index, columns=self.create_cols('date')) - self.df_float.ix[30:500, 1:3] = np.nan - self.df = concat([self.df_float, self.df_int, self.df_bool, self.df_object, self.df_dt], axis=1) - - def time_frame_to_csv_mixed(self): - self.df.to_csv('__test__.csv') - - def create_cols(self, name): - return [('%s%03d' % (name, i)) for i in range(5)] - - -class read_csv_infer_datetime_format_custom(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=1000) - self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%m/%d/%Y %H:%M:%S.%f')))) - - def time_read_csv_infer_datetime_format_custom(self): - read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True) - - -class read_csv_infer_datetime_format_iso8601(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=1000) - self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y-%m-%d %H:%M:%S')))) - - def time_read_csv_infer_datetime_format_iso8601(self): - read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True) - - -class read_csv_infer_datetime_format_ymd(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=1000) - self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y%m%d')))) - - def time_read_csv_infer_datetime_format_ymd(self): - read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'], infer_datetime_format=True) - - -class read_csv_skiprows(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(20000) - self.df = DataFrame({'float1': randn(20000), 'float2': randn(20000), 'string1': (['foo'] * 20000), 'bool1': ([True] * 20000), 'int1': np.random.randint(0, 200000, size=20000), }, index=self.index) - self.df.to_csv('__test__.csv') - - def time_read_csv_skiprows(self): - read_csv('__test__.csv', skiprows=10000) - - -class read_csv_standard(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - self.df.to_csv('__test__.csv') - - def time_read_csv_standard(self): - read_csv('__test__.csv') - - -class read_parse_dates_iso8601(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range('1/1/2000', periods=1000) - self.data = '\n'.join(self.rng.map((lambda x: x.strftime('%Y-%m-%d %H:%M:%S')))) - - def time_read_parse_dates_iso8601(self): - read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo']) - - -class read_uint64_integers(object): - goal_time = 0.2 - - def setup(self): - self.na_values = [2**63 + 500] - - self.arr1 = np.arange(10000).astype('uint64') + 2**63 - self.data1 = '\n'.join(map(lambda x: str(x), self.arr1)) - - self.arr2 = self.arr1.copy().astype(object) - self.arr2[500] = -1 - self.data2 = '\n'.join(map(lambda x: str(x), 
self.arr2)) - - def time_read_uint64(self): - read_csv(StringIO(self.data1), header=None) - - def time_read_uint64_neg_values(self): - read_csv(StringIO(self.data2), header=None) - - def time_read_uint64_na_values(self): - read_csv(StringIO(self.data1), header=None, na_values=self.na_values) - - -class write_csv_standard(object): - goal_time = 0.2 - - def setup(self): - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - - def time_write_csv_standard(self): - self.df.to_csv('__test__.csv') - - -class read_csv_from_s3(object): - # Make sure that we can read part of a file from S3 without - # needing to download the entire thing. Use the timeit.default_timer - # to measure wall time instead of CPU time -- we want to see - # how long it takes to download the data. - timer = timeit.default_timer - params = ([None, "gzip", "bz2"], ["python", "c"]) - param_names = ["compression", "engine"] - - def setup(self, compression, engine): - if compression == "bz2" and engine == "c" and compat.PY2: - # The Python 2 C parser can't read bz2 from open files. - raise NotImplementedError - try: - import s3fs - except ImportError: - # Skip these benchmarks if `boto` is not installed. - raise NotImplementedError - - self.big_fname = "s3://pandas-test/large_random.csv" - - def time_read_nrows(self, compression, engine): - # Read a small number of rows from a huge (100,000 x 50) table. - ext = "" - if compression == "gzip": - ext = ".gz" - elif compression == "bz2": - ext = ".bz2" - pd.read_csv(self.big_fname + ext, nrows=10, - compression=compression, engine=engine) - - -class read_json_lines(object): - goal_time = 0.2 - fname = "__test__.json" - - def setup(self): - self.N = 100000 - self.C = 5 - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)])) - self.df.to_json(self.fname,orient="records",lines=True) - - def teardown(self): - try: - os.remove(self.fname) - except: - pass - - def time_read_json_lines(self): - pd.read_json(self.fname, lines=True) - - def time_read_json_lines_chunk(self): - pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4)) - - def peakmem_read_json_lines(self): - pd.read_json(self.fname, lines=True) - - def peakmem_read_json_lines_chunk(self): - pd.concat(pd.read_json(self.fname, lines=True, chunksize=self.N//4)) diff --git a/asv_bench/benchmarks/io_sql.py b/asv_bench/benchmarks/io_sql.py deleted file mode 100644 index ec855e5d33525e..00000000000000 --- a/asv_bench/benchmarks/io_sql.py +++ /dev/null @@ -1,105 +0,0 @@ -import sqlalchemy -from .pandas_vb_common import * -import sqlite3 -from sqlalchemy import create_engine - - -#------------------------------------------------------------------------------- -# to_sql - -class WriteSQL(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - - def time_fallback(self): - self.df.to_sql('test1', self.con, if_exists='replace') - - def time_sqlalchemy(self): - self.df.to_sql('test1', self.engine, if_exists='replace') - - 
-#------------------------------------------------------------------------------- -# read_sql - -class ReadSQL(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.index = tm.makeStringIndex(10000) - self.df = DataFrame({'float1': randn(10000), 'float2': randn(10000), 'string1': (['foo'] * 10000), 'bool1': ([True] * 10000), 'int1': np.random.randint(0, 100000, size=10000), }, index=self.index) - self.df.to_sql('test2', self.engine, if_exists='replace') - self.df.to_sql('test2', self.con, if_exists='replace') - - def time_read_query_fallback(self): - read_sql_query('SELECT * FROM test2', self.con) - - def time_read_query_sqlalchemy(self): - read_sql_query('SELECT * FROM test2', self.engine) - - def time_read_table_sqlalchemy(self): - read_sql_table('test2', self.engine) - - -#------------------------------------------------------------------------------- -# type specific write - -class WriteSQLTypes(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'string': (['foo'] * 10000), 'bool': ([True] * 10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df.loc[1000:3000, 'float'] = np.nan - - def time_string_fallback(self): - self.df[['string']].to_sql('test_string', self.con, if_exists='replace') - - def time_string_sqlalchemy(self): - self.df[['string']].to_sql('test_string', self.engine, if_exists='replace') - - def time_float_fallback(self): - self.df[['float']].to_sql('test_float', self.con, if_exists='replace') - - def time_float_sqlalchemy(self): - self.df[['float']].to_sql('test_float', self.engine, if_exists='replace') - - def time_datetime_sqlalchemy(self): - self.df[['datetime']].to_sql('test_datetime', self.engine, if_exists='replace') - - -#------------------------------------------------------------------------------- -# type specific read - -class ReadSQLTypes(object): - goal_time = 0.2 - - def setup(self): - self.engine = create_engine('sqlite:///:memory:') - self.con = sqlite3.connect(':memory:') - self.df = DataFrame({'float': randn(10000), 'datetime': date_range('2000-01-01', periods=10000, freq='s'), }) - self.df['datetime_string'] = self.df['datetime'].map(str) - self.df.to_sql('test_type', self.engine, if_exists='replace') - self.df[['float', 'datetime_string']].to_sql('test_type', self.con, if_exists='replace') - - def time_datetime_read_and_parse_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['datetime_string'], parse_dates=['datetime_string']) - - def time_datetime_read_as_native_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['datetime']) - - def time_float_read_query_fallback(self): - read_sql_query('SELECT float FROM test_type', self.con) - - def time_float_read_query_sqlalchemy(self): - read_sql_query('SELECT float FROM test_type', self.engine) - - def time_float_read_table_sqlalchemy(self): - read_sql_table('test_type', self.engine, columns=['float']) diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 3b0e33b72ddc10..de0a3b33da1474 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -1,20 +1,25 @@ -from .pandas_vb_common import * +import warnings +import string +import numpy as np +import pandas.util.testing as tm +from pandas import (DataFrame, Series, MultiIndex, date_range, concat, 
merge, + merge_asof) try: from pandas import merge_ordered except ImportError: from pandas import ordered_merge as merge_ordered +from .pandas_vb_common import Panel, setup # noqa -# ---------------------------------------------------------------------- -# Append class Append(object): + goal_time = 0.2 def setup(self): - self.df1 = pd.DataFrame(np.random.randn(10000, 4), - columns=['A', 'B', 'C', 'D']) + self.df1 = DataFrame(np.random.randn(10000, 4), + columns=['A', 'B', 'C', 'D']) self.df2 = self.df1.copy() self.df2.index = np.arange(10000, 20000) self.mdf1 = self.df1.copy() @@ -22,7 +27,8 @@ def setup(self): self.mdf1['obj2'] = 'bar' self.mdf1['int1'] = 5 try: - self.mdf1.consolidate(inplace=True) + with warnings.catch_warnings(record=True): + self.mdf1.consolidate(inplace=True) except: pass self.mdf2 = self.mdf1.copy() @@ -35,237 +41,228 @@ def time_append_mixed(self): self.mdf1.append(self.mdf2) -# ---------------------------------------------------------------------- -# Concat - class Concat(object): - goal_time = 0.2 - def setup(self): - self.n = 1000 - self.indices = tm.makeStringIndex(1000) - self.s = Series(self.n, index=self.indices) - self.pieces = [self.s[i:(- i)] for i in range(1, 10)] - self.pieces = (self.pieces * 50) - - self.df_small = pd.DataFrame(randn(5, 4)) + goal_time = 0.2 + params = [0, 1] + param_names = ['axis'] - # empty - self.df = pd.DataFrame(dict(A=range(10000)), index=date_range('20130101', periods=10000, freq='s')) - self.empty = pd.DataFrame() + def setup(self, axis): + N = 1000 + s = Series(N, index=tm.makeStringIndex(N)) + self.series = [s[i:- i] for i in range(1, 10)] * 50 + self.small_frames = [DataFrame(np.random.randn(5, 4))] * 1000 + df = DataFrame({'A': range(N)}, + index=date_range('20130101', periods=N, freq='s')) + self.empty_left = [DataFrame(), df] + self.empty_right = [df, DataFrame()] - def time_concat_series_axis1(self): - concat(self.pieces, axis=1) + def time_concat_series(self, axis): + concat(self.series, axis=axis) - def time_concat_small_frames(self): - concat(([self.df_small] * 1000)) + def time_concat_small_frames(self, axis): + concat(self.small_frames, axis=axis) - def time_concat_empty_frames1(self): - concat([self.df, self.empty]) + def time_concat_empty_right(self, axis): + concat(self.empty_right, axis=axis) - def time_concat_empty_frames2(self): - concat([self.empty, self.df]) + def time_concat_empty_left(self, axis): + concat(self.empty_left, axis=axis) class ConcatPanels(object): - goal_time = 0.2 - - def setup(self): - dataset = np.zeros((10000, 200, 2), dtype=np.float32) - self.panels_f = [pd.Panel(np.copy(dataset, order='F')) - for i in range(20)] - self.panels_c = [pd.Panel(np.copy(dataset, order='C')) - for i in range(20)] - def time_c_ordered_axis0(self): - concat(self.panels_c, axis=0, ignore_index=True) - - def time_f_ordered_axis0(self): - concat(self.panels_f, axis=0, ignore_index=True) + goal_time = 0.2 + params = ([0, 1, 2], [True, False]) + param_names = ['axis', 'ignore_index'] - def time_c_ordered_axis1(self): - concat(self.panels_c, axis=1, ignore_index=True) + def setup(self, axis, ignore_index): + with warnings.catch_warnings(record=True): + panel_c = Panel(np.zeros((10000, 200, 2), + dtype=np.float32, + order='C')) + self.panels_c = [panel_c] * 20 + panel_f = Panel(np.zeros((10000, 200, 2), + dtype=np.float32, + order='F')) + self.panels_f = [panel_f] * 20 - def time_f_ordered_axis1(self): - concat(self.panels_f, axis=1, ignore_index=True) + def time_c_ordered(self, axis, ignore_index): + with 
warnings.catch_warnings(record=True): + concat(self.panels_c, axis=axis, ignore_index=ignore_index) - def time_c_ordered_axis2(self): - concat(self.panels_c, axis=2, ignore_index=True) + def time_f_ordered(self, axis, ignore_index): + with warnings.catch_warnings(record=True): + concat(self.panels_f, axis=axis, ignore_index=ignore_index) - def time_f_ordered_axis2(self): - concat(self.panels_f, axis=2, ignore_index=True) +class ConcatDataFrames(object): -class ConcatFrames(object): goal_time = 0.2 + params = ([0, 1], [True, False]) + param_names = ['axis', 'ignore_index'] - def setup(self): - dataset = np.zeros((10000, 200), dtype=np.float32) - - self.frames_f = [pd.DataFrame(np.copy(dataset, order='F')) - for i in range(20)] - self.frames_c = [pd.DataFrame(np.copy(dataset, order='C')) - for i in range(20)] - - def time_c_ordered_axis0(self): - concat(self.frames_c, axis=0, ignore_index=True) - - def time_f_ordered_axis0(self): - concat(self.frames_f, axis=0, ignore_index=True) + def setup(self, axis, ignore_index): + frame_c = DataFrame(np.zeros((10000, 200), + dtype=np.float32, order='C')) + self.frame_c = [frame_c] * 20 + frame_f = DataFrame(np.zeros((10000, 200), + dtype=np.float32, order='F')) + self.frame_f = [frame_f] * 20 - def time_c_ordered_axis1(self): - concat(self.frames_c, axis=1, ignore_index=True) + def time_c_ordered(self, axis, ignore_index): + concat(self.frame_c, axis=axis, ignore_index=ignore_index) - def time_f_ordered_axis1(self): - concat(self.frames_f, axis=1, ignore_index=True) + def time_f_ordered(self, axis, ignore_index): + concat(self.frame_f, axis=axis, ignore_index=ignore_index) -# ---------------------------------------------------------------------- -# Joins - class Join(object): - goal_time = 0.2 - - def setup(self): - self.level1 = tm.makeStringIndex(10).values - self.level2 = tm.makeStringIndex(1000).values - self.label1 = np.arange(10).repeat(1000) - self.label2 = np.tile(np.arange(1000), 10) - self.key1 = np.tile(self.level1.take(self.label1), 10) - self.key2 = np.tile(self.level2.take(self.label2), 10) - self.shuf = np.arange(100000) - random.shuffle(self.shuf) - try: - self.index2 = MultiIndex(levels=[self.level1, self.level2], - labels=[self.label1, self.label2]) - self.index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], - labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - self.df_multi = DataFrame(np.random.randn(len(self.index2), 4), - index=self.index2, - columns=['A', 'B', 'C', 'D']) - except: - pass - self.df = pd.DataFrame({'data1': np.random.randn(100000), - 'data2': np.random.randn(100000), - 'key1': self.key1, - 'key2': self.key2}) - self.df_key1 = pd.DataFrame(np.random.randn(len(self.level1), 4), - index=self.level1, - columns=['A', 'B', 'C', 'D']) - self.df_key2 = pd.DataFrame(np.random.randn(len(self.level2), 4), - index=self.level2, - columns=['A', 'B', 'C', 'D']) - self.df_shuf = self.df.reindex(self.df.index[self.shuf]) - - def time_join_dataframe_index_multi(self): - self.df.join(self.df_multi, on=['key1', 'key2']) - - def time_join_dataframe_index_single_key_bigger(self): - self.df.join(self.df_key2, on='key2') - def time_join_dataframe_index_single_key_bigger_sort(self): - self.df_shuf.join(self.df_key2, on='key2', sort=True) - - def time_join_dataframe_index_single_key_small(self): - self.df.join(self.df_key1, on='key1') + goal_time = 0.2 + params = [True, False] + param_names = ['sort'] + + def setup(self, sort): + level1 = 
tm.makeStringIndex(10).values + level2 = tm.makeStringIndex(1000).values + label1 = np.arange(10).repeat(1000) + label2 = np.tile(np.arange(1000), 10) + index2 = MultiIndex(levels=[level1, level2], + labels=[label1, label2]) + self.df_multi = DataFrame(np.random.randn(len(index2), 4), + index=index2, + columns=['A', 'B', 'C', 'D']) + + self.key1 = np.tile(level1.take(label1), 10) + self.key2 = np.tile(level2.take(label2), 10) + self.df = DataFrame({'data1': np.random.randn(100000), + 'data2': np.random.randn(100000), + 'key1': self.key1, + 'key2': self.key2}) + + self.df_key1 = DataFrame(np.random.randn(len(level1), 4), + index=level1, + columns=['A', 'B', 'C', 'D']) + self.df_key2 = DataFrame(np.random.randn(len(level2), 4), + index=level2, + columns=['A', 'B', 'C', 'D']) + + shuf = np.arange(100000) + np.random.shuffle(shuf) + self.df_shuf = self.df.reindex(self.df.index[shuf]) + + def time_join_dataframe_index_multi(self, sort): + self.df.join(self.df_multi, on=['key1', 'key2'], sort=sort) + + def time_join_dataframe_index_single_key_bigger(self, sort): + self.df.join(self.df_key2, on='key2', sort=sort) + + def time_join_dataframe_index_single_key_small(self, sort): + self.df.join(self.df_key1, on='key1', sort=sort) + + def time_join_dataframe_index_shuffle_key_bigger_sort(self, sort): + self.df_shuf.join(self.df_key2, on='key2', sort=sort) class JoinIndex(object): + goal_time = 0.2 def setup(self): - np.random.seed(2718281) - self.n = 50000 - self.left = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jim', 'joe']) - self.right = pd.DataFrame(np.random.randint(1, (self.n / 500), (self.n, 2)), columns=['jolie', 'jolia']).set_index('jolie') + N = 50000 + self.left = DataFrame(np.random.randint(1, N / 500, (N, 2)), + columns=['jim', 'joe']) + self.right = DataFrame(np.random.randint(1, N / 500, (N, 2)), + columns=['jolie', 'jolia']).set_index('jolie') def time_left_outer_join_index(self): self.left.join(self.right, on='jim') -class join_non_unique_equal(object): +class JoinNonUnique(object): # outer join of non-unique # GH 6329 - goal_time = 0.2 def setup(self): - self.date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T') - self.daily_dates = self.date_index.to_period('D').to_timestamp('S', 'S') - self.fracofday = (self.date_index.view(np.ndarray) - self.daily_dates.view(np.ndarray)) - self.fracofday = (self.fracofday.astype('timedelta64[ns]').astype(np.float64) / 86400000000000.0) - self.fracofday = Series(self.fracofday, self.daily_dates) - self.index = date_range(self.date_index.min().to_period('A').to_timestamp('D', 'S'), self.date_index.max().to_period('A').to_timestamp('D', 'E'), freq='D') - self.temp = Series(1.0, self.index) + date_index = date_range('01-Jan-2013', '23-Jan-2013', freq='T') + daily_dates = date_index.to_period('D').to_timestamp('S', 'S') + self.fracofday = date_index.values - daily_dates.values + self.fracofday = self.fracofday.astype('timedelta64[ns]') + self.fracofday = self.fracofday.astype(np.float64) / 86400000000000.0 + self.fracofday = Series(self.fracofday, daily_dates) + index = date_range(date_index.min(), date_index.max(), freq='D') + self.temp = Series(1.0, index)[self.fracofday.index] def time_join_non_unique_equal(self): - (self.fracofday * self.temp[self.fracofday.index]) - + self.fracofday * self.temp -# ---------------------------------------------------------------------- -# Merges class Merge(object): - goal_time = 0.2 - def setup(self): - self.N = 10000 - self.indices = tm.makeStringIndex(self.N).values - 
self.indices2 = tm.makeStringIndex(self.N).values - self.key = np.tile(self.indices[:8000], 10) - self.key2 = np.tile(self.indices2[:8000], 10) - self.left = pd.DataFrame({'key': self.key, 'key2': self.key2, - 'value': np.random.randn(80000)}) - self.right = pd.DataFrame({'key': self.indices[2000:], - 'key2': self.indices2[2000:], - 'value2': np.random.randn(8000)}) - - self.df = pd.DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), - 'key2': np.tile(np.arange(250).repeat(10), 4), - 'value': np.random.randn(10000)}) - self.df2 = pd.DataFrame({'key1': np.arange(500), 'value2': randn(500)}) + goal_time = 0.2 + params = [True, False] + param_names = ['sort'] + + def setup(self, sort): + N = 10000 + indices = tm.makeStringIndex(N).values + indices2 = tm.makeStringIndex(N).values + key = np.tile(indices[:8000], 10) + key2 = np.tile(indices2[:8000], 10) + self.left = DataFrame({'key': key, 'key2': key2, + 'value': np.random.randn(80000)}) + self.right = DataFrame({'key': indices[2000:], + 'key2': indices2[2000:], + 'value2': np.random.randn(8000)}) + + self.df = DataFrame({'key1': np.tile(np.arange(500).repeat(10), 2), + 'key2': np.tile(np.arange(250).repeat(10), 4), + 'value': np.random.randn(10000)}) + self.df2 = DataFrame({'key1': np.arange(500), + 'value2': np.random.randn(500)}) self.df3 = self.df[:5000] - def time_merge_2intkey_nosort(self): - merge(self.left, self.right, sort=False) + def time_merge_2intkey(self, sort): + merge(self.left, self.right, sort=sort) - def time_merge_2intkey_sort(self): - merge(self.left, self.right, sort=True) + def time_merge_dataframe_integer_2key(self, sort): + merge(self.df, self.df3, sort=sort) - def time_merge_dataframe_integer_2key(self): - merge(self.df, self.df3) + def time_merge_dataframe_integer_key(self, sort): + merge(self.df, self.df2, on='key1', sort=sort) - def time_merge_dataframe_integer_key(self): - merge(self.df, self.df2, on='key1') +class I8Merge(object): -class i8merge(object): goal_time = 0.2 + params = ['inner', 'outer', 'left', 'right'] + param_names = ['how'] - def setup(self): - (low, high, n) = (((-1) << 10), (1 << 10), (1 << 20)) - self.left = pd.DataFrame(np.random.randint(low, high, (n, 7)), - columns=list('ABCDEFG')) + def setup(self, how): + low, high, n = -1000, 1000, 10**6 + self.left = DataFrame(np.random.randint(low, high, (n, 7)), + columns=list('ABCDEFG')) self.left['left'] = self.left.sum(axis=1) - self.i = np.random.permutation(len(self.left)) - self.right = self.left.iloc[self.i].copy() - self.right.columns = (self.right.columns[:(-1)].tolist() + ['right']) - self.right.index = np.arange(len(self.right)) - self.right['right'] *= (-1) + self.right = self.left.sample(frac=1).rename({'left': 'right'}, axis=1) + self.right = self.right.reset_index(drop=True) + self.right['right'] *= -1 - def time_i8merge(self): - merge(self.left, self.right, how='outer') + def time_i8merge(self, how): + merge(self.left, self.right, how=how) class MergeCategoricals(object): + goal_time = 0.2 def setup(self): - self.left_object = pd.DataFrame( + self.left_object = DataFrame( {'X': np.random.choice(range(0, 10), size=(10000,)), 'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))}) - self.right_object = pd.DataFrame( + self.right_object = DataFrame( {'X': np.random.choice(range(0, 10), size=(10000,)), 'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))}) @@ -281,103 +278,85 @@ def time_merge_cat(self): merge(self.left_cat, self.right_cat, on='X') -# 
----------------------------------------------------------------------
-# Ordered merge
-
 class MergeOrdered(object):

     def setup(self):
-        groups = tm.makeStringIndex(10).values
-
-        self.left = pd.DataFrame({'group': groups.repeat(5000),
-                              'key' : np.tile(np.arange(0, 10000, 2), 10),
-                              'lvalue': np.random.randn(50000)})
-
-        self.right = pd.DataFrame({'key' : np.arange(10000),
-                               'rvalue' : np.random.randn(10000)})
+        groups = tm.makeStringIndex(10).values
+        self.left = DataFrame({'group': groups.repeat(5000),
+                               'key': np.tile(np.arange(0, 10000, 2), 10),
+                               'lvalue': np.random.randn(50000)})
+        self.right = DataFrame({'key': np.arange(10000),
+                                'rvalue': np.random.randn(10000)})

     def time_merge_ordered(self):
         merge_ordered(self.left, self.right, on='key', left_by='group')


-# ----------------------------------------------------------------------
-# asof merge
-
 class MergeAsof(object):

     def setup(self):
-        import string
-        np.random.seed(0)
         one_count = 200000
         two_count = 1000000

-        self.df1 = pd.DataFrame(
+        df1 = DataFrame(
             {'time': np.random.randint(0, one_count / 20, one_count),
              'key': np.random.choice(list(string.ascii_uppercase), one_count),
              'key2': np.random.randint(0, 25, one_count),
              'value1': np.random.randn(one_count)})
-        self.df2 = pd.DataFrame(
+        df2 = DataFrame(
             {'time': np.random.randint(0, two_count / 20, two_count),
              'key': np.random.choice(list(string.ascii_uppercase), two_count),
              'key2': np.random.randint(0, 25, two_count),
              'value2': np.random.randn(two_count)})

-        self.df1 = self.df1.sort_values('time')
-        self.df2 = self.df2.sort_values('time')
+        df1 = df1.sort_values('time')
+        df2 = df2.sort_values('time')

-        self.df1['time32'] = np.int32(self.df1.time)
-        self.df2['time32'] = np.int32(self.df2.time)
+        df1['time32'] = np.int32(df1.time)
+        df2['time32'] = np.int32(df2.time)

-        self.df1a = self.df1[['time', 'value1']]
-        self.df2a = self.df2[['time', 'value2']]
-        self.df1b = self.df1[['time', 'key', 'value1']]
-        self.df2b = self.df2[['time', 'key', 'value2']]
-        self.df1c = self.df1[['time', 'key2', 'value1']]
-        self.df2c = self.df2[['time', 'key2', 'value2']]
-        self.df1d = self.df1[['time32', 'value1']]
-        self.df2d = self.df2[['time32', 'value2']]
-        self.df1e = self.df1[['time', 'key', 'key2', 'value1']]
-        self.df2e = self.df2[['time', 'key', 'key2', 'value2']]
+        self.df1a = df1[['time', 'value1']]
+        self.df2a = df2[['time', 'value2']]
+        self.df1b = df1[['time', 'key', 'value1']]
+        self.df2b = df2[['time', 'key', 'value2']]
+        self.df1c = df1[['time', 'key2', 'value1']]
+        self.df2c = df2[['time', 'key2', 'value2']]
+        self.df1d = df1[['time32', 'value1']]
+        self.df2d = df2[['time32', 'value2']]
+        self.df1e = df1[['time', 'key', 'key2', 'value1']]
+        self.df2e = df2[['time', 'key', 'key2', 'value2']]

-    def time_noby(self):
+    def time_on_int(self):
         merge_asof(self.df1a, self.df2a, on='time')

+    def time_on_int32(self):
+        merge_asof(self.df1d, self.df2d, on='time32')
+
     def time_by_object(self):
         merge_asof(self.df1b, self.df2b, on='time', by='key')

     def time_by_int(self):
         merge_asof(self.df1c, self.df2c, on='time', by='key2')

-    def time_on_int32(self):
-        merge_asof(self.df1d, self.df2d, on='time32')
-
     def time_multiby(self):
         merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'])


-# ----------------------------------------------------------------------
-# data alignment
-
 class Align(object):
+
     goal_time = 0.2

     def setup(self):
-        self.n = 1000000
-        self.sz = 500000
-        self.rng = np.arange(0, 10000000000000, 10000000)
-        self.stamps = (np.datetime64(datetime.now()).view('i8') + self.rng)
-        self.idx1 = np.sort(self.sample(self.stamps, 
self.sz)) - self.idx2 = np.sort(self.sample(self.stamps, self.sz)) - self.ts1 = Series(np.random.randn(self.sz), self.idx1) - self.ts2 = Series(np.random.randn(self.sz), self.idx2) - - def sample(self, values, k): - self.sampler = np.random.permutation(len(values)) - return values.take(self.sampler[:k]) + size = 5 * 10**5 + rng = np.arange(0, 10**13, 10**7) + stamps = np.datetime64('now').view('i8') + rng + idx1 = np.sort(np.random.choice(stamps, size, replace=False)) + idx2 = np.sort(np.random.choice(stamps, size, replace=False)) + self.ts1 = Series(np.random.randn(size), idx1) + self.ts2 = Series(np.random.randn(size), idx2) def time_series_align_int64_index(self): - (self.ts1 + self.ts2) + self.ts1 + self.ts2 def time_series_align_left_monotonic(self): self.ts1.align(self.ts2, join='left') diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py new file mode 100644 index 00000000000000..0c92214795557f --- /dev/null +++ b/asv_bench/benchmarks/multiindex_object.py @@ -0,0 +1,140 @@ +import string + +import numpy as np +import pandas.util.testing as tm +from pandas import date_range, MultiIndex + +from .pandas_vb_common import setup # noqa + + +class GetLoc(object): + + goal_time = 0.2 + + def setup(self): + self.mi_large = MultiIndex.from_product( + [np.arange(1000), np.arange(20), list(string.ascii_letters)], + names=['one', 'two', 'three']) + self.mi_med = MultiIndex.from_product( + [np.arange(1000), np.arange(10), list('A')], + names=['one', 'two', 'three']) + self.mi_small = MultiIndex.from_product( + [np.arange(100), list('A'), list('A')], + names=['one', 'two', 'three']) + + def time_large_get_loc(self): + self.mi_large.get_loc((999, 19, 'Z')) + + def time_large_get_loc_warm(self): + for _ in range(1000): + self.mi_large.get_loc((999, 19, 'Z')) + + def time_med_get_loc(self): + self.mi_med.get_loc((999, 9, 'A')) + + def time_med_get_loc_warm(self): + for _ in range(1000): + self.mi_med.get_loc((999, 9, 'A')) + + def time_string_get_loc(self): + self.mi_small.get_loc((99, 'A', 'A')) + + def time_small_get_loc_warm(self): + for _ in range(1000): + self.mi_small.get_loc((99, 'A', 'A')) + + +class Duplicates(object): + + goal_time = 0.2 + + def setup(self): + size = 65536 + arrays = [np.random.randint(0, 8192, size), + np.random.randint(0, 1024, size)] + mask = np.random.rand(size) < 0.1 + self.mi_unused_levels = MultiIndex.from_arrays(arrays) + self.mi_unused_levels = self.mi_unused_levels[mask] + + def time_remove_unused_levels(self): + self.mi_unused_levels.remove_unused_levels() + + +class Integer(object): + + goal_time = 0.2 + + def setup(self): + self.mi_int = MultiIndex.from_product([np.arange(1000), + np.arange(1000)], + names=['one', 'two']) + self.obj_index = np.array([(0, 10), (0, 11), (0, 12), + (0, 13), (0, 14), (0, 15), + (0, 16), (0, 17), (0, 18), + (0, 19)], dtype=object) + + def time_get_indexer(self): + self.mi_int.get_indexer(self.obj_index) + + def time_is_monotonic(self): + self.mi_int.is_monotonic + + +class Duplicated(object): + + goal_time = 0.2 + + def setup(self): + n, k = 200, 5000 + levels = [np.arange(n), + tm.makeStringIndex(n).values, + 1000 + np.arange(n)] + labels = [np.random.choice(n, (k * n)) for lev in levels] + self.mi = MultiIndex(levels=levels, labels=labels) + + def time_duplicated(self): + self.mi.duplicated() + + +class Sortlevel(object): + + goal_time = 0.2 + + def setup(self): + n = 1182720 + low, high = -4096, 4096 + arrs = [np.repeat(np.random.randint(low, high, (n // k)), k) + for k in [11, 7, 5, 
3, 1]]
+        self.mi_int = MultiIndex.from_arrays(arrs)[np.random.permutation(n)]
+
+        a = np.repeat(np.arange(100), 1000)
+        b = np.tile(np.arange(1000), 100)
+        self.mi = MultiIndex.from_arrays([a, b])
+        self.mi = self.mi.take(np.random.permutation(np.arange(100000)))
+
+    def time_sortlevel_int64(self):
+        self.mi_int.sortlevel()
+
+    def time_sortlevel_zero(self):
+        self.mi.sortlevel(0)
+
+    def time_sortlevel_one(self):
+        self.mi.sortlevel(1)
+
+
+class Values(object):
+
+    goal_time = 0.2
+
+    def setup_cache(self):
+
+        level1 = range(1000)
+        level2 = date_range(start='1/1/2012', periods=100)
+        mi = MultiIndex.from_product([level1, level2])
+        return mi
+
+    def time_datetime_level_values_copy(self, mi):
+        mi.copy().values
+
+    def time_datetime_level_values_sliced(self, mi):
+        mi[:10].values
diff --git a/asv_bench/benchmarks/offset.py b/asv_bench/benchmarks/offset.py
new file mode 100644
index 00000000000000..e161b887ee86f0
--- /dev/null
+++ b/asv_bench/benchmarks/offset.py
@@ -0,0 +1,124 @@
+# -*- coding: utf-8 -*-
+import warnings
+from datetime import datetime
+
+import numpy as np
+import pandas as pd
+try:
+    import pandas.tseries.holiday  # noqa
+except ImportError:
+    pass
+
+hcal = pd.tseries.holiday.USFederalHolidayCalendar()
+# These offsets currently raise a NotImplementedError with .apply_index()
+non_apply = [pd.offsets.Day(),
+             pd.offsets.BYearEnd(),
+             pd.offsets.BYearBegin(),
+             pd.offsets.BQuarterEnd(),
+             pd.offsets.BQuarterBegin(),
+             pd.offsets.BMonthEnd(),
+             pd.offsets.BMonthBegin(),
+             pd.offsets.CustomBusinessDay(),
+             pd.offsets.CustomBusinessDay(calendar=hcal),
+             pd.offsets.CustomBusinessMonthBegin(calendar=hcal),
+             pd.offsets.CustomBusinessMonthEnd(calendar=hcal)]
+other_offsets = [pd.offsets.YearEnd(), pd.offsets.YearBegin(),
+                 pd.offsets.QuarterEnd(), pd.offsets.QuarterBegin(),
+                 pd.offsets.MonthEnd(), pd.offsets.MonthBegin(),
+                 pd.offsets.DateOffset(months=2, days=2),
+                 pd.offsets.BusinessDay(), pd.offsets.SemiMonthEnd(),
+                 pd.offsets.SemiMonthBegin()]
+offsets = non_apply + other_offsets
+
+
+class ApplyIndex(object):
+
+    goal_time = 0.2
+
+    params = other_offsets
+    param_names = ['offset']
+
+    def setup(self, offset):
+        N = 10000
+        self.rng = pd.date_range(start='1/1/2000', periods=N, freq='T')
+
+    def time_apply_index(self, offset):
+        offset.apply_index(self.rng)
+
+
+class OnOffset(object):
+
+    goal_time = 0.2
+
+    params = offsets
+    param_names = ['offset']
+
+    def setup(self, offset):
+        self.dates = [datetime(2016, m, d)
+                      for m in [10, 11, 12]
+                      for d in [1, 2, 3, 28, 29, 30, 31]
+                      if not (m == 11 and d == 31)]
+
+    def time_on_offset(self, offset):
+        for date in self.dates:
+            offset.onOffset(date)
+
+
+class OffsetSeriesArithmetic(object):
+
+    goal_time = 0.2
+    params = offsets
+    param_names = ['offset']
+
+    def setup(self, offset):
+        N = 1000
+        rng = pd.date_range(start='1/1/2000', periods=N, freq='T')
+        self.data = pd.Series(rng)
+
+    def time_add_offset(self, offset):
+        with warnings.catch_warnings(record=True):
+            self.data + offset
+
+
+class OffsetDatetimeIndexArithmetic(object):
+
+    goal_time = 0.2
+    params = offsets
+    param_names = ['offset']
+
+    def setup(self, offset):
+        N = 1000
+        self.data = pd.date_range(start='1/1/2000', periods=N, freq='T')
+
+    def time_add_offset(self, offset):
+        with warnings.catch_warnings(record=True):
+            self.data + offset
+
+
+class OffsetDatetimeArithmetic(object):
+
+    goal_time = 0.2
+    params = offsets
+    param_names = ['offset']
+
+    def setup(self, offset):
+        self.date = datetime(2011, 
1, 1) + self.dt64 = np.datetime64('2011-01-01 09:00Z') + + def time_apply(self, offset): + offset.apply(self.date) + + def time_apply_np_dt64(self, offset): + offset.apply(self.dt64) + + def time_add(self, offset): + self.date + offset + + def time_add_10(self, offset): + self.date + (10 * offset) + + def time_subtract(self, offset): + self.date - offset + + def time_subtract_10(self, offset): + self.date - (10 * offset) diff --git a/asv_bench/benchmarks/packers.py b/asv_bench/benchmarks/packers.py deleted file mode 100644 index 24f80cc836dd45..00000000000000 --- a/asv_bench/benchmarks/packers.py +++ /dev/null @@ -1,318 +0,0 @@ -from .pandas_vb_common import * -from numpy.random import randint -import pandas as pd -from collections import OrderedDict -from pandas.compat import BytesIO -import sqlite3 -import os -from sqlalchemy import create_engine -import numpy as np -from random import randrange - -class _Packers(object): - goal_time = 0.2 - - def _setup(self): - self.f = '__test__.msg' - self.N = 100000 - self.C = 5 - self.index = date_range('20000101', periods=self.N, freq='H') - self.df = DataFrame(dict([('float{0}'.format(i), randn(self.N)) for i in range(self.C)]), index=self.index) - self.df2 = self.df.copy() - self.df2['object'] = [('%08x' % randrange((16 ** 8))) for _ in range(self.N)] - self.remove(self.f) - - def remove(self, f): - try: - os.remove(self.f) - except: - pass - -class Packers(_Packers): - goal_time = 0.2 - - def setup(self): - self._setup() - self.df.to_csv(self.f) - - def time_packers_read_csv(self): - pd.read_csv(self.f) - -class packers_read_excel(_Packers): - goal_time = 0.2 - - def setup(self): - self._setup() - self.bio = BytesIO() - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter') - self.df[:2000].to_excel(self.writer) - self.writer.save() - - def time_packers_read_excel(self): - self.bio.seek(0) - pd.read_excel(self.bio) - - -class packers_read_hdf_store(_Packers): - goal_time = 0.2 - - def setup(self): - self._setup() - self.df2.to_hdf(self.f, 'df') - - def time_packers_read_hdf_store(self): - pd.read_hdf(self.f, 'df') - - -class packers_read_hdf_table(_Packers): - - def setup(self): - self._setup() - self.df2.to_hdf(self.f, 'df', format='table') - - def time_packers_read_hdf_table(self): - pd.read_hdf(self.f, 'df') - - -class packers_read_json(_Packers): - - def setup(self): - self._setup() - self.df.to_json(self.f, orient='split') - self.df.index = np.arange(self.N) - - def time_packers_read_json(self): - pd.read_json(self.f, orient='split') - - -class packers_read_json_date_index(_Packers): - - def setup(self): - self._setup() - self.remove(self.f) - self.df.to_json(self.f, orient='split') - - def time_packers_read_json_date_index(self): - pd.read_json(self.f, orient='split') - - -class packers_read_pack(_Packers): - - def setup(self): - self._setup() - self.df2.to_msgpack(self.f) - - def time_packers_read_pack(self): - pd.read_msgpack(self.f) - - -class packers_read_pickle(_Packers): - - def setup(self): - self._setup() - self.df2.to_pickle(self.f) - - def time_packers_read_pickle(self): - pd.read_pickle(self.f) - -class packers_read_sql(_Packers): - - def setup(self): - self._setup() - self.engine = create_engine('sqlite:///:memory:') - self.df2.to_sql('table', self.engine, if_exists='replace') - - def time_packers_read_sql(self): - pd.read_sql_table('table', self.engine) - - -class packers_read_stata(_Packers): - - def setup(self): - self._setup() - self.df.to_stata(self.f, {'index': 'tc', }) - - def 
time_packers_read_stata(self): - pd.read_stata(self.f) - - -class packers_read_stata_with_validation(_Packers): - - def setup(self): - self._setup() - self.df['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)] - self.df['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] - self.df['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] - self.df['float32_'] = np.array(randn(self.N), dtype=np.float32) - self.df.to_stata(self.f, {'index': 'tc', }) - - def time_packers_read_stata_with_validation(self): - pd.read_stata(self.f) - - -class packers_read_sas(_Packers): - - def setup(self): - - testdir = os.path.join(os.path.dirname(__file__), '..', '..', - 'pandas', 'tests', 'io', 'sas') - if not os.path.exists(testdir): - testdir = os.path.join(os.path.dirname(__file__), '..', '..', - 'pandas', 'io', 'tests', 'sas') - self.f = os.path.join(testdir, 'data', 'test1.sas7bdat') - self.f2 = os.path.join(testdir, 'data', 'paxraw_d_short.xpt') - - def time_read_sas7bdat(self): - pd.read_sas(self.f, format='sas7bdat') - - def time_read_xport(self): - pd.read_sas(self.f2, format='xport') - - -class CSV(_Packers): - - def setup(self): - self._setup() - - def time_write_csv(self): - self.df.to_csv(self.f) - - def teardown(self): - self.remove(self.f) - - -class Excel(_Packers): - - def setup(self): - self._setup() - self.bio = BytesIO() - - def time_write_excel_openpyxl(self): - self.bio.seek(0) - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='openpyxl') - self.df[:2000].to_excel(self.writer) - self.writer.save() - - def time_write_excel_xlsxwriter(self): - self.bio.seek(0) - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlsxwriter') - self.df[:2000].to_excel(self.writer) - self.writer.save() - - def time_write_excel_xlwt(self): - self.bio.seek(0) - self.writer = pd.io.excel.ExcelWriter(self.bio, engine='xlwt') - self.df[:2000].to_excel(self.writer) - self.writer.save() - - -class HDF(_Packers): - - def setup(self): - self._setup() - - def time_write_hdf_store(self): - self.df2.to_hdf(self.f, 'df') - - def time_write_hdf_table(self): - self.df2.to_hdf(self.f, 'df', table=True) - - def teardown(self): - self.remove(self.f) - -class JSON(_Packers): - - def setup(self): - self._setup() - self.df_date = self.df.copy() - self.df.index = np.arange(self.N) - self.cols = [(lambda i: ('{0}_timedelta'.format(i), [pd.Timedelta(('%d seconds' % randrange(1000000.0))) for _ in range(self.N)])), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_timestamp'.format(i), [pd.Timestamp((1418842918083256000 + randrange(1000000000.0, 1e+18, 200))) for _ in range(self.N)]))] - self.df_mixed = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - - self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N)))] - self.df_mixed2 = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - - self.cols = [(lambda i: ('{0}_float'.format(i), randn(self.N))), (lambda i: ('{0}_int'.format(i), randint(100000000.0, size=self.N))), (lambda i: ('{0}_str'.format(i), [('%08x' % randrange((16 ** 8))) for _ in range(self.N)]))] - self.df_mixed3 = DataFrame(OrderedDict([self.cols[(i % len(self.cols))](i) for i in range(self.C)]), index=self.index) - - def time_write_json(self): - 
self.df.to_json(self.f, orient='split') - - def time_write_json_T(self): - self.df.to_json(self.f, orient='columns') - - def time_write_json_date_index(self): - self.df_date.to_json(self.f, orient='split') - - def time_write_json_mixed_delta_int_tstamp(self): - self.df_mixed.to_json(self.f, orient='split') - - def time_write_json_mixed_float_int(self): - self.df_mixed2.to_json(self.f, orient='index') - - def time_write_json_mixed_float_int_T(self): - self.df_mixed2.to_json(self.f, orient='columns') - - def time_write_json_mixed_float_int_str(self): - self.df_mixed3.to_json(self.f, orient='split') - - def time_write_json_lines(self): - self.df.to_json(self.f, orient="records", lines=True) - - def teardown(self): - self.remove(self.f) - - -class MsgPack(_Packers): - - def setup(self): - self._setup() - - def time_write_msgpack(self): - self.df2.to_msgpack(self.f) - - def teardown(self): - self.remove(self.f) - - -class Pickle(_Packers): - - def setup(self): - self._setup() - - def time_write_pickle(self): - self.df2.to_pickle(self.f) - - def teardown(self): - self.remove(self.f) - - -class SQL(_Packers): - - def setup(self): - self._setup() - self.engine = create_engine('sqlite:///:memory:') - - def time_write_sql(self): - self.df2.to_sql('table', self.engine, if_exists='replace') - - -class STATA(_Packers): - - def setup(self): - self._setup() - - self.df3=self.df.copy() - self.df3['int8_'] = [randint(np.iinfo(np.int8).min, (np.iinfo(np.int8).max - 27)) for _ in range(self.N)] - self.df3['int16_'] = [randint(np.iinfo(np.int16).min, (np.iinfo(np.int16).max - 27)) for _ in range(self.N)] - self.df3['int32_'] = [randint(np.iinfo(np.int32).min, (np.iinfo(np.int32).max - 27)) for _ in range(self.N)] - self.df3['float32_'] = np.array(randn(self.N), dtype=np.float32) - - def time_write_stata(self): - self.df.to_stata(self.f, {'index': 'tc', }) - - def time_write_stata_with_validation(self): - self.df3.to_stata(self.f, {'index': 'tc', }) - - def teardown(self): - self.remove(self.f) diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index b1a58e49fe86c4..e255cd94f265bf 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -1,35 +1,43 @@ -from pandas import * -import pandas as pd -from numpy.random import randn -from numpy.random import randint -import pandas.util.testing as tm -import random -import numpy as np -import threading +import os from importlib import import_module -try: - from pandas.compat import range -except ImportError: - pass - -np.random.seed(1234) +import numpy as np +from pandas import Panel -# try em until it works! -for imp in ['pandas._libs.lib', 'pandas.lib', 'pandas_tseries']: +# Compatibility import for lib +for imp in ['pandas._libs.lib', 'pandas.lib']: try: lib = import_module(imp) break except: pass -try: - Panel = Panel -except Exception: - Panel = WidePanel +numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32, + np.float64, np.int16, np.int8, np.uint16, np.uint8] +datetime_dtypes = [np.datetime64, np.timedelta64] + + +def setup(*args, **kwargs): + # This function just needs to be imported into each benchmark file to + # set up the random seed before each function. 
+ # http://asv.readthedocs.io/en/latest/writing_benchmarks.html + np.random.seed(1234) + + +class BaseIO(object): + """ + Base class for IO benchmarks + """ + fname = None + + def remove(self, f): + """Remove created files""" + try: + os.remove(f) + except: + # On Windows, attempting to remove a file that is in use + # causes an exception to be raised + pass -# didn't add to namespace until later -try: - from pandas.core.index import MultiIndex -except ImportError: - pass + def teardown(self, *args, **kwargs): + self.remove(self.fname) diff --git a/asv_bench/benchmarks/panel_ctor.py b/asv_bench/benchmarks/panel_ctor.py index cc6071b054662f..ce946c76ed1996 100644 --- a/asv_bench/benchmarks/panel_ctor.py +++ b/asv_bench/benchmarks/panel_ctor.py @@ -1,65 +1,60 @@ -from .pandas_vb_common import * -from datetime import timedelta +import warnings +from datetime import datetime, timedelta +from pandas import DataFrame, DatetimeIndex, date_range -class Constructors1(object): - goal_time = 0.2 - - def setup(self): - self.data_frames = {} - self.start = datetime(1990, 1, 1) - self.end = datetime(2012, 1, 1) - for x in range(100): - self.end += timedelta(days=1) - self.dr = np.asarray(date_range(self.start, self.end)) - self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr) - self.data_frames[x] = self.df - - def time_panel_from_dict_all_different_indexes(self): - Panel.from_dict(self.data_frames) +from .pandas_vb_common import Panel, setup # noqa -class Constructors2(object): +class DifferentIndexes(object): goal_time = 0.2 def setup(self): self.data_frames = {} + start = datetime(1990, 1, 1) + end = datetime(2012, 1, 1) for x in range(100): - self.dr = np.asarray(DatetimeIndex(start=datetime(1990, 1, 1), end=datetime(2012, 1, 1), freq='D')) - self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr) - self.data_frames[x] = self.df + end += timedelta(days=1) + idx = date_range(start, end) + df = DataFrame({'a': 0, 'b': 1, 'c': 2}, index=idx) + self.data_frames[x] = df - def time_panel_from_dict_equiv_indexes(self): - Panel.from_dict(self.data_frames) + def time_from_dict(self): + with warnings.catch_warnings(record=True): + Panel.from_dict(self.data_frames) -class Constructors3(object): +class SameIndexes(object): + goal_time = 0.2 def setup(self): - self.dr = np.asarray(DatetimeIndex(start=datetime(1990, 1, 1), end=datetime(2012, 1, 1), freq='D')) - self.data_frames = {} - for x in range(100): - self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr) - self.data_frames[x] = self.df + idx = DatetimeIndex(start=datetime(1990, 1, 1), + end=datetime(2012, 1, 1), + freq='D') + df = DataFrame({'a': 0, 'b': 1, 'c': 2}, index=idx) + self.data_frames = dict(enumerate([df] * 100)) - def time_panel_from_dict_same_index(self): - Panel.from_dict(self.data_frames) + def time_from_dict(self): + with warnings.catch_warnings(record=True): + Panel.from_dict(self.data_frames) -class Constructors4(object): +class TwoIndexes(object): + goal_time = 0.2 def setup(self): - self.data_frames = {} - self.start = datetime(1990, 1, 1) - self.end = datetime(2012, 1, 1) - for x in range(100): - if (x == 50): - self.end += timedelta(days=1) - self.dr = np.asarray(date_range(self.start, self.end)) - self.df = DataFrame({'a': ([0] * len(self.dr)), 'b': ([1] * len(self.dr)), 'c': ([2] * len(self.dr)), }, index=self.dr) - self.data_frames[x] = 
self.df - - def time_panel_from_dict_two_different_indexes(self): - Panel.from_dict(self.data_frames) + start = datetime(1990, 1, 1) + end = datetime(2012, 1, 1) + df1 = DataFrame({'a': 0, 'b': 1, 'c': 2}, + index=DatetimeIndex(start=start, end=end, freq='D')) + end += timedelta(days=1) + df2 = DataFrame({'a': 0, 'b': 1, 'c': 2}, + index=DatetimeIndex(start=start, end=end, freq='D')) + dfs = [df1] * 50 + [df2] * 50 + self.data_frames = dict(enumerate(dfs)) + + def time_from_dict(self): + with warnings.catch_warnings(record=True): + Panel.from_dict(self.data_frames) diff --git a/asv_bench/benchmarks/panel_methods.py b/asv_bench/benchmarks/panel_methods.py index 6609305502011b..a5b1a92e9cf679 100644 --- a/asv_bench/benchmarks/panel_methods.py +++ b/asv_bench/benchmarks/panel_methods.py @@ -1,24 +1,24 @@ -from .pandas_vb_common import * +import warnings +import numpy as np -class PanelMethods(object): - goal_time = 0.2 +from .pandas_vb_common import Panel, setup # noqa - def setup(self): - self.index = date_range(start='2000', freq='D', periods=1000) - self.panel = Panel(np.random.randn(100, len(self.index), 1000)) - def time_pct_change_items(self): - self.panel.pct_change(1, axis='items') +class PanelMethods(object): - def time_pct_change_major(self): - self.panel.pct_change(1, axis='major') + goal_time = 0.2 + params = ['items', 'major', 'minor'] + param_names = ['axis'] - def time_pct_change_minor(self): - self.panel.pct_change(1, axis='minor') + def setup(self, axis): + with warnings.catch_warnings(record=True): + self.panel = Panel(np.random.randn(100, 1000, 100)) - def time_shift(self): - self.panel.shift(1) + def time_pct_change(self, axis): + with warnings.catch_warnings(record=True): + self.panel.pct_change(1, axis=axis) - def time_shift_minor(self): - self.panel.shift(1, axis='minor') + def time_shift(self, axis): + with warnings.catch_warnings(record=True): + self.panel.shift(1, axis=axis) diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py deleted file mode 100644 index 32bf7e50d1a896..00000000000000 --- a/asv_bench/benchmarks/parser_vb.py +++ /dev/null @@ -1,121 +0,0 @@ -from .pandas_vb_common import * -import os -from pandas import read_csv -try: - from cStringIO import StringIO -except ImportError: - from io import StringIO - - -class read_csv1(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 8 - self.df = DataFrame((np.random.randn(self.N, self.K) * np.random.randint(100, 10000, (self.N, self.K)))) - self.df.to_csv('test.csv', sep='|') - - self.format = (lambda x: '{:,}'.format(x)) - self.df2 = self.df.applymap(self.format) - self.df2.to_csv('test2.csv', sep='|') - - def time_sep(self): - read_csv('test.csv', sep='|') - - def time_thousands(self): - read_csv('test.csv', sep='|', thousands=',') - - def teardown(self): - os.remove('test.csv') - os.remove('test2.csv') - - -class read_csv2(object): - goal_time = 0.2 - - def setup(self): - self.data = ['A,B,C'] - self.data = (self.data + (['1,2,3 # comment'] * 100000)) - self.data = '\n'.join(self.data) - - def time_comment(self): - read_csv(StringIO(self.data), comment='#') - - -class read_csv3(object): - goal_time = 0.2 - - def setup(self): - self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n -0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n -0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 
-0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n -0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" - self.data2 = self.data.replace(',', ';').replace('.', ',') - self.data = (self.data * 200) - self.data2 = (self.data2 * 200) - - def time_default_converter(self): - read_csv(StringIO(self.data), sep=',', header=None, - float_precision=None) - - def time_default_converter_with_decimal(self): - read_csv(StringIO(self.data2), sep=';', header=None, - float_precision=None, decimal=',') - - def time_default_converter_python_engine(self): - read_csv(StringIO(self.data), sep=',', header=None, - float_precision=None, engine='python') - - def time_default_converter_with_decimal_python_engine(self): - read_csv(StringIO(self.data2), sep=';', header=None, - float_precision=None, decimal=',', engine='python') - - def time_precise_converter(self): - read_csv(StringIO(self.data), sep=',', header=None, - float_precision='high') - - def time_roundtrip_converter(self): - read_csv(StringIO(self.data), sep=',', header=None, - float_precision='round_trip') - - -class read_csv_categorical(object): - goal_time = 0.2 - - def setup(self): - N = 100000 - group1 = ['aaaaaaaa', 'bbbbbbb', 'cccccccc', 'dddddddd', 'eeeeeeee'] - df = DataFrame({'a': np.random.choice(group1, N).astype('object'), - 'b': np.random.choice(group1, N).astype('object'), - 'c': np.random.choice(group1, N).astype('object')}) - df.to_csv('strings.csv', index=False) - - def time_convert_post(self): - read_csv('strings.csv').apply(pd.Categorical) - - def time_convert_direct(self): - read_csv('strings.csv', dtype='category') - - def teardown(self): - os.remove('strings.csv') - - -class read_csv_dateparsing(object): - goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 8 - self.data = 'KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n ' - self.data = (self.data * 200) - self.data2 = 'KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000\n KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000\n KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000\n KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000\n KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000\n ' - self.data2 = (self.data2 * 200) - - def time_multiple_date(self): - read_csv(StringIO(self.data), sep=',', header=None, - parse_dates=[[1, 2], [1, 3]]) - - def time_baseline(self): - read_csv(StringIO(self.data2), sep=',', header=None, parse_dates=[1]) diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index df3c2bf3e4b464..c34f9a737473e9 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,143 +1,100 @@ -import pandas as pd -from pandas import Series, Period, PeriodIndex, date_range +from pandas import (DataFrame, Series, Period, PeriodIndex, date_range, + period_range) class PeriodProperties(object): - def setup(self): - self.per = Period('2012-06-01', freq='M') - def time_year(self): - self.per.year + params = (['M', 'min'], + ['year', 'month', 'day', 'hour', 'minute', 'second', + 
'is_leap_year', 'quarter', 'qyear', 'week', 'daysinmonth', + 'dayofweek', 'dayofyear', 'start_time', 'end_time']) + param_names = ['freq', 'attr'] + + def setup(self, freq, attr): + self.per = Period('2012-06-01', freq=freq) - def time_month(self): - self.per.month + def time_property(self, freq, attr): + getattr(self.per, attr) - def time_quarter(self): - self.per.quarter - def time_day(self): - self.per.day +class PeriodUnaryMethods(object): - def time_hour(self): - self.per.hour + params = ['M', 'min'] + param_names = ['freq'] - def time_minute(self): - self.per.second + def setup(self, freq): + self.per = Period('2012-06-01', freq=freq) - def time_second(self): - self.per.second + def time_to_timestamp(self, freq): + self.per.to_timestamp() - def time_leap_year(self): - self.per.is_leapyear + def time_now(self, freq): + self.per.now(freq) + + def time_asfreq(self, freq): + self.per.asfreq('A') -class Constructor(object): +class PeriodIndexConstructor(object): + goal_time = 0.2 - def setup(self): + params = ['D'] + param_names = ['freq'] + + def setup(self, freq): self.rng = date_range('1985', periods=1000) self.rng2 = date_range('1985', periods=1000).to_pydatetime() - def time_from_date_range(self): - PeriodIndex(self.rng, freq='D') + def time_from_date_range(self, freq): + PeriodIndex(self.rng, freq=freq) + + def time_from_pydatetime(self, freq): + PeriodIndex(self.rng2, freq=freq) - def time_from_pydatetime(self): - PeriodIndex(self.rng2, freq='D') +class DataFramePeriodColumn(object): -class DataFrame(object): goal_time = 0.2 def setup(self): - self.rng = pd.period_range(start='1/1/1990', freq='S', periods=20000) - self.df = pd.DataFrame(index=range(len(self.rng))) + self.rng = period_range(start='1/1/1990', freq='S', periods=20000) + self.df = DataFrame(index=range(len(self.rng))) def time_setitem_period_column(self): self.df['col'] = self.rng + def time_set_index(self): + # GH#21582 limited by comparisons of Period objects + self.df['col2'] = self.rng + self.df.set_index('col2', append=True) + class Algorithms(object): + goal_time = 0.2 - def setup(self): + params = ['index', 'series'] + param_names = ['typ'] + + def setup(self, typ): data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'), Period('2011-03', freq='M'), Period('2011-04', freq='M')] - self.s = Series(data * 1000) - self.i = PeriodIndex(data, freq='M') - - def time_drop_duplicates_pseries(self): - self.s.drop_duplicates() - - def time_drop_duplicates_pindex(self): - self.i.drop_duplicates() - - def time_value_counts_pseries(self): - self.s.value_counts() - - def time_value_counts_pindex(self): - self.i.value_counts() - -class Properties(object): - def setup(self): - self.per = Period('2017-09-06 08:28', freq='min') - - def time_year(self): - self.per.year - - def time_month(self): - self.per.month - - def time_day(self): - self.per.day - - def time_hour(self): - self.per.hour - - def time_minute(self): - self.per.minute - - def time_second(self): - self.per.second - - def time_is_leap_year(self): - self.per.is_leap_year - - def time_quarter(self): - self.per.quarter + if typ == 'index': + self.vector = PeriodIndex(data * 1000, freq='M') + elif typ == 'series': + self.vector = Series(data * 1000) - def time_qyear(self): - self.per.qyear + def time_drop_duplicates(self, typ): + self.vector.drop_duplicates() - def time_week(self): - self.per.week + def time_value_counts(self, typ): + self.vector.value_counts() - def time_daysinmonth(self): - self.per.daysinmonth - - def time_dayofweek(self): - 
self.per.dayofweek - - def time_dayofyear(self): - self.per.dayofyear - - def time_start_time(self): - self.per.start_time - - def time_end_time(self): - self.per.end_time - - def time_to_timestamp(): - self.per.to_timestamp() - - def time_now(): - self.per.now() - - def time_asfreq(): - self.per.asfreq('A') +class Indexing(object): -class period_standard_indexing(object): goal_time = 0.2 def setup(self): @@ -158,7 +115,7 @@ def time_series_loc(self): self.series.loc[self.period] def time_align(self): - pd.DataFrame({'a': self.series, 'b': self.series[:500]}) + DataFrame({'a': self.series, 'b': self.series[:500]}) def time_intersection(self): self.index[:750].intersection(self.index[250:]) diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index dda684b35e3018..5b49112b0e07d7 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -1,24 +1,44 @@ -from .pandas_vb_common import * -try: - from pandas import date_range -except ImportError: - def date_range(start=None, end=None, periods=None, freq=None): - return DatetimeIndex(start, end, periods=periods, offset=freq) +import numpy as np +from pandas import DataFrame, Series, DatetimeIndex, date_range try: from pandas.plotting import andrews_curves except ImportError: from pandas.tools.plotting import andrews_curves +import matplotlib +matplotlib.use('Agg') + +from .pandas_vb_common import setup # noqa + + +class Plotting(object): + + goal_time = 0.2 + + def setup(self): + self.s = Series(np.random.randn(1000000)) + self.df = DataFrame({'col': self.s}) + + def time_series_plot(self): + self.s.plot() + + def time_frame_plot(self): + self.df.plot() class TimeseriesPlotting(object): + goal_time = 0.2 def setup(self): - import matplotlib - matplotlib.use('Agg') - self.N = 2000 - self.M = 5 - self.df = DataFrame(np.random.randn(self.N, self.M), index=date_range('1/1/1975', periods=self.N)) + N = 2000 + M = 5 + idx = date_range('1/1/1975', periods=N) + self.df = DataFrame(np.random.randn(N, M), index=idx) + + idx_irregular = DatetimeIndex(np.concatenate((idx.values[0:10], + idx.values[12:]))) + self.df2 = DataFrame(np.random.randn(len(idx_irregular), M), + index=idx_irregular) def time_plot_regular(self): self.df.plot() @@ -26,18 +46,19 @@ def time_plot_regular(self): def time_plot_regular_compat(self): self.df.plot(x_compat=True) + def time_plot_irregular(self): + self.df2.plot() + class Misc(object): + goal_time = 0.6 def setup(self): - import matplotlib - matplotlib.use('Agg') - self.N = 500 - self.M = 10 - data_dict = {x: np.random.randn(self.N) for x in range(self.M)} - data_dict["Name"] = ["A"] * self.N - self.df = DataFrame(data_dict) + N = 500 + M = 10 + self.df = DataFrame(np.random.randn(N, M)) + self.df['Name'] = ["A"] * N def time_plot_andrews_curves(self): andrews_curves(self.df, "Name") diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py index 537d275e7c727e..413427a16f40bf 100644 --- a/asv_bench/benchmarks/reindex.py +++ b/asv_bench/benchmarks/reindex.py @@ -1,89 +1,77 @@ -from .pandas_vb_common import * -from random import shuffle +import numpy as np +import pandas.util.testing as tm +from pandas import (DataFrame, Series, DatetimeIndex, MultiIndex, Index, + date_range) +from .pandas_vb_common import setup, lib # noqa -class Reindexing(object): +class Reindex(object): + goal_time = 0.2 def setup(self): - self.rng = DatetimeIndex(start='1/1/1970', periods=10000, freq='1min') - self.df = DataFrame(np.random.rand(10000, 10), index=self.rng, + rng = 
DatetimeIndex(start='1/1/1970', periods=10000, freq='1min') + self.df = DataFrame(np.random.rand(10000, 10), index=rng, columns=range(10)) self.df['foo'] = 'bar' - self.rng2 = Index(self.rng[::2]) - + self.rng_subset = Index(rng[::2]) self.df2 = DataFrame(index=range(10000), data=np.random.rand(10000, 30), columns=range(30)) - - # multi-index N = 5000 K = 200 level1 = tm.makeStringIndex(N).values.repeat(K) level2 = np.tile(tm.makeStringIndex(K).values, N) index = MultiIndex.from_arrays([level1, level2]) - self.s1 = Series(np.random.randn((N * K)), index=index) - self.s2 = self.s1[::2] + self.s = Series(np.random.randn(N * K), index=index) + self.s_subset = self.s[::2] def time_reindex_dates(self): - self.df.reindex(self.rng2) + self.df.reindex(self.rng_subset) def time_reindex_columns(self): self.df2.reindex(columns=self.df.columns[1:5]) def time_reindex_multiindex(self): - self.s1.reindex(self.s2.index) + self.s.reindex(self.s_subset.index) -#---------------------------------------------------------------------- -# Pad / backfill +class ReindexMethod(object): - -class FillMethod(object): goal_time = 0.2 + params = ['pad', 'backfill'] + param_names = ['method'] - def setup(self): - self.rng = date_range('1/1/2000', periods=100000, freq='1min') - self.ts = Series(np.random.randn(len(self.rng)), index=self.rng) - self.ts2 = self.ts[::2] - self.ts3 = self.ts2.reindex(self.ts.index) - self.ts4 = self.ts3.astype('float32') - - def pad(self, source_series, target_index): - try: - source_series.reindex(target_index, method='pad') - except: - source_series.reindex(target_index, fillMethod='pad') + def setup(self, method): + N = 100000 + self.idx = date_range('1/1/2000', periods=N, freq='1min') + self.ts = Series(np.random.randn(N), index=self.idx)[::2] - def backfill(self, source_series, target_index): - try: - source_series.reindex(target_index, method='backfill') - except: - source_series.reindex(target_index, fillMethod='backfill') + def time_reindex_method(self, method): + self.ts.reindex(self.idx, method=method) - def time_backfill_dates(self): - self.backfill(self.ts2, self.ts.index) - def time_pad_daterange(self): - self.pad(self.ts2, self.ts.index) +class Fillna(object): - def time_backfill(self): - self.ts3.fillna(method='backfill') - - def time_backfill_float32(self): - self.ts4.fillna(method='backfill') - - def time_pad(self): - self.ts3.fillna(method='pad') + goal_time = 0.2 + params = ['pad', 'backfill'] + param_names = ['method'] - def time_pad_float32(self): - self.ts4.fillna(method='pad') + def setup(self, method): + N = 100000 + self.idx = date_range('1/1/2000', periods=N, freq='1min') + ts = Series(np.random.randn(N), index=self.idx)[::2] + self.ts_reindexed = ts.reindex(self.idx) + self.ts_float32 = self.ts_reindexed.astype('float32') + def time_reindexed(self, method): + self.ts_reindexed.fillna(method=method) -#---------------------------------------------------------------------- -# align on level + def time_float_32(self, method): + self.ts_float32.fillna(method=method) class LevelAlign(object): + goal_time = 0.2 def setup(self): @@ -92,7 +80,6 @@ def setup(self): labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) self.df_level = DataFrame(np.random.randn(100, 4), @@ -102,106 +89,84 @@ def time_align_level(self): self.df.align(self.df_level, level=1, copy=False) def time_reindex_level(self): - 
self.df_level.reindex(self.df.index, level=1) - + self.df_level.reindex(self.index, level=1) -#---------------------------------------------------------------------- -# drop_duplicates +class DropDuplicates(object): -class Duplicates(object): goal_time = 0.2 - - def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, - 'value': np.random.randn((self.N * self.K)),}) - self.col_array_list = list(self.df.values.T) - - self.df2 = self.df.copy() - self.df2.ix[:10000, :] = np.nan + params = [True, False] + param_names = ['inplace'] + + def setup(self, inplace): + N = 10000 + K = 10 + key1 = tm.makeStringIndex(N).values.repeat(K) + key2 = tm.makeStringIndex(N).values.repeat(K) + self.df = DataFrame({'key1': key1, 'key2': key2, + 'value': np.random.randn(N * K)}) + self.df_nan = self.df.copy() + self.df_nan.iloc[:10000, :] = np.nan self.s = Series(np.random.randint(0, 1000, size=10000)) - self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10)) - - np.random.seed(1234) - self.N = 1000000 - self.K = 10000 - self.key1 = np.random.randint(0, self.K, size=self.N) - self.df_int = DataFrame({'key1': self.key1}) - self.df_bool = DataFrame({i: np.random.randint(0, 2, size=self.K, - dtype=bool) - for i in range(10)}) - - def time_frame_drop_dups(self): - self.df.drop_duplicates(['key1', 'key2']) - - def time_frame_drop_dups_inplace(self): - self.df.drop_duplicates(['key1', 'key2'], inplace=True) + self.s_str = Series(np.tile(tm.makeStringIndex(1000).values, 10)) - def time_frame_drop_dups_na(self): - self.df2.drop_duplicates(['key1', 'key2']) + N = 1000000 + K = 10000 + key1 = np.random.randint(0, K, size=N) + self.df_int = DataFrame({'key1': key1}) + self.df_bool = DataFrame(np.random.randint(0, 2, size=(K, 10), + dtype=bool)) - def time_frame_drop_dups_na_inplace(self): - self.df2.drop_duplicates(['key1', 'key2'], inplace=True) + def time_frame_drop_dups(self, inplace): + self.df.drop_duplicates(['key1', 'key2'], inplace=inplace) - def time_series_drop_dups_int(self): - self.s.drop_duplicates() + def time_frame_drop_dups_na(self, inplace): + self.df_nan.drop_duplicates(['key1', 'key2'], inplace=inplace) - def time_series_drop_dups_string(self): - self.s2.drop_duplicates() + def time_series_drop_dups_int(self, inplace): + self.s.drop_duplicates(inplace=inplace) - def time_frame_drop_dups_int(self): - self.df_int.drop_duplicates() + def time_series_drop_dups_string(self, inplace): + self.s_str.drop_duplicates(inplace=inplace) - def time_frame_drop_dups_bool(self): - self.df_bool.drop_duplicates() + def time_frame_drop_dups_int(self, inplace): + self.df_int.drop_duplicates(inplace=inplace) -#---------------------------------------------------------------------- -# blog "pandas escaped the zoo" + def time_frame_drop_dups_bool(self, inplace): + self.df_bool.drop_duplicates(inplace=inplace) class Align(object): + # blog "pandas escaped the zoo" goal_time = 0.2 def setup(self): n = 50000 indices = tm.makeStringIndex(n) subsample_size = 40000 - - def sample(values, k): - sampler = np.arange(len(values)) - shuffle(sampler) - return values.take(sampler[:k]) - - self.x = Series(np.random.randn(50000), indices) + self.x = Series(np.random.randn(n), indices) self.y = Series(np.random.randn(subsample_size), - index=sample(indices, subsample_size)) + index=np.random.choice(indices, subsample_size, + replace=False)) def 
time_align_series_irregular_string(self): - (self.x + self.y) + self.x + self.y class LibFastZip(object): + goal_time = 0.2 def setup(self): - self.N = 10000 - self.K = 10 - self.key1 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.key2 = tm.makeStringIndex(self.N).values.repeat(self.K) - self.df = DataFrame({'key1': self.key1, 'key2': self.key2, 'value': np.random.randn((self.N * self.K)), }) - self.col_array_list = list(self.df.values.T) - - self.df2 = self.df.copy() - self.df2.ix[:10000, :] = np.nan - self.col_array_list2 = list(self.df2.values.T) + N = 10000 + K = 10 + key1 = tm.makeStringIndex(N).values.repeat(K) + key2 = tm.makeStringIndex(N).values.repeat(K) + col_array = np.vstack([key1, key2, np.random.randn(N * K)]) + col_array2 = col_array.copy() + col_array2[:, :10000] = np.nan + self.col_array_list = list(col_array) def time_lib_fast_zip(self): lib.fast_zip(self.col_array_list) - - def time_lib_fast_zip_fillna(self): - lib.fast_zip_fillna(self.col_array_list2) diff --git a/asv_bench/benchmarks/replace.py b/asv_bench/benchmarks/replace.py index 63562f90eab2b6..41208125e8f321 100644 --- a/asv_bench/benchmarks/replace.py +++ b/asv_bench/benchmarks/replace.py @@ -1,70 +1,58 @@ -from .pandas_vb_common import * +import numpy as np +import pandas as pd +from .pandas_vb_common import setup # noqa -class replace_fillna(object): - goal_time = 0.2 - - def setup(self): - self.N = 1000000 - try: - self.rng = date_range('1/1/2000', periods=self.N, freq='min') - except NameError: - self.rng = DatetimeIndex('1/1/2000', periods=self.N, offset=datetools.Minute()) - self.date_range = DateRange - self.ts = Series(np.random.randn(self.N), index=self.rng) - def time_replace_fillna(self): - self.ts.fillna(0.0, inplace=True) +class FillNa(object): - -class replace_large_dict(object): goal_time = 0.2 + params = [True, False] + param_names = ['inplace'] - def setup(self): - self.n = (10 ** 6) - self.start_value = (10 ** 5) - self.to_rep = dict(((i, (self.start_value + i)) for i in range(self.n))) - self.s = Series(np.random.randint(self.n, size=(10 ** 3))) - - def time_replace_large_dict(self): - self.s.replace(self.to_rep, inplace=True) + def setup(self, inplace): + N = 10**6 + rng = pd.date_range('1/1/2000', periods=N, freq='min') + data = np.random.randn(N) + data[::2] = np.nan + self.ts = pd.Series(data, index=rng) + def time_fillna(self, inplace): + self.ts.fillna(0.0, inplace=inplace) -class replace_convert(object): - goal_time = 0.5 + def time_replace(self, inplace): + self.ts.replace(np.nan, 0.0, inplace=inplace) - def setup(self): - self.n = (10 ** 3) - self.to_ts = dict(((i, pd.Timestamp(i)) for i in range(self.n))) - self.to_td = dict(((i, pd.Timedelta(i)) for i in range(self.n))) - self.s = Series(np.random.randint(self.n, size=(10 ** 3))) - self.df = DataFrame({'A': np.random.randint(self.n, size=(10 ** 3)), - 'B': np.random.randint(self.n, size=(10 ** 3))}) - def time_replace_series_timestamp(self): - self.s.replace(self.to_ts) +class ReplaceDict(object): - def time_replace_series_timedelta(self): - self.s.replace(self.to_td) + goal_time = 0.2 + params = [True, False] + param_names = ['inplace'] - def time_replace_frame_timestamp(self): - self.df.replace(self.to_ts) + def setup(self, inplace): + N = 10**5 + start_value = 10**5 + self.to_rep = dict(enumerate(np.arange(N) + start_value)) + self.s = pd.Series(np.random.randint(N, size=10**3)) - def time_replace_frame_timedelta(self): - self.df.replace(self.to_td) + def time_replace_series(self, inplace): + 
self.s.replace(self.to_rep, inplace=inplace) -class replace_replacena(object): - goal_time = 0.2 +class Convert(object): - def setup(self): - self.N = 1000000 - try: - self.rng = date_range('1/1/2000', periods=self.N, freq='min') - except NameError: - self.rng = DatetimeIndex('1/1/2000', periods=self.N, offset=datetools.Minute()) - self.date_range = DateRange - self.ts = Series(np.random.randn(self.N), index=self.rng) - - def time_replace_replacena(self): - self.ts.replace(np.nan, 0.0, inplace=True) + goal_time = 0.5 + params = (['DataFrame', 'Series'], ['Timestamp', 'Timedelta']) + param_names = ['constructor', 'replace_data'] + + def setup(self, constructor, replace_data): + N = 10**3 + data = {'Series': pd.Series(np.random.randint(N, size=N)), + 'DataFrame': pd.DataFrame({'A': np.random.randint(N, size=N), + 'B': np.random.randint(N, size=N)})} + self.to_replace = {i: getattr(pd, replace_data)(i) for i in range(N)} + self.data = data[constructor] + + def time_replace(self, constructor, replace_data): + self.data.replace(self.to_replace)
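A standalone sketch of the workload the Convert benchmark above exercises (an illustrative aside, not part of the patch; it mirrors the setup with replace_data='Timestamp', matching the pd.Timestamp(i) values the deleted code built):

    import numpy as np
    import pandas as pd

    N = 10**3
    s = pd.Series(np.random.randint(N, size=N))
    # Replacing every integer key with a Timestamp value forces a dtype
    # conversion of the whole Series, which is what the benchmark times.
    s.replace({i: pd.Timestamp(i) for i in range(N)})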
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 177e3e7cb87fac..3cf9a32dab3984 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -1,13 +1,18 @@ -from .pandas_vb_common import * -from pandas import melt, wide_to_long +import string +from itertools import product +import numpy as np +from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long +import pandas as pd + +from .pandas_vb_common import setup # noqa + + +class Melt(object): -class melt_dataframe(object): goal_time = 0.2 def setup(self): - self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)]) - self.df = DataFrame(np.random.randn(10000, 4), index=self.index) self.df = DataFrame(np.random.randn(10000, 3), columns=['A', 'B', 'C']) self.df['id1'] = np.random.randint(0, 10, 10000) self.df['id2'] = np.random.randint(100, 1000, 10000) @@ -16,50 +21,42 @@ def time_melt_dataframe(self): melt(self.df, id_vars=['id1', 'id2']) -class reshape_pivot_time_series(object): +class Pivot(object): + goal_time = 0.2 def setup(self): - self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)]) - self.df = DataFrame(np.random.randn(10000, 4), index=self.index) - self.index = date_range('1/1/2000', periods=10000, freq='h') - self.df = DataFrame(randn(10000, 50), index=self.index, columns=range(50)) - self.pdf = self.unpivot(self.df) - self.f = (lambda : self.pdf.pivot('date', 'variable', 'value')) + N = 10000 + index = date_range('1/1/2000', periods=N, freq='h') + data = {'value': np.random.randn(N * 50), + 'variable': np.arange(50).repeat(N), + 'date': np.tile(index.values, 50)} + self.df = DataFrame(data) def time_reshape_pivot_time_series(self): - self.f() + self.df.pivot('date', 'variable', 'value') - def unpivot(self, frame): - (N, K) = frame.shape - self.data = {'value': frame.values.ravel('F'), 'variable': np.asarray(frame.columns).repeat(N), 'date': np.tile(np.asarray(frame.index), K), } - return DataFrame(self.data, columns=['date', 'variable', 'value']) +class SimpleReshape(object): -class reshape_stack_simple(object): goal_time = 0.2 def setup(self): - self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)]) - self.df = DataFrame(np.random.randn(10000, 4), index=self.index) + arrays = [np.arange(100).repeat(100), + np.roll(np.tile(np.arange(100), 100), 25)] + index = MultiIndex.from_arrays(arrays) + self.df = DataFrame(np.random.randn(10000, 4), index=index) self.udf = self.df.unstack(1) - def time_reshape_stack_simple(self): + def time_stack(self): self.udf.stack() - -class reshape_unstack_simple(object): - goal_time = 0.2 - - def setup(self): - self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)]) - self.df = DataFrame(np.random.randn(10000, 4), index=self.index) - - def time_reshape_unstack_simple(self): + def time_unstack(self): self.df.unstack(1) -class reshape_unstack_large_single_dtype(object): +class Unstack(object): + goal_time = 0.2 def setup(self): @@ -67,53 +64,89 @@ n = 1000 levels = np.arange(m) - index = pd.MultiIndex.from_product([levels]*2) + index = MultiIndex.from_product([levels] * 2) columns = np.arange(n) - values = np.arange(m*m*n).reshape(m*m, n) - self.df = pd.DataFrame(values, index, columns) + values = np.arange(m * m * n).reshape(m * m, n) + self.df = DataFrame(values, index, columns) self.df2 = self.df.iloc[:-1] - def time_unstack_full_product(self): + def time_full_product(self): self.df.unstack() - def time_unstack_with_mask(self): + def time_without_last_row(self): self.df2.unstack() -class unstack_sparse_keyspace(object): +class SparseIndex(object): + goal_time = 0.2 def setup(self): - self.index = MultiIndex.from_arrays([np.arange(100).repeat(100), np.roll(np.tile(np.arange(100), 100), 25)]) - self.df = DataFrame(np.random.randn(10000, 4), index=self.index) - self.NUM_ROWS = 1000 - for iter in range(10): - self.df = DataFrame({'A': np.random.randint(50, size=self.NUM_ROWS), 'B': np.random.randint(50, size=self.NUM_ROWS), 'C': np.random.randint((-10), 10, size=self.NUM_ROWS), 'D': np.random.randint((-10), 10, size=self.NUM_ROWS), 'E': np.random.randint(10, size=self.NUM_ROWS), 'F': np.random.randn(self.NUM_ROWS), }) - self.idf = self.df.set_index(['A', 'B', 'C', 'D', 'E']) - if (len(self.idf.index.unique()) == self.NUM_ROWS): - break + NUM_ROWS = 1000 + self.df = DataFrame({'A': np.random.randint(50, size=NUM_ROWS), + 'B': np.random.randint(50, size=NUM_ROWS), + 'C': np.random.randint(-10, 10, size=NUM_ROWS), + 'D': np.random.randint(-10, 10, size=NUM_ROWS), + 'E': np.random.randint(10, size=NUM_ROWS), + 'F': np.random.randn(NUM_ROWS)}) + self.df = self.df.set_index(['A', 'B', 'C', 'D', 'E']) + + def time_unstack(self): + self.df.unstack() - def time_unstack_sparse_keyspace(self): - self.idf.unstack() +class WideToLong(object): -class wide_to_long_big(object): goal_time = 0.2 def setup(self): - vars = 'ABCD' nyrs = 20 nidvars = 20 N = 5000 - yrvars = [] - for var in vars: - for yr in range(1, nyrs + 1): - yrvars.append(var + str(yr)) - - self.df = pd.DataFrame(np.random.randn(N, nidvars + len(yrvars)), - columns=list(range(nidvars)) + yrvars) - self.vars = vars + self.letters = list('ABCD') + yrvars = [l + str(num) + for l, num in product(self.letters, range(1, nyrs + 1))] + columns = [str(i) for i in range(nidvars)] + yrvars + self.df = DataFrame(np.random.randn(N, nidvars + len(yrvars)), + columns=columns) + self.df['id'] = self.df.index def time_wide_to_long_big(self): - self.df['id'] = self.df.index - wide_to_long(self.df, list(self.vars), i='id', j='year') + wide_to_long(self.df, self.letters, i='id', j='year') + + +class PivotTable(object): + + goal_time = 0.2 + + def setup(self): + N = 100000 + fac1 = np.array(['A', 'B', 'C'], dtype='O') + fac2 = np.array(['one', 'two'], dtype='O') + ind1 = np.random.randint(0, 3, size=N) + ind2 = 
np.random.randint(0, 2, size=N) + self.df = DataFrame({'key1': fac1.take(ind1), + 'key2': fac2.take(ind2), + 'key3': fac2.take(ind2), + 'value1': np.random.randn(N), + 'value2': np.random.randn(N), + 'value3': np.random.randn(N)}) + + def time_pivot_table(self): + self.df.pivot_table(index='key1', columns=['key2', 'key3']) + + +class GetDummies(object): + goal_time = 0.2 + + def setup(self): + categories = list(string.ascii_letters[:12]) + s = pd.Series(np.random.choice(categories, size=1000000), + dtype=pd.api.types.CategoricalDtype(categories)) + self.s = s + + def time_get_dummies_1d(self): + pd.get_dummies(self.s, sparse=False) + + def time_get_dummies_1d_sparse(self): + pd.get_dummies(self.s, sparse=True) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 899349cd21f840..e3bf551fa5f2b3 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -1,185 +1,79 @@ -from .pandas_vb_common import * import pandas as pd import numpy as np - -class DataframeRolling(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.Ns = 10000 - self.df = pd.DataFrame({'a': np.random.random(self.N)}) - self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)}) - self.wins = 10 - self.winl = 1000 - - def time_rolling_quantile_0(self): - (self.df.rolling(self.wins).quantile(0.0)) - - def time_rolling_quantile_1(self): - (self.df.rolling(self.wins).quantile(1.0)) - - def time_rolling_quantile_median(self): - (self.df.rolling(self.wins).quantile(0.5)) - - def time_rolling_median(self): - (self.df.rolling(self.wins).median()) - - def time_rolling_mean(self): - (self.df.rolling(self.wins).mean()) - - def time_rolling_max(self): - (self.df.rolling(self.wins).max()) - - def time_rolling_min(self): - (self.df.rolling(self.wins).min()) - - def time_rolling_std(self): - (self.df.rolling(self.wins).std()) - - def time_rolling_count(self): - (self.df.rolling(self.wins).count()) - - def time_rolling_skew(self): - (self.df.rolling(self.wins).skew()) - - def time_rolling_kurt(self): - (self.df.rolling(self.wins).kurt()) - - def time_rolling_sum(self): - (self.df.rolling(self.wins).sum()) - - def time_rolling_corr(self): - (self.dfs.rolling(self.wins).corr()) - - def time_rolling_cov(self): - (self.dfs.rolling(self.wins).cov()) - - def time_rolling_quantile_0_l(self): - (self.df.rolling(self.winl).quantile(0.0)) - - def time_rolling_quantile_1_l(self): - (self.df.rolling(self.winl).quantile(1.0)) - - def time_rolling_quantile_median_l(self): - (self.df.rolling(self.winl).quantile(0.5)) - - def time_rolling_median_l(self): - (self.df.rolling(self.winl).median()) - - def time_rolling_mean_l(self): - (self.df.rolling(self.winl).mean()) - - def time_rolling_max_l(self): - (self.df.rolling(self.winl).max()) - - def time_rolling_min_l(self): - (self.df.rolling(self.winl).min()) - - def time_rolling_std_l(self): - (self.df.rolling(self.wins).std()) - - def time_rolling_count_l(self): - (self.df.rolling(self.wins).count()) - - def time_rolling_skew_l(self): - (self.df.rolling(self.wins).skew()) - - def time_rolling_kurt_l(self): - (self.df.rolling(self.wins).kurt()) - - def time_rolling_sum_l(self): - (self.df.rolling(self.wins).sum()) - - -class SeriesRolling(object): - goal_time = 0.2 - - def setup(self): - self.N = 100000 - self.Ns = 10000 - self.df = pd.DataFrame({'a': np.random.random(self.N)}) - self.dfs = pd.DataFrame({'a': np.random.random(self.Ns)}) - self.sr = self.df.a - self.srs = self.dfs.a - self.wins = 10 - self.winl = 1000 - - def 
time_rolling_quantile_0(self): - (self.sr.rolling(self.wins).quantile(0.0)) - - def time_rolling_quantile_1(self): - (self.sr.rolling(self.wins).quantile(1.0)) - - def time_rolling_quantile_median(self): - (self.sr.rolling(self.wins).quantile(0.5)) - - def time_rolling_median(self): - (self.sr.rolling(self.wins).median()) - - def time_rolling_mean(self): - (self.sr.rolling(self.wins).mean()) - - def time_rolling_max(self): - (self.sr.rolling(self.wins).max()) - - def time_rolling_min(self): - (self.sr.rolling(self.wins).min()) - - def time_rolling_std(self): - (self.sr.rolling(self.wins).std()) - - def time_rolling_count(self): - (self.sr.rolling(self.wins).count()) - - def time_rolling_skew(self): - (self.sr.rolling(self.wins).skew()) - - def time_rolling_kurt(self): - (self.sr.rolling(self.wins).kurt()) - - def time_rolling_sum(self): - (self.sr.rolling(self.wins).sum()) - - def time_rolling_corr(self): - (self.srs.rolling(self.wins).corr()) - - def time_rolling_cov(self): - (self.srs.rolling(self.wins).cov()) - - def time_rolling_quantile_0_l(self): - (self.sr.rolling(self.winl).quantile(0.0)) - - def time_rolling_quantile_1_l(self): - (self.sr.rolling(self.winl).quantile(1.0)) - - def time_rolling_quantile_median_l(self): - (self.sr.rolling(self.winl).quantile(0.5)) - - def time_rolling_median_l(self): - (self.sr.rolling(self.winl).median()) - - def time_rolling_mean_l(self): - (self.sr.rolling(self.winl).mean()) - - def time_rolling_max_l(self): - (self.sr.rolling(self.winl).max()) - - def time_rolling_min_l(self): - (self.sr.rolling(self.winl).min()) - - def time_rolling_std_l(self): - (self.sr.rolling(self.wins).std()) - - def time_rolling_count_l(self): - (self.sr.rolling(self.wins).count()) - - def time_rolling_skew_l(self): - (self.sr.rolling(self.wins).skew()) - - def time_rolling_kurt_l(self): - (self.sr.rolling(self.wins).kurt()) - - def time_rolling_sum_l(self): - (self.sr.rolling(self.wins).sum()) +from .pandas_vb_common import setup # noqa + + +class Methods(object): + + sample_time = 0.2 + params = (['DataFrame', 'Series'], + [10, 1000], + ['int', 'float'], + ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', + 'sum']) + param_names = ['constructor', 'window', 'dtype', 'method'] + + def setup(self, constructor, window, dtype, method): + N = 10**5 + arr = (100 * np.random.random(N)).astype(dtype) + self.roll = getattr(pd, constructor)(arr).rolling(window) + + def time_rolling(self, constructor, window, dtype, method): + getattr(self.roll, method)() + + +class VariableWindowMethods(Methods): + sample_time = 0.2 + params = (['DataFrame', 'Series'], + ['50s', '1h', '1d'], + ['int', 'float'], + ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', + 'sum']) + param_names = ['constructor', 'window', 'dtype', 'method'] + + def setup(self, constructor, window, dtype, method): + N = 10**5 + arr = (100 * np.random.random(N)).astype(dtype) + index = pd.date_range('2017-01-01', periods=N, freq='5s') + self.roll = getattr(pd, constructor)(arr, index=index).rolling(window) + + +class Pairwise(object): + + sample_time = 0.2 + params = ([10, 1000, None], + ['corr', 'cov'], + [True, False]) + param_names = ['window', 'method', 'pairwise'] + + def setup(self, window, method, pairwise): + N = 10**4 + arr = np.random.random(N) + self.df = pd.DataFrame(arr) + + def time_pairwise(self, window, method, pairwise): + if window is None: + r = self.df.expanding() + else: + r = self.df.rolling(window=window) + getattr(r, method)(self.df, pairwise=pairwise) + +
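A sketch of how asv consumes the parameterized classes above (an aside based on asv's documented parameterization, not part of the patch): each combination from the cross-product of params is passed positionally to setup() and to every time_* method, roughly equivalent to:

    from itertools import product

    params = (['DataFrame', 'Series'], [10, 1000], ['int', 'float'],
              ['median', 'mean', 'max', 'min', 'std', 'count', 'skew',
               'kurt', 'sum'])
    for combo in product(*params):
        bench = Methods()
        bench.setup(*combo)         # e.g. ('Series', 10, 'int', 'median')
        bench.time_rolling(*combo)  # asv times this call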
+class Quantile(object): + sample_time = 0.2 + params = (['DataFrame', 'Series'], + [10, 1000], + ['int', 'float'], + [0, 0.5, 1], + ['linear', 'nearest', 'lower', 'higher', 'midpoint']) + param_names = ['constructor', 'window', 'dtype', 'percentile', + 'interpolation'] + + def setup(self, constructor, window, dtype, percentile, interpolation): + N = 10 ** 5 + arr = np.random.random(N).astype(dtype) + self.roll = getattr(pd, constructor)(arr).rolling(window) + + def time_quantile(self, constructor, window, dtype, percentile, + interpolation): + self.roll.quantile(percentile, interpolation=interpolation) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 3c0e2869357ae8..a26c5d89bc4839 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -1,133 +1,194 @@ -from .pandas_vb_common import * +from datetime import datetime +import numpy as np +import pandas.util.testing as tm +from pandas import Series, date_range, NaT + +from .pandas_vb_common import setup # noqa + + +class SeriesConstructor(object): -class series_constructor_no_data_datetime_index(object): goal_time = 0.2 + params = [None, 'dict'] + param_names = ['data'] - def setup(self): - self.dr = pd.date_range( - start=datetime(2015,10,26), - end=datetime(2016,1,1), - freq='50s' - ) # ~100k long + def setup(self, data): + self.idx = date_range(start=datetime(2015, 10, 26), + end=datetime(2016, 1, 1), + freq='50s') + dict_data = dict(zip(self.idx, range(len(self.idx)))) + self.data = None if data is None else dict_data - def time_series_constructor_no_data_datetime_index(self): - Series(data=None, index=self.dr) + def time_constructor(self, data): + Series(data=self.data, index=self.idx) -class series_constructor_dict_data_datetime_index(object): +class IsIn(object): + goal_time = 0.2 + params = ['int64', 'object'] + param_names = ['dtype'] - def setup(self): - self.dr = pd.date_range( - start=datetime(2015, 10, 26), - end=datetime(2016, 1, 1), - freq='50s' - ) # ~100k long - self.data = {d: v for d, v in zip(self.dr, range(len(self.dr)))} + def setup(self, dtype): + self.s = Series(np.random.randint(1, 10, 100000)).astype(dtype) + self.values = [1, 2] - def time_series_constructor_no_data_datetime_index(self): - Series(data=self.data, index=self.dr) + def time_isin(self, dtype): + self.s.isin(self.values) -class series_isin_int64(object): - goal_time = 0.2 +class IsInFloat64(object): def setup(self): - self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') - self.s4 = Series(np.random.randint(1, 100, 10000000)).astype('int64') - self.values = [1, 2] + self.small = Series([1, 2], dtype=np.float64) + self.many_different_values = np.arange(10**6, dtype=np.float64) + self.few_different_values = np.zeros(10**7, dtype=np.float64) + self.only_nans_values = np.full(10**7, np.nan, dtype=np.float64) - def time_series_isin_int64(self): - self.s3.isin(self.values) + def time_isin_many_different(self): + # runtime is dominated by creation of the lookup-table + self.small.isin(self.many_different_values) - def time_series_isin_int64_large(self): - self.s4.isin(self.values) + def time_isin_few_different(self): + # runtime is dominated by creation of the lookup-table + self.small.isin(self.few_different_values) + def time_isin_nan_values(self): + # runtime is dominated by creation of the lookup-table + self.small.isin(self.only_nans_values)
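The lookup-table comments above reflect that Series.isin builds its hash table from the values argument rather than from the calling Series, so the size of the argument dominates (a runnable aside written for this note, not part of the patch):

    import numpy as np
    from pandas import Series

    small = Series([1, 2], dtype=np.float64)
    # Hashing the 10**7 candidate values dominates the runtime; probing
    # the two elements of `small` against the finished table is cheap.
    small.isin(np.zeros(10**7, dtype=np.float64))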
-class series_isin_object(object): - goal_time = 0.2 + +class IsInForObjects(object): def setup(self): - self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') - self.values = [1, 2] - self.s4 = self.s3.astype('object') + self.s_nans = Series(np.full(10**4, np.nan)).astype(np.object) + self.vals_nans = np.full(10**4, np.nan).astype(np.object) + self.s_short = Series(np.arange(2)).astype(np.object) + self.s_long = Series(np.arange(10**5)).astype(np.object) + self.vals_short = np.arange(2).astype(np.object) + self.vals_long = np.arange(10**5).astype(np.object) + # because of nans floats are special: + self.s_long_floats = Series(np.arange(10**5, + dtype=np.float)).astype(np.object) + self.vals_long_floats = np.arange(10**5, + dtype=np.float).astype(np.object) + + def time_isin_nans(self): + # if nan-objects are different objects, + # this has the potential to trigger O(n^2) running time + self.s_nans.isin(self.vals_nans) + + def time_isin_short_series_long_values(self): + # running time dominated by the preprocessing + self.s_short.isin(self.vals_long) + + def time_isin_long_series_short_values(self): + # running time dominated by look-up + self.s_long.isin(self.vals_short) + + def time_isin_long_series_long_values(self): + # no dominating part + self.s_long.isin(self.vals_long) + + def time_isin_long_series_long_values_floats(self): + # no dominating part + self.s_long_floats.isin(self.vals_long_floats) + + +class NSort(object): - def time_series_isin_object(self): - self.s4.isin(self.values) + goal_time = 0.2 + params = ['first', 'last', 'all'] + param_names = ['keep'] + def setup(self, keep): + self.s = Series(np.random.randint(1, 10, 100000)) -class series_nlargest1(object): - goal_time = 0.2 + def time_nlargest(self, keep): + self.s.nlargest(3, keep=keep) - def setup(self): - self.s1 = Series(np.random.randn(10000)) - self.s2 = Series(np.random.randint(1, 10, 10000)) - self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') - self.values = [1, 2] - self.s4 = self.s3.astype('object') + def time_nsmallest(self, keep): + self.s.nsmallest(3, keep=keep) - def time_series_nlargest1(self): - self.s1.nlargest(3, keep='last') - self.s1.nlargest(3, keep='first') +class Dropna(object): -class series_nlargest2(object): goal_time = 0.2 + params = ['int', 'datetime'] + param_names = ['dtype'] + + def setup(self, dtype): + N = 10**6 + data = {'int': np.random.randint(1, 10, N), + 'datetime': date_range('2000-01-01', freq='S', periods=N)} + self.s = Series(data[dtype]) + if dtype == 'datetime': + self.s[np.random.randint(1, N, 100)] = NaT + + def time_dropna(self, dtype): + self.s.dropna() - def setup(self): - self.s1 = Series(np.random.randn(10000)) - self.s2 = Series(np.random.randint(1, 10, 10000)) - self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') - self.values = [1, 2] - self.s4 = self.s3.astype('object') - def time_series_nlargest2(self): - self.s2.nlargest(3, keep='last') - self.s2.nlargest(3, keep='first') +class Map(object): + + goal_time = 0.2 + params = ['dict', 'Series'] + param_names = ['mapper'] + + def setup(self, mapper): + map_size = 1000 + map_data = Series(map_size - np.arange(map_size)) + self.map_data = map_data if mapper == 'Series' else map_data.to_dict() + self.s = Series(np.random.randint(0, map_size, 10000)) + + def time_map(self, mapper): + self.s.map(self.map_data) + +class Clip(object): -class series_nsmallest2(object): goal_time = 0.2 def setup(self): - self.s1 = Series(np.random.randn(10000)) - self.s2 = Series(np.random.randint(1, 10, 10000)) - self.s3 = Series(np.random.randint(1, 10, 100000)).astype('int64') - self.values = [1, 2] - self.s4 = 
self.s3.astype('object') + self.s = Series(np.random.randn(50)) + + def time_clip(self): + self.s.clip(0, 1) - def time_series_nsmallest2(self): - self.s2.nsmallest(3, keep='last') - self.s2.nsmallest(3, keep='first') +class ValueCounts(object): -class series_dropna_int64(object): goal_time = 0.2 + params = ['int', 'float', 'object'] + param_names = ['dtype'] - def setup(self): - self.s = Series(np.random.randint(1, 10, 1000000)) + def setup(self, dtype): + self.s = Series(np.random.randint(0, 1000, size=100000)).astype(dtype) + + def time_value_counts(self, dtype): + self.s.value_counts() - def time_series_dropna_int64(self): - self.s.dropna() +class Dir(object): -class series_dropna_datetime(object): goal_time = 0.2 def setup(self): - self.s = Series(pd.date_range('2000-01-01', freq='S', periods=1000000)) - self.s[np.random.randint(1, 1000000, 100)] = pd.NaT + self.s = Series(index=tm.makeStringIndex(10000)) - def time_series_dropna_datetime(self): - self.s.dropna() + def time_dir_strings(self): + dir(self.s) -class series_clip(object): +class SeriesGetattr(object): + # https://github.com/pandas-dev/pandas/issues/19764 goal_time = 0.2 def setup(self): - self.s = pd.Series(np.random.randn(50)) + self.s = Series(1, + index=date_range("2012-01-01", freq='s', + periods=int(1e6))) - def time_series_dropna_datetime(self): - self.s.clip(0, 1) + def time_series_datetimeindex_repr(self): + getattr(self.s, 'a', None) diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index a46205026481e5..dcb7694abc2ad9 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -1,211 +1,162 @@ import itertools -from .pandas_vb_common import * +import numpy as np import scipy.sparse -from pandas import SparseSeries, SparseDataFrame, SparseArray +from pandas import (SparseSeries, SparseDataFrame, SparseArray, Series, + date_range, MultiIndex) +from .pandas_vb_common import setup # noqa -class sparse_series_to_frame(object): - goal_time = 0.2 - def setup(self): - self.K = 50 - self.N = 50000 - self.rng = np.asarray(date_range('1/1/2000', periods=self.N, freq='T')) - self.series = {} - for i in range(1, (self.K + 1)): - self.data = np.random.randn(self.N)[:(- i)] - self.this_rng = self.rng[:(- i)] - self.data[100:] = np.nan - self.series[i] = SparseSeries(self.data, index=self.this_rng) +def make_array(size, dense_proportion, fill_value, dtype): + dense_size = int(size * dense_proportion) + arr = np.full(size, fill_value, dtype) + indexer = np.random.choice(np.arange(size), dense_size, replace=False) + arr[indexer] = np.random.choice(np.arange(100, dtype=dtype), dense_size) + return arr - def time_sparse_series_to_frame(self): - SparseDataFrame(self.series) +class SparseSeriesToFrame(object): -class sparse_array_constructor(object): goal_time = 0.2 def setup(self): - np.random.seed(1) - self.int64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=0, dtype=np.int64) - self.int64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=0, dtype=np.int64) - - self.float64_10percent = self.make_numeric_array(length=1000000, dense_size=100000, fill_value=np.nan, dtype=np.float64) - self.float64_1percent = self.make_numeric_array(length=1000000, dense_size=10000, fill_value=np.nan, dtype=np.float64) - - self.object_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=np.nan) - self.object_nan_fill_value_1percent = self.make_object_array(length=1000000, 
dense_size=10000, fill_value=np.nan) - - self.object_non_nan_fill_value_10percent = self.make_object_array(length=1000000, dense_size=100000, fill_value=0) - self.object_non_nan_fill_value_1percent = self.make_object_array(length=1000000, dense_size=10000, fill_value=0) - - def make_numeric_array(self, length, dense_size, fill_value, dtype): - arr = np.array([fill_value] * length, dtype=dtype) - indexer = np.unique(np.random.randint(0, length, dense_size)) - arr[indexer] = np.random.randint(0, 100, len(indexer)) - return (arr, fill_value, dtype) - - def make_object_array(self, length, dense_size, fill_value): - elems = np.array(['a', 0.0, False, 1, 2], dtype=np.object) - arr = np.array([fill_value] * length, dtype=np.object) - indexer = np.unique(np.random.randint(0, length, dense_size)) - arr[indexer] = np.random.choice(elems, len(indexer)) - return (arr, fill_value, np.object) - - def time_sparse_array_constructor_int64_10percent(self): - arr, fill_value, dtype = self.int64_10percent - SparseArray(arr, fill_value=fill_value, dtype=dtype) - - def time_sparse_array_constructor_int64_1percent(self): - arr, fill_value, dtype = self.int64_1percent - SparseArray(arr, fill_value=fill_value, dtype=dtype) - - def time_sparse_array_constructor_float64_10percent(self): - arr, fill_value, dtype = self.float64_10percent - SparseArray(arr, fill_value=fill_value, dtype=dtype) - - def time_sparse_array_constructor_float64_1percent(self): - arr, fill_value, dtype = self.float64_1percent - SparseArray(arr, fill_value=fill_value, dtype=dtype) - - def time_sparse_array_constructor_object_nan_fill_value_10percent(self): - arr, fill_value, dtype = self.object_nan_fill_value_10percent - SparseArray(arr, fill_value=fill_value, dtype=dtype) - - def time_sparse_array_constructor_object_nan_fill_value_1percent(self): - arr, fill_value, dtype = self.object_nan_fill_value_1percent - SparseArray(arr, fill_value=fill_value, dtype=dtype) + K = 50 + N = 50001 + rng = date_range('1/1/2000', periods=N, freq='T') + self.series = {} + for i in range(1, K): + data = np.random.randn(N)[:-i] + idx = rng[:-i] + data[100:] = np.nan + self.series[i] = SparseSeries(data, index=idx) - def time_sparse_array_constructor_object_non_nan_fill_value_10percent(self): - arr, fill_value, dtype = self.object_non_nan_fill_value_10percent - SparseArray(arr, fill_value=fill_value, dtype=dtype) + def time_series_to_frame(self): + SparseDataFrame(self.series) - def time_sparse_array_constructor_object_non_nan_fill_value_1percent(self): - arr, fill_value, dtype = self.object_non_nan_fill_value_1percent - SparseArray(arr, fill_value=fill_value, dtype=dtype) +class SparseArrayConstructor(object): -class sparse_frame_constructor(object): goal_time = 0.2 + params = ([0.1, 0.01], [0, np.nan], + [np.int64, np.float64, np.object]) + param_names = ['dense_proportion', 'fill_value', 'dtype'] - def time_sparse_frame_constructor(self): - SparseDataFrame(columns=np.arange(100), index=np.arange(1000)) + def setup(self, dense_proportion, fill_value, dtype): + N = 10**6 + self.array = make_array(N, dense_proportion, fill_value, dtype) - def time_sparse_from_scipy(self): - SparseDataFrame(scipy.sparse.rand(1000, 1000, 0.005)) + def time_sparse_array(self, dense_proportion, fill_value, dtype): + SparseArray(self.array, fill_value=fill_value, dtype=dtype) - def time_sparse_from_dict(self): - SparseDataFrame(dict(zip(range(1000), itertools.repeat([0])))) +class SparseDataFrameConstructor(object): -class sparse_series_from_coo(object): goal_time = 0.2 def 
setup(self): - self.A = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(100, 100)) + N = 1000 + self.arr = np.arange(N) + self.sparse = scipy.sparse.rand(N, N, 0.005) + self.dict = dict(zip(range(N), itertools.repeat([0]))) - def time_sparse_series_from_coo(self): - self.ss = SparseSeries.from_coo(self.A) + def time_constructor(self): + SparseDataFrame(columns=self.arr, index=self.arr) + def time_from_scipy(self): + SparseDataFrame(self.sparse) -class sparse_series_to_coo(object): - goal_time = 0.2 + def time_from_dict(self): + SparseDataFrame(self.dict) - def setup(self): - self.s = pd.Series(([np.nan] * 10000)) - self.s[0] = 3.0 - self.s[100] = (-1.0) - self.s[999] = 12.1 - self.s.index = pd.MultiIndex.from_product((range(10), range(10), range(10), range(10))) - self.ss = self.s.to_sparse() - - def time_sparse_series_to_coo(self): - self.ss.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True) +class FromCoo(object): -class sparse_arithmetic_int(object): goal_time = 0.2 def setup(self): - np.random.seed(1) - self.a_10percent = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=np.nan) - self.b_10percent = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=np.nan) - - self.a_10percent_zero = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=0) - self.b_10percent_zero = self.make_sparse_array(length=1000000, dense_size=100000, fill_value=0) - - self.a_1percent = self.make_sparse_array(length=1000000, dense_size=10000, fill_value=np.nan) - self.b_1percent = self.make_sparse_array(length=1000000, dense_size=10000, fill_value=np.nan) - - def make_sparse_array(self, length, dense_size, fill_value): - arr = np.array([fill_value] * length, dtype=np.float64) - indexer = np.unique(np.random.randint(0, length, dense_size)) - arr[indexer] = np.random.randint(0, 100, len(indexer)) - return pd.SparseArray(arr, fill_value=fill_value) - - def time_sparse_make_union(self): - self.a_10percent.sp_index.make_union(self.b_10percent.sp_index) + self.matrix = scipy.sparse.coo_matrix(([3.0, 1.0, 2.0], + ([1, 0, 0], [0, 2, 3])), + shape=(100, 100)) - def time_sparse_intersect(self): - self.a_10percent.sp_index.intersect(self.b_10percent.sp_index) - - def time_sparse_addition_10percent(self): - self.a_10percent + self.b_10percent + def time_sparse_series_from_coo(self): + SparseSeries.from_coo(self.matrix) - def time_sparse_addition_10percent_zero(self): - self.a_10percent_zero + self.b_10percent_zero - def time_sparse_addition_1percent(self): - self.a_1percent + self.b_1percent +class ToCoo(object): - def time_sparse_division_10percent(self): - self.a_10percent / self.b_10percent + goal_time = 0.2 - def time_sparse_division_10percent_zero(self): - self.a_10percent_zero / self.b_10percent_zero + def setup(self): + s = Series([np.nan] * 10000) + s[0] = 3.0 + s[100] = -1.0 + s[999] = 12.1 + s.index = MultiIndex.from_product([range(10)] * 4) + self.ss = s.to_sparse() - def time_sparse_division_1percent(self): - self.a_1percent / self.b_1percent + def time_sparse_series_to_coo(self): + self.ss.to_coo(row_levels=[0, 1], + column_levels=[2, 3], + sort_labels=True) +class Arithmetic(object): -class sparse_arithmetic_block(object): goal_time = 0.2 + params = ([0.1, 0.01], [0, np.nan]) + param_names = ['dense_proportion', 'fill_value'] - def setup(self): - np.random.seed(1) - self.a = self.make_sparse_array(length=1000000, num_blocks=1000, - block_size=10, fill_value=np.nan) - self.b = self.make_sparse_array(length=1000000, 
num_blocks=1000, - block_size=10, fill_value=np.nan) - - self.a_zero = self.make_sparse_array(length=1000000, num_blocks=1000, - block_size=10, fill_value=0) - self.b_zero = self.make_sparse_array(length=1000000, num_blocks=1000, - block_size=10, fill_value=np.nan) + def setup(self, dense_proportion, fill_value): + N = 10**6 + arr1 = make_array(N, dense_proportion, fill_value, np.int64) + self.array1 = SparseArray(arr1, fill_value=fill_value) + arr2 = make_array(N, dense_proportion, fill_value, np.int64) + self.array2 = SparseArray(arr2, fill_value=fill_value) - def make_sparse_array(self, length, num_blocks, block_size, fill_value): - a = np.array([fill_value] * length) - for block in range(num_blocks): - i = np.random.randint(0, length) - a[i:i + block_size] = np.random.randint(0, 100, len(a[i:i + block_size])) - return pd.SparseArray(a, fill_value=fill_value) + def time_make_union(self, dense_proportion, fill_value): + self.array1.sp_index.make_union(self.array2.sp_index) - def time_sparse_make_union(self): - self.a.sp_index.make_union(self.b.sp_index) + def time_intersect(self, dense_proportion, fill_value): + self.array1.sp_index.intersect(self.array2.sp_index) - def time_sparse_intersect(self): - self.a.sp_index.intersect(self.b.sp_index) + def time_add(self, dense_proportion, fill_value): + self.array1 + self.array2 - def time_sparse_addition(self): - self.a + self.b + def time_divide(self, dense_proportion, fill_value): + self.array1 / self.array2 - def time_sparse_addition_zero(self): - self.a_zero + self.b_zero - def time_sparse_division(self): - self.a / self.b +class ArithmeticBlock(object): - def time_sparse_division_zero(self): - self.a_zero / self.b_zero + goal_time = 0.2 + params = [np.nan, 0] + param_names = ['fill_value'] + + def setup(self, fill_value): + N = 10**6 + self.arr1 = self.make_block_array(length=N, num_blocks=1000, + block_size=10, fill_value=fill_value) + self.arr2 = self.make_block_array(length=N, num_blocks=1000, + block_size=10, fill_value=fill_value) + + def make_block_array(self, length, num_blocks, block_size, fill_value): + arr = np.full(length, fill_value) + indices = np.random.choice(np.arange(0, length, block_size), + num_blocks, + replace=False) + for ind in indices: + arr[ind:ind + block_size] = np.random.randint(0, 100, block_size) + return SparseArray(arr, fill_value=fill_value) + + def time_make_union(self, fill_value): + self.arr1.sp_index.make_union(self.arr2.sp_index) + + def time_intersect(self, fill_value): + self.arr1.sp_index.intersect(self.arr2.sp_index) + + def time_addition(self, fill_value): + self.arr1 + self.arr2 + + def time_division(self, fill_value): + self.arr1 / self.arr2
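As a quick orientation to what the parametrised sparse constructor benchmark times, here is a self-contained sketch (make_array is copied verbatim from the new sparse.py above; the density print is illustrative only):

import numpy as np
from pandas import SparseArray

def make_array(size, dense_proportion, fill_value, dtype):
    # copied from asv_bench/benchmarks/sparse.py above
    dense_size = int(size * dense_proportion)
    arr = np.full(size, fill_value, dtype)
    indexer = np.random.choice(np.arange(size), dense_size, replace=False)
    arr[indexer] = np.random.choice(np.arange(100, dtype=dtype), dense_size)
    return arr

# a 1%-dense float array with NaN fill, i.e. params (0.01, np.nan, np.float64)
arr = make_array(10**6, 0.01, np.nan, np.float64)
sparr = SparseArray(arr, fill_value=np.nan, dtype=np.float64)  # the timed call
print(sparr.density)  # ~0.01: only the non-fill values are actually stored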
diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 1e1eb167b46bfc..c447c78d0d0703 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -1,205 +1,114 @@ -from .pandas_vb_common import * +import numpy as np +import pandas as pd +from .pandas_vb_common import setup # noqa -def _set_use_bottleneck_False(): - try: - pd.options.compute.use_bottleneck = False - except: - from pandas.core import nanops - nanops._USE_BOTTLENECK = False +ops = ['mean', 'sum', 'median', 'std', 'skew', 'kurt', 'mad', 'prod', 'sem', + 'var'] -class FrameOps(object): - goal_time = 0.2 - - param_names = ['op', 'use_bottleneck', 'dtype', 'axis'] - params = [['mean', 'sum', 'median'], - [True, False], - ['float', 'int'], - [0, 1]] - - def setup(self, op, use_bottleneck, dtype, axis): - if dtype == 'float': - self.df = DataFrame(np.random.randn(100000, 4)) - elif dtype == 'int': - self.df = DataFrame(np.random.randint(1000, size=(100000, 4))) - - if not use_bottleneck: - _set_use_bottleneck_False() - - self.func = getattr(self.df, op) - - def time_op(self, op, use_bottleneck, dtype, axis): - self.func(axis=axis) +class FrameOps(object): -class stat_ops_level_frame_sum(object): goal_time = 0.2 + params = [ops, ['float', 'int'], [0, 1], [True, False]] + param_names = ['op', 'dtype', 'axis', 'use_bottleneck'] - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) - - def time_stat_ops_level_frame_sum(self): - self.df.sum(level=1) - - -class stat_ops_level_frame_sum_multiple(object): - goal_time = 0.2 + def setup(self, op, dtype, axis, use_bottleneck): + df = pd.DataFrame(np.random.randn(100000, 4)).astype(dtype) + try: + pd.options.compute.use_bottleneck = use_bottleneck + except Exception: + # the compute.use_bottleneck option is not present in older pandas + from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck + self.df_func = getattr(df, op) - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + def time_op(self, op, dtype, axis, use_bottleneck): + self.df_func(axis=axis) - def time_stat_ops_level_frame_sum_multiple(self): - self.df.sum(level=[0, 1]) +class FrameMultiIndexOps(object): -class stat_ops_level_series_sum(object): goal_time = 0.2 + params = ([0, 1, [0, 1]], ops) + param_names = ['level', 'op'] - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) + def setup(self, level, op): + levels = [np.arange(10), np.arange(100), np.arange(100)] + labels = [np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)] + index = pd.MultiIndex(levels=levels, labels=labels) + df = pd.DataFrame(np.random.randn(len(index), 4), index=index) + self.df_func = getattr(df, op) - def time_stat_ops_level_series_sum(self): - self.df[1].sum(level=1) + def time_op(self, level, op): + self.df_func(level=level)
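+ # In miniature, the level-wise reduction timed above is (sketch only):
+ #   index = pd.MultiIndex.from_product([range(10), range(100)])
+ #   df = pd.DataFrame(np.random.randn(len(index), 4), index=index)
+ #   df.sum(level=0)   # reduce within each outer-level group -> 10 rows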
-class stat_ops_level_series_sum_multiple(object): - goal_time = 0.2 - - def setup(self): - self.index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], labels=[np.arange(10).repeat(10000), np.tile(np.arange(100).repeat(100), 10), np.tile(np.tile(np.arange(100), 100), 10)]) - random.shuffle(self.index.values) - self.df = DataFrame(np.random.randn(len(self.index), 4), index=self.index) - self.df_level = DataFrame(np.random.randn(100, 4), index=self.index.levels[1]) - - def time_stat_ops_level_series_sum_multiple(self): - self.df[1].sum(level=[0, 1]) +class SeriesOps(object): - -class stat_ops_series_std(object): goal_time = 0.2 + params = [ops, ['float', 'int'], [True, False]] + param_names = ['op', 'dtype', 'use_bottleneck'] - def setup(self): - self.s = Series(np.random.randn(100000), index=np.arange(100000)) - self.s[::2] = np.nan - - def time_stat_ops_series_std(self): - self.s.std() + def setup(self, op, dtype, use_bottleneck): + s = pd.Series(np.random.randn(100000)).astype(dtype) + try: + pd.options.compute.use_bottleneck = use_bottleneck + except Exception: + # the compute.use_bottleneck option is not present in older pandas + from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck + self.s_func = getattr(s, op) + def time_op(self, op, dtype, use_bottleneck): + self.s_func() -class stats_corr_spearman(object): - goal_time = 0.2 - def setup(self): - self.df = DataFrame(np.random.randn(1000, 30)) +class SeriesMultiIndexOps(object): - def time_stats_corr_spearman(self): - self.df.corr(method='spearman') - - -class stats_rank2d_axis0_average(object): goal_time = 0.2 + params = ([0, 1, [0, 1]], ops) + param_names = ['level', 'op'] - def setup(self): - self.df = DataFrame(np.random.randn(5000, 50)) - - def time_stats_rank2d_axis0_average(self): - self.df.rank() + def setup(self, level, op): + levels = [np.arange(10), np.arange(100), np.arange(100)] + labels = [np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)] + index = pd.MultiIndex(levels=levels, labels=labels) + s = pd.Series(np.random.randn(len(index)), index=index) + self.s_func = getattr(s, op) + def time_op(self, level, op): + self.s_func(level=level) -class stats_rank2d_axis1_average(object): - goal_time = 0.2 - - def setup(self): - self.df = DataFrame(np.random.randn(5000, 50)) - def time_stats_rank2d_axis1_average(self): - self.df.rank(1) +class Rank(object): - -class stats_rank_average(object): - goal_time = 0.2 - - def setup(self): - self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) - self.s = Series(self.values) - - def time_stats_rank_average(self): - self.s.rank() - - -class stats_rank_average_int(object): - goal_time = 0.2 - - def setup(self): - self.values = np.random.randint(0, 100000, size=200000) - self.s = Series(self.values) - - def time_stats_rank_average_int(self): - self.s.rank() - - -class stats_rank_pct_average(object): goal_time = 0.2 + params = [['DataFrame', 'Series'], [True, False]] + param_names = ['constructor', 'pct'] - def setup(self): - self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) - self.s = Series(self.values) + def setup(self, constructor, pct): + values = np.random.randn(10**5) + self.data = getattr(pd, constructor)(values) - def time_stats_rank_pct_average(self): - self.s.rank(pct=True) + def time_rank(self, constructor, pct): + self.data.rank(pct=pct) + def time_average_old(self, constructor, pct): + # the old way of computing a percentage rank: plain rank / length + self.data.rank(pct=pct) / len(self.data) -class stats_rank_pct_average_old(object): - goal_time = 0.2 - - def setup(self): - self.values = np.concatenate([np.arange(100000), np.random.randn(100000), np.arange(100000)]) - self.s = Series(self.values) - - def time_stats_rank_pct_average_old(self): - (self.s.rank() / len(self.s)) +class Correlation(object): -class stats_rolling_mean(object): goal_time = 0.2 + params = ['spearman', 'kendall', 'pearson'] + param_names = ['method'] - def setup(self): - self.arr = np.random.randn(100000) - self.win = 100 - - def time_rolling_mean(self): - rolling_mean(self.arr, self.win) - - def 
time_rolling_median(self): - rolling_median(self.arr, self.win) - - def time_rolling_min(self): - rolling_min(self.arr, self.win) - - def time_rolling_max(self): - rolling_max(self.arr, self.win) - - def time_rolling_sum(self): - rolling_sum(self.arr, self.win) - - def time_rolling_std(self): - rolling_std(self.arr, self.win) - - def time_rolling_var(self): - rolling_var(self.arr, self.win) - - def time_rolling_skew(self): - rolling_skew(self.arr, self.win) + def setup(self, method): + self.df = pd.DataFrame(np.random.randn(1000, 30)) - def time_rolling_kurt(self): - rolling_kurt(self.arr, self.win) + def time_corr(self, method): + self.df.corr(method=method) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index c1600d4e07f583..ccfac2f73f14d2 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -1,107 +1,170 @@ -from .pandas_vb_common import * -import string -import itertools as IT -import pandas.util.testing as testing +import warnings +import numpy as np +from pandas import Series, DataFrame +import pandas.util.testing as tm -class StringMethods(object): - goal_time = 0.2 - def make_series(self, letters, strlen, size): - return Series([str(x) for x in np.fromiter(IT.cycle(letters), count=(size * strlen), dtype='|S1').view('|S{}'.format(strlen))]) +class Methods(object): - def setup(self): - self.many = self.make_series(('matchthis' + string.ascii_uppercase), strlen=19, size=10000) - self.few = self.make_series(('matchthis' + (string.ascii_uppercase * 42)), strlen=19, size=10000) - self.s = self.make_series(string.ascii_uppercase, strlen=10, size=10000).str.join('|') + goal_time = 0.2 - def time_cat(self): - self.many.str.cat(sep=',') + def setup(self): + self.s = Series(tm.makeStringIndex(10**5)) def time_center(self): - self.many.str.center(100) - - def time_contains_few(self): - self.few.str.contains('matchthis') - - def time_contains_few_noregex(self): - self.few.str.contains('matchthis', regex=False) - - def time_contains_many(self): - self.many.str.contains('matchthis') - - def time_contains_many_noregex(self): - self.many.str.contains('matchthis', regex=False) + self.s.str.center(100) def time_count(self): - self.many.str.count('matchthis') + self.s.str.count('A') def time_endswith(self): - self.many.str.endswith('matchthis') + self.s.str.endswith('A') def time_extract(self): - self.many.str.extract('(\\w*)matchthis(\\w*)') + with warnings.catch_warnings(record=True): + self.s.str.extract('(\\w*)A(\\w*)') def time_findall(self): - self.many.str.findall('[A-Z]+') + self.s.str.findall('[A-Z]+') def time_get(self): - self.many.str.get(0) - - def time_join_split(self): - self.many.str.join('--').str.split('--') - - def time_join_split_expand(self): - self.many.str.join('--').str.split('--', expand=True) + self.s.str.get(0) def time_len(self): - self.many.str.len() + self.s.str.len() def time_match(self): - self.many.str.match('mat..this') + self.s.str.match('A') def time_pad(self): - self.many.str.pad(100, side='both') - - def time_repeat(self): - self.many.str.repeat(list(IT.islice(IT.cycle(range(1, 4)), len(self.many)))) + self.s.str.pad(100, side='both') def time_replace(self): - self.many.str.replace('(matchthis)', '\x01\x01') + self.s.str.replace('A', '\x01\x01') def time_slice(self): - self.many.str.slice(5, 15, 2) + self.s.str.slice(5, 15, 2) def time_startswith(self): - self.many.str.startswith('matchthis') + self.s.str.startswith('A') def time_strip(self): - self.many.str.strip('matchthis') + self.s.str.strip('A') 
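+ # Tip: any of these benchmarks can be smoke-tested in the current
+ # environment with airspeed velocity (assuming asv is installed), e.g.
+ # `asv dev --bench strings.Methods.time_strip`, which runs setup() and
+ # the timed method once without building a fresh environment.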
def time_rstrip(self): - self.many.str.rstrip('matchthis') + self.s.str.rstrip('A') def time_lstrip(self): - self.many.str.lstrip('matchthis') + self.s.str.lstrip('A') def time_title(self): - self.many.str.title() + self.s.str.title() def time_upper(self): - self.many.str.upper() + self.s.str.upper() def time_lower(self): - self.many.str.lower() + self.s.str.lower() + + +class Repeat(object): + + goal_time = 0.2 + params = ['int', 'array'] + param_names = ['repeats'] + + def setup(self, repeats): + N = 10**5 + self.s = Series(tm.makeStringIndex(N)) + repeat = {'int': 1, 'array': np.random.randint(1, 3, N)} + self.repeat = repeat[repeats] + + def time_repeat(self, repeats): + self.s.str.repeat(self.repeat) + + +class Cat(object): + + goal_time = 0.2 + params = ([0, 3], [None, ','], [None, '-'], [0.0, 0.001, 0.15]) + param_names = ['other_cols', 'sep', 'na_rep', 'na_frac'] + + def setup(self, other_cols, sep, na_rep, na_frac): + N = 10 ** 5 + mask_gen = lambda: np.random.choice([True, False], N, + p=[1 - na_frac, na_frac]) + self.s = Series(tm.makeStringIndex(N)).where(mask_gen()) + if other_cols == 0: + # str.cat self-concatenates only for others=None + self.others = None + else: + self.others = DataFrame({i: tm.makeStringIndex(N).where(mask_gen()) + for i in range(other_cols)}) + + def time_cat(self, other_cols, sep, na_rep, na_frac): + # before the concatenation (one caller + other_cols columns), the total + # expected fraction of rows containing any NaN is: + # reduce(lambda t, _: t + (1 - t) * na_frac, range(other_cols + 1), 0) + # for other_cols=3 and na_frac=0.15, this works out to ~48% + self.s.str.cat(others=self.others, sep=sep, na_rep=na_rep) + + +class Contains(object): + + goal_time = 0.2 + params = [True, False] + param_names = ['regex'] + + def setup(self, regex): + self.s = Series(tm.makeStringIndex(10**5)) + + def time_contains(self, regex): + self.s.str.contains('A', regex=regex) + + +class Split(object): + + goal_time = 0.2 + params = [True, False] + param_names = ['expand'] + + def setup(self, expand): + self.s = Series(tm.makeStringIndex(10**5)).str.join('--') + + def time_split(self, expand): + self.s.str.split('--', expand=expand) + + +class Dummies(object): + + goal_time = 0.2 + + def setup(self): + self.s = Series(tm.makeStringIndex(10**5)).str.join('|') def time_get_dummies(self): self.s.str.get_dummies('|') -class StringEncode(object): +class Encode(object): + goal_time = 0.2 def setup(self): - self.ser = Series(testing.makeUnicodeIndex()) + self.ser = Series(tm.makeUnicodeIndex()) def time_encode_decode(self): self.ser.str.encode('utf-8').str.decode('utf-8') + + +class Slice(object): + + goal_time = 0.2 + + def setup(self): + self.s = Series(['abcdefg', np.nan] * 500000) + + def time_vector_slice(self): + # GH 2602 + self.s.str[:5] diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index c112d1ef72eb80..3fe75b3c34299d 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -1,42 +1,129 @@ -from .pandas_vb_common import * -from pandas import to_timedelta, Timestamp +import datetime +import numpy as np +from pandas import Series, timedelta_range, to_timedelta, Timestamp, Timedelta + + +class TimedeltaConstructor(object): -class ToTimedelta(object): goal_time = 0.2 - def setup(self): - self.arr = np.random.randint(0, 1000, size=10000) - self.arr2 = ['{0} days'.format(i) for i in self.arr] + def time_from_int(self): + Timedelta(123456789) + + def time_from_unit(self): + Timedelta(1, unit='d') + + def 
time_from_components(self): + Timedelta(days=1, hours=2, minutes=3, seconds=4, milliseconds=5, + microseconds=6, nanoseconds=7) + + def time_from_datetime_timedelta(self): + Timedelta(datetime.timedelta(days=1, seconds=1)) + + def time_from_np_timedelta(self): + Timedelta(np.timedelta64(1, 'ms')) + + def time_from_string(self): + Timedelta('1 days') + + def time_from_iso_format(self): + Timedelta('P4DT12H30M5S') - self.arr3 = np.random.randint(0, 60, size=10000) - self.arr3 = ['00:00:{0:02d}'.format(i) for i in self.arr3] + def time_from_missing(self): + Timedelta('nat') - self.arr4 = list(self.arr2) - self.arr4[-1] = 'apple' + +class ToTimedelta(object): + + goal_time = 0.2 + + def setup(self): + self.ints = np.random.randint(0, 60, size=10000) + self.str_days = [] + self.str_seconds = [] + for i in self.ints: + self.str_days.append('{0} days'.format(i)) + self.str_seconds.append('00:00:{0:02d}'.format(i)) def time_convert_int(self): - to_timedelta(self.arr, unit='s') + to_timedelta(self.ints, unit='s') - def time_convert_string(self): - to_timedelta(self.arr2) + def time_convert_string_days(self): + to_timedelta(self.str_days) def time_convert_string_seconds(self): - to_timedelta(self.arr3) + to_timedelta(self.str_seconds) + + +class ToTimedeltaErrors(object): + + goal_time = 0.2 + params = ['coerce', 'ignore'] + param_names = ['errors'] - def time_convert_coerce(self): - to_timedelta(self.arr4, errors='coerce') + def setup(self, errors): + ints = np.random.randint(0, 60, size=10000) + self.arr = ['{0} days'.format(i) for i in ints] + self.arr[-1] = 'apple' - def time_convert_ignore(self): - to_timedelta(self.arr4, errors='ignore') + def time_convert(self, errors): + to_timedelta(self.arr, errors=errors) -class Ops(object): +class TimedeltaOps(object): + goal_time = 0.2 def setup(self): self.td = to_timedelta(np.arange(1000000)) self.ts = Timestamp('2000') - def test_add_td_ts(self): + def time_add_td_ts(self): self.td + self.ts + + +class TimedeltaProperties(object): + + goal_time = 0.2 + + def setup_cache(self): + td = Timedelta(days=365, minutes=35, seconds=25, milliseconds=35) + return td + + def time_timedelta_days(self, td): + td.days + + def time_timedelta_seconds(self, td): + td.seconds + + def time_timedelta_microseconds(self, td): + td.microseconds + + def time_timedelta_nanoseconds(self, td): + td.nanoseconds + + +class DatetimeAccessor(object): + + goal_time = 0.2 + + def setup_cache(self): + N = 100000 + series = Series(timedelta_range('1 days', periods=N, freq='h')) + return series + + def time_dt_accessor(self, series): + series.dt + + def time_timedelta_days(self, series): + series.dt.days + + def time_timedelta_seconds(self, series): + series.dt.seconds + + def time_timedelta_microseconds(self, series): + series.dt.microseconds + + def time_timedelta_nanoseconds(self, series): + series.dt.nanoseconds diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index 779fc0bd20964a..2c98cc16595199 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -1,364 +1,331 @@ +import warnings +from datetime import timedelta + +import numpy as np +from pandas import to_datetime, date_range, Series, DataFrame, period_range +from pandas.tseries.frequencies import infer_freq try: from pandas.plotting._converter import DatetimeConverter except ImportError: from pandas.tseries.converter import DatetimeConverter -from .pandas_vb_common import * -import pandas as pd -import datetime as dt -try: - import pandas.tseries.holiday 
-except ImportError: - pass -from pandas.tseries.frequencies import infer_freq -import numpy as np -if hasattr(Series, 'convert'): - Series.resample = Series.convert +from .pandas_vb_common import setup # noqa class DatetimeIndex(object): + goal_time = 0.2 + params = ['dst', 'repeated', 'tz_aware', 'tz_naive'] + param_names = ['index_type'] - def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() + def setup(self, index_type): + N = 100000 + dtidxes = {'dst': date_range(start='10/29/2000 1:00:00', + end='10/29/2000 1:59:59', freq='S'), + 'repeated': date_range(start='2000', + periods=N / 10, + freq='s').repeat(10), + 'tz_aware': date_range(start='2000', + periods=N, + freq='s', + tz='US/Eastern'), + 'tz_naive': date_range(start='2000', + periods=N, + freq='s')} + self.index = dtidxes[index_type] - self.rng2 = date_range(start='1/1/2000 9:30', periods=10000, freq='S', tz='US/Eastern') + def time_add_timedelta(self, index_type): + self.index + timedelta(minutes=2) - self.index_repeated = date_range(start='1/1/2000', periods=1000, freq='T').repeat(10) + def time_normalize(self, index_type): + self.index.normalize() - self.rng3 = date_range(start='1/1/2000', periods=1000, freq='H') - self.df = DataFrame(np.random.randn(len(self.rng3), 2), self.rng3) + def time_unique(self, index_type): + self.index.unique() - self.rng4 = date_range(start='1/1/2000', periods=1000, freq='H', tz='US/Eastern') - self.df2 = DataFrame(np.random.randn(len(self.rng4), 2), index=self.rng4) + def time_to_time(self, index_type): + self.index.time - N = 100000 - self.dti = pd.date_range('2011-01-01', freq='H', periods=N).repeat(5) - self.dti_tz = pd.date_range('2011-01-01', freq='H', periods=N, - tz='Asia/Tokyo').repeat(5) + def time_get(self, index_type): + self.index[0] - self.rng5 = date_range(start='1/1/2000', end='3/1/2000', tz='US/Eastern') + def time_timeseries_is_month_start(self, index_type): + self.index.is_month_start - self.dst_rng = date_range(start='10/29/2000 1:00:00', end='10/29/2000 1:59:59', freq='S') - self.index = date_range(start='10/29/2000', end='10/29/2000 00:59:59', freq='S') - self.index = self.index.append(self.dst_rng) - self.index = self.index.append(self.dst_rng) - self.index = self.index.append(date_range(start='10/29/2000 2:00:00', end='10/29/2000 3:00:00', freq='S')) + def time_to_date(self, index_type): + self.index.date - self.N = 10000 - self.rng6 = date_range(start='1/1/1', periods=self.N, freq='B') + def time_to_pydatetime(self, index_type): + self.index.to_pydatetime() - self.rng7 = date_range(start='1/1/1700', freq='D', periods=100000) - self.no_freq = self.rng7[:50000].append(self.rng7[50002:]) - self.d_freq = self.rng7[:50000].append(self.rng7[50000:]) - self.rng8 = date_range(start='1/1/1700', freq='B', periods=75000) - self.b_freq = self.rng8[:50000].append(self.rng8[50000:]) +class TzLocalize(object): - def time_add_timedelta(self): - (self.rng + dt.timedelta(minutes=2)) + goal_time = 0.2 + + def setup(self): + dst_rng = date_range(start='10/29/2000 1:00:00', + end='10/29/2000 1:59:59', freq='S') + self.index = date_range(start='10/29/2000', + end='10/29/2000 00:59:59', freq='S') + self.index = self.index.append(dst_rng) + self.index = self.index.append(dst_rng) + self.index = self.index.append(date_range(start='10/29/2000 2:00:00', + end='10/29/2000 3:00:00', + freq='S')) - def 
time_add_offset_delta(self): - (self.rng + self.delta_offset) + def time_infer_dst(self): + self.index.tz_localize('US/Eastern', ambiguous='infer') - def time_add_offset_fast(self): - (self.rng + self.fast_offset) - def time_add_offset_slow(self): - (self.rng + self.slow_offset) +class ResetIndex(object): - def time_normalize(self): - self.rng2.normalize() + goal_time = 0.2 + params = [None, 'US/Eastern'] + param_names = ['tz'] - def time_unique(self): - self.index_repeated.unique() + def setup(self, tz): + idx = date_range(start='1/1/2000', periods=1000, freq='H', tz=tz) + self.df = DataFrame(np.random.randn(1000, 2), index=idx) - def time_reset_index(self): + def time_reset_datetimeindex(self, tz): self.df.reset_index() - def time_reset_index_tz(self): - self.df2.reset_index() - def time_dti_factorize(self): +class Factorize(object): - def time_dti_tz_factorize(self): + goal_time = 0.2 + params = [None, 'Asia/Tokyo'] + param_names = ['tz'] - def time_timestamp_tzinfo_cons(self): - self.rng5[0] + def setup(self, tz): + N = 100000 + self.dti = date_range('2011-01-01', freq='H', periods=N, tz=tz) + self.dti = self.dti.repeat(5) - def time_infer_dst(self): - self.index.tz_localize('US/Eastern', infer_dst=True) + def time_factorize(self, tz): + self.dti.factorize() - def time_timeseries_is_month_start(self): - self.rng6.is_month_start - def time_infer_freq_none(self): - infer_freq(self.no_freq) +class InferFreq(object): - def time_infer_freq_daily(self): - infer_freq(self.d_freq) + goal_time = 0.2 + params = [None, 'D', 'B'] + param_names = ['freq'] + + def setup(self, freq): + if freq is None: + self.idx = date_range(start='1/1/1700', freq='D', periods=10000) + self.idx.freq = None + else: + self.idx = date_range(start='1/1/1700', freq=freq, periods=10000) - def time_infer_freq_business(self): - infer_freq(self.b_freq) + def time_infer_freq(self, freq): + infer_freq(self.idx)
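+ # For reference, what infer_freq returns on these inputs (sketch only):
+ #   idx = date_range(start='1/1/1700', freq='D', periods=100)
+ #   idx.freq = None                        # force actual inference
+ #   infer_freq(idx)                        # -> 'D'
+ #   infer_freq(idx[:50].append(idx[52:]))  # gaps, like the old no_freq case -> None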
class TimeDatetimeConverter(object): + goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') + N = 100000 + self.rng = date_range(start='1/1/2000', periods=N, freq='T') def time_convert(self): DatetimeConverter.convert(self.rng, None, None) class Iteration(object): - goal_time = 0.2 - def setup(self): - self.N = 1000000 - self.M = 10000 - self.idx1 = date_range(start='20140101', freq='T', periods=self.N) - self.idx2 = period_range(start='20140101', freq='T', periods=self.N) - - def iter_n(self, iterable, n=None): - self.i = 0 - for _ in iterable: - self.i += 1 - if ((n is not None) and (self.i > n)): - break - - def time_iter_datetimeindex(self): - self.iter_n(self.idx1) - - def time_iter_datetimeindex_preexit(self): - self.iter_n(self.idx1, self.M) + goal_time = 0.2 + params = [date_range, period_range] + param_names = ['time_index'] - def time_iter_periodindex(self): - self.iter_n(self.idx2) + def setup(self, time_index): + N = 10**6 + self.idx = time_index(start='20140101', freq='T', periods=N) + self.exit = 10000 - def time_iter_periodindex_preexit(self): - self.iter_n(self.idx2, self.M) + def time_iter(self, time_index): + for _ in self.idx: + pass + def time_iter_preexit(self, time_index): + for i, _ in enumerate(self.idx): + if i > self.exit: + break -#---------------------------------------------------------------------- -# Resampling class ResampleDataFrame(object): - goal_time = 0.2 - - def setup(self): - self.rng = date_range(start='20130101', periods=100000, freq='50L') - self.df = DataFrame(np.random.randn(100000, 2), index=self.rng) - def time_max_numpy(self): - self.df.resample('1s', how=np.max) - - def time_max_string(self): - self.df.resample('1s', how='max') - - def time_mean_numpy(self): - self.df.resample('1s', how=np.mean) - - def time_mean_string(self): - self.df.resample('1s', how='mean') + goal_time = 0.2 + params = ['max', 'mean', 'min'] + param_names = ['method'] - def time_min_numpy(self): - self.df.resample('1s', how=np.min) + def setup(self, method): + rng = date_range(start='20130101', periods=100000, freq='50L') + df = DataFrame(np.random.randn(100000, 2), index=rng) + self.resample = getattr(df.resample('1s'), method) - def time_min_string(self): - self.df.resample('1s', how='min') + def time_method(self, method): + self.resample() class ResampleSeries(object): + + goal_time = 0.2 + params = (['period', 'datetime'], ['5min', '1D'], ['mean', 'ohlc']) + param_names = ['index', 'freq', 'method'] + + def setup(self, index, freq, method): + indexes = {'period': period_range(start='1/1/2000', + end='1/1/2001', + freq='T'), + 'datetime': date_range(start='1/1/2000', + end='1/1/2001', + freq='T')} + idx = indexes[index] + ts = Series(np.random.randn(len(idx)), index=idx) + self.resample = getattr(ts.resample(freq), method) + + def time_resample(self, index, freq, method): + self.resample() + + +class ResampleDatetime64(object): + # GH 7754 goal_time = 0.2 def setup(self): - self.rng1 = period_range(start='1/1/2000', end='1/1/2001', freq='T') - self.ts1 = Series(np.random.randn(len(self.rng1)), index=self.rng1) - - self.rng2 = date_range(start='1/1/2000', end='1/1/2001', freq='T') - self.ts2 = Series(np.random.randn(len(self.rng2)), index=self.rng2) + rng3 = date_range(start='2000-01-01 00:00:00', + end='2000-01-01 10:00:00', freq='555000U') + self.dt_ts = Series(5, rng3, dtype='datetime64[ns]') - self.rng3 = date_range(start='2000-01-01 00:00:00', end='2000-01-01 10:00:00', freq='555000U') - self.int_ts = Series(5, self.rng3, dtype='int64') - self.dt_ts = self.int_ts.astype('datetime64[ns]') - - def time_period_downsample_mean(self): - self.ts1.resample('D', how='mean') - - def time_timestamp_downsample_mean(self): - self.ts2.resample('D', how='mean') - - def time_resample_datetime64(self): - # GH 7754 - self.dt_ts.resample('1S', how='last') - - def time_1min_5min_mean(self): - self.ts2[:10000].resample('5min', how='mean') - - def time_1min_5min_ohlc(self): - self.ts2[:10000].resample('5min', how='ohlc') + def time_resample(self): - self.dt_ts.resample('1S', how='last') + self.dt_ts.resample('1S').last() class AsOf(object): - goal_time = 0.2 - def setup(self): - self.N = 10000 - self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.ts = Series(np.random.randn(self.N), index=self.rng) - self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') + goal_time = 0.2 + params = ['DataFrame', 'Series'] + param_names = ['constructor'] + + def setup(self, constructor): + N = 10000 + M = 10 + rng = date_range(start='1/1/1990', periods=N, freq='53s') + data = {'DataFrame': DataFrame(np.random.randn(N, M)), + 'Series': Series(np.random.randn(N))} + self.ts = data[constructor] + self.ts.index = rng self.ts2 = self.ts.copy() - self.ts2[250:5000] = np.nan + self.ts2.iloc[250:5000] = np.nan self.ts3 = self.ts.copy() - self.ts3[-5000:] = np.nan + self.ts3.iloc[-5000:] = np.nan + self.dates = date_range(start='1/1/1990', periods=N * 10, freq='5s') + self.date = self.dates[0] + self.date_last = self.dates[-1] + self.date_early = self.date - timedelta(10) # test speed of 
pre-computing NAs. - def time_asof(self): + def time_asof(self, constructor): self.ts.asof(self.dates) # should be roughly the same as above. - def time_asof_nan(self): + def time_asof_nan(self, constructor): self.ts2.asof(self.dates) # test speed of the code path for a scalar index # without *while* loop - def time_asof_single(self): - self.ts.asof(self.dates[0]) + def time_asof_single(self, constructor): + self.ts.asof(self.date) # test speed of the code path for a scalar index # before the start. should be the same as above. - def time_asof_single_early(self): - self.ts.asof(self.dates[0] - dt.timedelta(10)) + def time_asof_single_early(self, constructor): + self.ts.asof(self.date_early) # test the speed of the code path for a scalar index # with a long *while* loop. should still be much # faster than pre-computing all the NAs. - def time_asof_nan_single(self): - self.ts3.asof(self.dates[-1]) + def time_asof_nan_single(self, constructor): + self.ts3.asof(self.date_last) -class AsOfDataFrame(object): - goal_time = 0.2 +class SortIndex(object): - def setup(self): - self.N = 10000 - self.M = 100 - self.rng = date_range(start='1/1/1990', periods=self.N, freq='53s') - self.dates = date_range(start='1/1/1990', periods=(self.N * 10), freq='5s') - self.ts = DataFrame(np.random.randn(self.N, self.M), index=self.rng) - self.ts2 = self.ts.copy() - self.ts2.iloc[250:5000] = np.nan - self.ts3 = self.ts.copy() - self.ts3.iloc[-5000:] = np.nan - - # test speed of pre-computing NAs. - def time_asof(self): - self.ts.asof(self.dates) + goal_time = 0.2 + params = [True, False] + param_names = ['monotonic'] - # should be roughly the same as above. - def time_asof_nan(self): - self.ts2.asof(self.dates) + def setup(self, monotonic): + N = 10**5 + idx = date_range(start='1/1/2000', periods=N, freq='s') + self.s = Series(np.random.randn(N), index=idx) + if not monotonic: + self.s = self.s.sample(frac=1) - # test speed of the code path for a scalar index - # with pre-computing all NAs. - def time_asof_single(self): - self.ts.asof(self.dates[0]) + def time_sort_index(self, monotonic): + self.s.sort_index() - # should be roughly the same as above. - def time_asof_nan_single(self): - self.ts3.asof(self.dates[-1]) + def time_get_slice(self, monotonic): + self.s[:10000] - # test speed of the code path for a scalar index - # before the start. should be without the cost of - # pre-computing all the NAs. 
- def time_asof_single_early(self): - self.ts.asof(self.dates[0] - dt.timedelta(10)) +class IrregularOps(object): -class TimeSeries(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='s') - self.rng = self.rng.take(np.random.permutation(self.N)) - self.ts = Series(np.random.randn(self.N), index=self.rng) - - self.rng2 = date_range(start='1/1/2000', periods=self.N, freq='T') - self.ts2 = Series(np.random.randn(self.N), index=self.rng2) + N = 10**5 + idx = date_range(start='1/1/2000', periods=N, freq='s') + s = Series(np.random.randn(N), index=idx) + self.left = s.sample(frac=1) + self.right = s.sample(frac=1) - self.lindex = np.random.permutation(self.N)[:(self.N // 2)] - self.rindex = np.random.permutation(self.N)[:(self.N // 2)] - self.left = Series(self.ts2.values.take(self.lindex), index=self.ts2.index.take(self.lindex)) - self.right = Series(self.ts2.values.take(self.rindex), index=self.ts2.index.take(self.rindex)) + def time_add(self): + self.left + self.right - self.rng3 = date_range(start='1/1/2000', periods=1500000, freq='S') - self.ts3 = Series(1, index=self.rng3) - def time_sort_index_monotonic(self): - self.ts2.sort_index() +class Lookup(object): - def time_sort_index_non_monotonic(self): - self.ts.sort_index() + goal_time = 0.2 - def time_timeseries_slice_minutely(self): - self.ts2[:10000] + def setup(self): + N = 1500000 + rng = date_range(start='1/1/2000', periods=N, freq='S') + self.ts = Series(1, index=rng) + self.lookup_val = rng[N // 2] - def time_add_irregular(self): - (self.left + self.right) + def time_lookup_and_cleanup(self): + self.ts[self.lookup_val] + self.ts.index._cleanup() - def time_large_lookup_value(self): - self.ts3[self.ts3.index[(len(self.ts3) // 2)]] - self.ts3.index._cleanup() +class ToDatetimeYYYYMMDD(object): -class SeriesArithmetic(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.s = Series(date_range(start='20140101', freq='T', periods=self.N)) - self.delta_offset = pd.offsets.Day() - self.fast_offset = pd.offsets.DateOffset(months=2, days=2) - self.slow_offset = pd.offsets.BusinessDay() - - def time_add_offset_delta(self): - (self.s + self.delta_offset) + rng = date_range(start='1/1/2000', periods=10000, freq='D') + self.stringsD = Series(rng.strftime('%Y%m%d')) - def time_add_offset_fast(self): - (self.s + self.fast_offset) + def time_format_YYYYMMDD(self): + to_datetime(self.stringsD, format='%Y%m%d') - def time_add_offset_slow(self): - (self.s + self.slow_offset) +class ToDatetimeISO8601(object): -class ToDatetime(object): goal_time = 0.2 def setup(self): - self.rng = date_range(start='1/1/2000', periods=10000, freq='D') - self.stringsD = Series((((self.rng.year * 10000) + (self.rng.month * 100)) + self.rng.day), dtype=np.int64).apply(str) - - self.rng = date_range(start='1/1/2000', periods=20000, freq='H') - self.strings = [x.strftime('%Y-%m-%d %H:%M:%S') for x in self.rng] - self.strings_nosep = [x.strftime('%Y%m%d %H:%M:%S') for x in self.rng] + rng = date_range(start='1/1/2000', periods=20000, freq='H') + self.strings = rng.strftime('%Y-%m-%d %H:%M:%S').tolist() + self.strings_nosep = rng.strftime('%Y%m%d %H:%M:%S').tolist() self.strings_tz_space = [x.strftime('%Y-%m-%d %H:%M:%S') + ' -0800' - for x in self.rng] - - self.s = Series((['19MAY11', '19MAY11:00:00:00'] * 100000)) - self.s2 = self.s.str.replace(':\\S+$', '') - - def time_format_YYYYMMDD(self): - to_datetime(self.stringsD, format='%Y%m%d') + for x in rng] def time_iso8601(self): 
to_datetime(self.strings) @@ -375,149 +342,75 @@ def time_iso8601_format_no_sep(self): def time_iso8601_tz_spaceformat(self): to_datetime(self.strings_tz_space) - def time_format_exact(self): - to_datetime(self.s2, format='%d%b%y') - - def time_format_no_exact(self): - to_datetime(self.s, format='%d%b%y', exact=False) +class ToDatetimeNONISO8601(object): -class Offsets(object): goal_time = 0.2 def setup(self): - self.date = dt.datetime(2011, 1, 1) - self.dt64 = np.datetime64('2011-01-01 09:00Z') - self.hcal = pd.tseries.holiday.USFederalHolidayCalendar() - self.day = pd.offsets.Day() - self.year = pd.offsets.YearBegin() - self.cday = pd.offsets.CustomBusinessDay() - self.cmb = pd.offsets.CustomBusinessMonthBegin(calendar=self.hcal) - self.cme = pd.offsets.CustomBusinessMonthEnd(calendar=self.hcal) - self.cdayh = pd.offsets.CustomBusinessDay(calendar=self.hcal) + N = 10000 + half = int(N / 2) + ts_string_1 = 'March 1, 2018 12:00:00+0400' + ts_string_2 = 'March 1, 2018 12:00:00+0500' + self.same_offset = [ts_string_1] * N + self.diff_offset = [ts_string_1] * half + [ts_string_2] * half - def time_timeseries_day_apply(self): - self.day.apply(self.date) + def time_same_offset(self): + to_datetime(self.same_offset) - def time_timeseries_day_incr(self): - (self.date + self.day) + def time_different_offset(self): + to_datetime(self.diff_offset) - def time_timeseries_year_apply(self): - self.year.apply(self.date) - def time_timeseries_year_incr(self): - (self.date + self.year) +class ToDatetimeFormat(object): - # custom business offsets - - def time_custom_bday_decr(self): - (self.date - self.cday) - - def time_custom_bday_incr(self): - (self.date + self.cday) - - def time_custom_bday_apply(self): - self.cday.apply(self.date) - - def time_custom_bday_apply_dt64(self): - self.cday.apply(self.dt64) - - def time_custom_bday_cal_incr(self): - self.date + 1 * self.cdayh - - def time_custom_bday_cal_decr(self): - self.date - 1 * self.cdayh - - def time_custom_bday_cal_incr_n(self): - self.date + 10 * self.cdayh - - def time_custom_bday_cal_incr_neg_n(self): - self.date - 10 * self.cdayh - - # Increment custom business month - - def time_custom_bmonthend_incr(self): - (self.date + self.cme) - - def time_custom_bmonthend_incr_n(self): - (self.date + (10 * self.cme)) - - def time_custom_bmonthend_decr_n(self): - (self.date - (10 * self.cme)) - - def time_custom_bmonthbegin_decr_n(self): - (self.date - (10 * self.cmb)) - - def time_custom_bmonthbegin_incr_n(self): - (self.date + (10 * self.cmb)) - - -class SemiMonthOffset(object): goal_time = 0.2 def setup(self): - self.N = 100000 - self.rng = date_range(start='1/1/2000', periods=self.N, freq='T') - # date is not on an offset which will be slowest case - self.date = dt.datetime(2011, 1, 2) - self.semi_month_end = pd.offsets.SemiMonthEnd() - self.semi_month_begin = pd.offsets.SemiMonthBegin() - - def time_end_apply(self): - self.semi_month_end.apply(self.date) - - def time_end_incr(self): - self.date + self.semi_month_end - - def time_end_incr_n(self): - self.date + 10 * self.semi_month_end - - def time_end_decr(self): - self.date - self.semi_month_end - - def time_end_decr_n(self): - self.date - 10 * self.semi_month_end + self.s = Series(['19MAY11', '19MAY11:00:00:00'] * 100000) + self.s2 = self.s.str.replace(':\\S+$', '') - def time_end_apply_index(self): - self.semi_month_end.apply_index(self.rng) + def time_exact(self): + to_datetime(self.s2, format='%d%b%y') - def time_end_incr_rng(self): - self.rng + self.semi_month_end + def time_no_exact(self): + 
to_datetime(self.s, format='%d%b%y', exact=False) - def time_end_decr_rng(self): - self.rng - self.semi_month_end - def time_begin_apply(self): - self.semi_month_begin.apply(self.date) +class ToDatetimeCache(object): - def time_begin_incr(self): - self.date + self.semi_month_begin + goal_time = 0.2 + params = [True, False] + param_names = ['cache'] - def time_begin_incr_n(self): - self.date + 10 * self.semi_month_begin + def setup(self, cache): + N = 10000 + self.unique_numeric_seconds = list(range(N)) + self.dup_numeric_seconds = [1000] * N + self.dup_string_dates = ['2000-02-11'] * N + self.dup_string_with_tz = ['2000-02-11 15:00:00-0800'] * N - def time_begin_decr(self): - self.date - self.semi_month_begin + def time_unique_seconds_and_unit(self, cache): + to_datetime(self.unique_numeric_seconds, unit='s', cache=cache) - def time_begin_decr_n(self): - self.date - 10 * self.semi_month_begin + def time_dup_seconds_and_unit(self, cache): + to_datetime(self.dup_numeric_seconds, unit='s', cache=cache) - def time_begin_apply_index(self): - self.semi_month_begin.apply_index(self.rng) + def time_dup_string_dates(self, cache): + to_datetime(self.dup_string_dates, cache=cache) - def time_begin_incr_rng(self): - self.rng + self.semi_month_begin + def time_dup_string_dates_and_format(self, cache): + to_datetime(self.dup_string_dates, format='%Y-%m-%d', cache=cache) - def time_begin_decr_rng(self): - self.rng - self.semi_month_begin + def time_dup_string_tzoffset_dates(self, cache): + to_datetime(self.dup_string_with_tz, cache=cache) class DatetimeAccessor(object): + def setup(self): - self.N = 100000 - self.series = pd.Series( - pd.date_range(start='1/1/2000', periods=self.N, freq='T') - ) + N = 100000 + self.series = Series(date_range(start='1/1/2000', periods=N, freq='T')) def time_dt_accessor(self): self.series.dt diff --git a/asv_bench/benchmarks/timestamp.py b/asv_bench/benchmarks/timestamp.py index e8cb4c9d1c75bd..c142a9b59fc43f 100644 --- a/asv_bench/benchmarks/timestamp.py +++ b/asv_bench/benchmarks/timestamp.py @@ -1,89 +1,119 @@ -from .pandas_vb_common import * -from pandas import to_timedelta, Timestamp -import pytz import datetime +from pandas import Timestamp +import pytz + + +class TimestampConstruction(object): + + def time_parse_iso8601_no_tz(self): + Timestamp('2017-08-25 08:16:14') + + def time_parse_iso8601_tz(self): + Timestamp('2017-08-25 08:16:14-0500') + + def time_parse_dateutil(self): + Timestamp('2017/08/25 08:16:14 AM') + + def time_parse_today(self): + Timestamp('today') + + def time_parse_now(self): + Timestamp('now') + + def time_fromordinal(self): + Timestamp.fromordinal(730120) + + def time_fromtimestamp(self): + Timestamp.fromtimestamp(1515448538) + class TimestampProperties(object): goal_time = 0.2 - def setup(self): - self.ts = Timestamp('2017-08-25 08:16:14') + _tzs = [None, pytz.timezone('Europe/Amsterdam')] + _freqs = [None, 'B'] + params = [_tzs, _freqs] + param_names = ['tz', 'freq'] - def time_tz(self): - self.ts.tz + def setup(self, tz, freq): + self.ts = Timestamp('2017-08-25 08:16:14', tzinfo=tz, freq=freq) - def time_offset(self): - self.ts.offset + def time_tz(self, tz, freq): + self.ts.tz - def time_dayofweek(self): + def time_dayofweek(self, tz, freq): self.ts.dayofweek - def time_weekday_name(self): + def time_weekday_name(self, tz, freq): self.ts.weekday_name - def time_dayofyear(self): + def time_dayofyear(self, tz, freq): self.ts.dayofyear - def time_week(self): + def time_week(self, tz, freq): self.ts.week - def time_quarter(self): + def 
time_quarter(self, tz, freq): self.ts.quarter - def time_days_in_month(self): + def time_days_in_month(self, tz, freq): self.ts.days_in_month - def time_freqstr(self): + def time_freqstr(self, tz, freq): self.ts.freqstr - def time_is_month_start(self): + def time_is_month_start(self, tz, freq): self.ts.is_month_start - def time_is_month_end(self): + def time_is_month_end(self, tz, freq): self.ts.is_month_end - def time_is_quarter_start(self): + def time_is_quarter_start(self, tz, freq): self.ts.is_quarter_start - def time_is_quarter_end(self): + def time_is_quarter_end(self, tz, freq): self.ts.is_quarter_end - def time_is_year_start(self): + def time_is_year_start(self, tz, freq): self.ts.is_year_start - def time_is_year_end(self): + def time_is_year_end(self, tz, freq): self.ts.is_year_end - def time_is_leap_year(self): + def time_is_leap_year(self, tz, freq): self.ts.is_leap_year - def time_microsecond(self): + def time_microsecond(self, tz, freq): self.ts.microsecond class TimestampOps(object): goal_time = 0.2 - def setup(self): - self.ts = Timestamp('2017-08-25 08:16:14') - self.ts_tz = Timestamp('2017-08-25 08:16:14', tz='US/Eastern') + params = [None, 'US/Eastern'] + param_names = ['tz'] - dt = datetime.datetime(2016, 3, 27, 1) - self.tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo - self.ts2 = Timestamp(dt) + def setup(self, tz): + self.ts = Timestamp('2017-08-25 08:16:14', tz=tz) - def time_replace_tz(self): + def time_replace_tz(self, tz): self.ts.replace(tzinfo=pytz.timezone('US/Eastern')) - def time_replace_across_dst(self): - self.ts2.replace(tzinfo=self.tzinfo) - - def time_replace_None(self): - self.ts_tz.replace(tzinfo=None) + def time_replace_None(self, tz): + self.ts.replace(tzinfo=None) - def time_to_pydatetime(self): + def time_to_pydatetime(self, tz): self.ts.to_pydatetime() - def time_to_pydatetime_tz(self): - self.ts_tz.to_pydatetime() + +class TimestampAcrossDst(object): + goal_time = 0.2 + + def setup(self): + dt = datetime.datetime(2016, 3, 27, 1) + self.tzinfo = pytz.timezone('CET').localize(dt, is_dst=False).tzinfo + self.ts2 = Timestamp(dt) + + def time_replace_across_dst(self): + self.ts2.replace(tzinfo=self.tzinfo) diff --git a/asv_bench/vbench_to_asv.py b/asv_bench/vbench_to_asv.py deleted file mode 100644 index 2a4ce5d183ea2b..00000000000000 --- a/asv_bench/vbench_to_asv.py +++ /dev/null @@ -1,163 +0,0 @@ -import ast -import vbench -import os -import sys -import astor -import glob - - -def vbench_to_asv_source(bench, kinds=None): - tab = ' ' * 4 - if kinds is None: - kinds = ['time'] - - output = 'class {}(object):\n'.format(bench.name) - output += tab + 'goal_time = 0.2\n\n' - - if bench.setup: - indented_setup = [tab * 2 + '{}\n'.format(x) for x in bench.setup.splitlines()] - output += tab + 'def setup(self):\n' + ''.join(indented_setup) + '\n' - - for kind in kinds: - output += tab + 'def {}_{}(self):\n'.format(kind, bench.name) - for line in bench.code.splitlines(): - output += tab * 2 + line + '\n' - output += '\n\n' - - if bench.cleanup: - output += tab + 'def teardown(self):\n' + tab * 2 + bench.cleanup - - output += '\n\n' - return output - - -class AssignToSelf(ast.NodeTransformer): - def __init__(self): - super(AssignToSelf, self).__init__() - self.transforms = {} - self.imports = [] - - self.in_class_define = False - self.in_setup = False - - def visit_ClassDef(self, node): - self.transforms = {} - self.in_class_define = True - - functions_to_promote = [] - setup_func = None - - for class_func in ast.iter_child_nodes(node): - 
if isinstance(class_func, ast.FunctionDef): - if class_func.name == 'setup': - setup_func = class_func - for anon_func in ast.iter_child_nodes(class_func): - if isinstance(anon_func, ast.FunctionDef): - functions_to_promote.append(anon_func) - - if setup_func: - for func in functions_to_promote: - setup_func.body.remove(func) - func.args.args.insert(0, ast.Name(id='self', ctx=ast.Load())) - node.body.append(func) - self.transforms[func.name] = 'self.' + func.name - - ast.fix_missing_locations(node) - - self.generic_visit(node) - - return node - - def visit_TryExcept(self, node): - if any([isinstance(x, (ast.Import, ast.ImportFrom)) for x in node.body]): - self.imports.append(node) - else: - self.generic_visit(node) - return node - - def visit_Assign(self, node): - for target in node.targets: - if isinstance(target, ast.Name) and not isinstance(target.ctx, ast.Param) and not self.in_class_define: - self.transforms[target.id] = 'self.' + target.id - self.generic_visit(node) - - return node - - def visit_Name(self, node): - new_node = node - if node.id in self.transforms: - if not isinstance(node.ctx, ast.Param): - new_node = ast.Attribute(value=ast.Name(id='self', ctx=node.ctx), attr=node.id, ctx=node.ctx) - - self.generic_visit(node) - - return ast.copy_location(new_node, node) - - def visit_Import(self, node): - self.imports.append(node) - - def visit_ImportFrom(self, node): - self.imports.append(node) - - def visit_FunctionDef(self, node): - """Delete functions that are empty due to imports being moved""" - self.in_class_define = False - - self.generic_visit(node) - - if node.body: - return node - - -def translate_module(target_module): - g_vars = {} - l_vars = {} - exec('import ' + target_module) in g_vars - - print(target_module) - module = eval(target_module, g_vars) - - benchmarks = [] - for obj_str in dir(module): - obj = getattr(module, obj_str) - if isinstance(obj, vbench.benchmark.Benchmark): - benchmarks.append(obj) - - if not benchmarks: - return - - rewritten_output = '' - for bench in benchmarks: - rewritten_output += vbench_to_asv_source(bench) - - with open('rewrite.py', 'w') as f: - f.write(rewritten_output) - - ast_module = ast.parse(rewritten_output) - - transformer = AssignToSelf() - transformed_module = transformer.visit(ast_module) - - unique_imports = {astor.to_source(node): node for node in transformer.imports} - - transformed_module.body = unique_imports.values() + transformed_module.body - - transformed_source = astor.to_source(transformed_module) - - with open('benchmarks/{}.py'.format(target_module), 'w') as f: - f.write(transformed_source) - - -if __name__ == '__main__': - cwd = os.getcwd() - new_dir = os.path.join(os.path.dirname(__file__), '../vb_suite') - sys.path.insert(0, new_dir) - - for module in glob.glob(os.path.join(new_dir, '*.py')): - mod = os.path.basename(module) - if mod in ['make.py', 'measure_memory_consumption.py', 'perf_HEAD.py', 'run_suite.py', 'test_perf.py', 'generate_rst_files.py', 'test.py', 'suite.py']: - continue - print('') - print(mod) - - translate_module(mod.replace('.py', '')) diff --git a/ci/appveyor-27.yaml b/ci/appveyor-27.yaml new file mode 100644 index 00000000000000..bcd9ddee1715ed --- /dev/null +++ b/ci/appveyor-27.yaml @@ -0,0 +1,31 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - bottleneck + - dateutil + - gcsfs + - html5lib + - jinja2=2.8 + - lxml + - matplotlib + - numexpr + - numpy=1.12* + - openpyxl=2.5.5 + - pytables + - python=2.7.* + - pytz + - s3fs + - scipy + - 
sqlalchemy + - xlrd + - xlsxwriter + - xlwt + # universal + - cython>=0.28.2 + - pytest + - pytest-xdist + - moto + - hypothesis>=3.58.0 diff --git a/ci/appveyor-36.yaml b/ci/appveyor-36.yaml new file mode 100644 index 00000000000000..6230e9b6a1885f --- /dev/null +++ b/ci/appveyor-36.yaml @@ -0,0 +1,28 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - blosc + - bottleneck + - fastparquet + - feather-format + - matplotlib + - numexpr + - numpy=1.14* + - openpyxl=2.5.5 + - pyarrow + - pytables + - python-dateutil + - python=3.6.* + - pytz + - scipy + - thrift=0.10* + - xlrd + - xlsxwriter + - xlwt + # universal + - cython>=0.28.2 + - pytest + - pytest-xdist + - hypothesis>=3.58.0 diff --git a/ci/before_install_travis.sh b/ci/before_script_travis.sh similarity index 93% rename from ci/before_install_travis.sh rename to ci/before_script_travis.sh index 2d0b4da6120dc3..0b3939b1906a24 100755 --- a/ci/before_install_travis.sh +++ b/ci/before_script_travis.sh @@ -4,6 +4,7 @@ echo "inside $0" if [ "${TRAVIS_OS_NAME}" == "linux" ]; then sh -e /etc/init.d/xvfb start + sleep 3 fi # Never fail because bad things happened here. diff --git a/ci/build_docs.sh b/ci/build_docs.sh index a038304fe0f7ae..f445447e3565c4 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -8,15 +8,6 @@ fi cd "$TRAVIS_BUILD_DIR" echo "inside $0" -git show --pretty="format:" --name-only HEAD~5.. --first-parent | grep -P "rst|txt|doc" - -if [ "$?" != "0" ]; then - echo "Skipping doc build, none were modified" - # nope, skip docs build - exit 0 -fi - - if [ "$DOC" ]; then echo "Will build docs" @@ -24,6 +15,7 @@ if [ "$DOC" ]; then source activate pandas mv "$TRAVIS_BUILD_DIR"/doc /tmp + mv "$TRAVIS_BUILD_DIR/LICENSE" /tmp # included in the docs. cd /tmp/doc echo ############################### @@ -59,15 +51,6 @@ if [ "$DOC" ]; then git remote -v git push origin gh-pages -f - - echo "Running doctests" - cd "$TRAVIS_BUILD_DIR" - pytest --doctest-modules \ - pandas/core/reshape/concat.py \ - pandas/core/reshape/pivot.py \ - pandas/core/reshape/reshape.py \ - pandas/core/reshape/tile.py - fi exit 0 diff --git a/ci/check_imports.py b/ci/check_imports.py index a83436e7d258c2..19e48b659617f7 100644 --- a/ci/check_imports.py +++ b/ci/check_imports.py @@ -5,11 +5,13 @@ blacklist = { 'bs4', + 'gcsfs', 'html5lib', 'ipython', - 'jinja2' + 'jinja2', + 'hypothesis', 'lxml', - 'matplotlib', 'numexpr', 'openpyxl', 'py', diff --git a/ci/circle-27-compat.yaml b/ci/circle-27-compat.yaml new file mode 100644 index 00000000000000..84ec7e20fc8f1f --- /dev/null +++ b/ci/circle-27-compat.yaml @@ -0,0 +1,29 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - bottleneck=1.0.0 + - cython=0.28.2 + - jinja2=2.8 + - numexpr=2.4.4 # we test that we correctly don't use an unsupported numexpr + - numpy=1.9.3 + - openpyxl=2.5.5 + - psycopg2 + - pytables=3.2.2 + - python-dateutil=2.5.0 + - python=2.7* + - pytz=2013b + - scipy=0.14.0 + - sqlalchemy=0.7.8 + - xlrd=0.9.2 + - xlsxwriter=0.5.2 + - xlwt=0.7.5 + # universal + - pytest + - pytest-xdist + - pip: + - html5lib==1.0b2 + - beautifulsoup4==4.2.1 + - pymysql==0.6.0 + - hypothesis>=3.58.0 diff --git a/ci/circle-35-ascii.yaml b/ci/circle-35-ascii.yaml new file mode 100644 index 00000000000000..281ed59e2deff0 --- /dev/null +++ b/ci/circle-35-ascii.yaml @@ -0,0 +1,15 @@ +name: pandas +channels: + - defaults +dependencies: + - cython>=0.28.2 + - nomkl + - numpy + - python-dateutil + - python=3.5* + - pytz + # universal + - pytest + - pytest-xdist + - pip: + - hypothesis>=3.58.0 
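For orientation, each of these new ci/*.yaml files is an ENV_FILE consumed by the install scripts further down in this patch; the essential commands (mirrored from ci/install_circle.sh below, not a new interface) are:

conda env create -q -n pandas --file="${ENV_FILE}"   # e.g. ENV_FILE=ci/circle-35-ascii.yaml
source activate pandas
python setup.py build_ext --inplace                  # build pandas in place, don't install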
diff --git a/ci/circle-36-locale.yaml b/ci/circle-36-locale.yaml new file mode 100644 index 00000000000000..ef97b85406709e --- /dev/null +++ b/ci/circle-36-locale.yaml @@ -0,0 +1,35 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - cython>=0.28.2 + - html5lib + - ipython + - jinja2 + - lxml + - matplotlib + - nomkl + - numexpr + - numpy + - openpyxl=2.5.5 + - psycopg2 + - pymysql + - pytables + - python-dateutil + - python=3.6* + - pytz + - s3fs + - scipy + - sqlalchemy + - xarray + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest + - pytest-xdist + - moto + - pip: + - hypothesis>=3.58.0 diff --git a/ci/circle-36-locale_slow.yaml b/ci/circle-36-locale_slow.yaml new file mode 100644 index 00000000000000..14b23dd6f3e4c0 --- /dev/null +++ b/ci/circle-36-locale_slow.yaml @@ -0,0 +1,36 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - cython>=0.28.2 + - gcsfs + - html5lib + - ipython + - jinja2 + - lxml + - matplotlib + - nomkl + - numexpr + - numpy + - openpyxl=2.5.5 + - psycopg2 + - pymysql + - pytables + - python-dateutil + - python=3.6* + - pytz + - s3fs + - scipy + - sqlalchemy + - xarray + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest + - pytest-xdist + - moto + - pip: + - hypothesis>=3.58.0 diff --git a/ci/doctests.sh b/ci/doctests.sh new file mode 100755 index 00000000000000..a941515fde4ae9 --- /dev/null +++ b/ci/doctests.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +echo "inside $0" + + +source activate pandas +cd "$TRAVIS_BUILD_DIR" + +RET=0 + +if [ "$DOCTEST" ]; then + + echo "Running doctests" + + # running all doctests is not yet working + # pytest --doctest-modules --ignore=pandas/tests -v pandas + + # if [ $? -ne "0" ]; then + # RET=1 + # fi + + # DataFrame / Series docstrings + pytest --doctest-modules -v pandas/core/frame.py \ + -k"-assign -axes -combine -isin -itertuples -join -nlargest -nsmallest -nunique -pivot_table -quantile -query -reindex -reindex_axis -replace -round -set_index -stack -to_dict -to_stata" + + if [ $? -ne "0" ]; then + RET=1 + fi + + pytest --doctest-modules -v pandas/core/series.py \ + -k"-nonzero -reindex -searchsorted -to_dict" + + if [ $? -ne "0" ]; then + RET=1 + fi + + pytest --doctest-modules -v pandas/core/generic.py \ + -k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -resample -sample -to_json -to_xarray -transpose -values -xs" + + if [ $? -ne "0" ]; then + RET=1 + fi + + # top-level reshaping functions + pytest --doctest-modules -v \ + pandas/core/reshape/concat.py \ + pandas/core/reshape/pivot.py \ + pandas/core/reshape/reshape.py \ + pandas/core/reshape/tile.py \ + -k"-crosstab -pivot_table -cut" + + if [ $? 
-ne "0" ]; then + RET=1 + fi + +else + echo "NOT running doctests" +fi + +exit $RET diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml index c3d3d59f895c66..f3323face4144b 100644 --- a/ci/environment-dev.yaml +++ b/ci/environment-dev.yaml @@ -3,12 +3,16 @@ channels: - defaults - conda-forge dependencies: - - Cython + - Cython>=0.28.2 - NumPy + - flake8 + - flake8-comprehensions + - hypothesis>=3.58.0 - moto - - pytest - - python-dateutil + - pytest>=3.6 + - python-dateutil>=2.5.0 - python=3 - pytz - - setuptools + - setuptools>=24.2.0 - sphinx + - sphinxcontrib-spelling diff --git a/ci/install_circle.sh b/ci/install_circle.sh index fd79f907625e9d..f8bcf6bcffc995 100755 --- a/ci/install_circle.sh +++ b/ci/install_circle.sh @@ -6,14 +6,7 @@ echo "[home_dir: $home_dir]" echo "[ls -ltr]" ls -ltr -echo "[Using clean Miniconda install]" -rm -rf "$MINICONDA_DIR" - -# install miniconda -wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1 -bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 - -export PATH="$MINICONDA_DIR/bin:$PATH" +apt-get update -y && apt-get install -y build-essential postgresql-client-9.6 echo "[update conda]" conda config --set ssl_verify false || exit 1 @@ -46,15 +39,19 @@ echo "[environmental variable file]" cat $ENVS_FILE source $ENVS_FILE -export REQ_BUILD=ci/requirements-${JOB}.build -export REQ_RUN=ci/requirements-${JOB}.run -export REQ_PIP=ci/requirements-${JOB}.pip - # edit the locale override if needed if [ -n "$LOCALE_OVERRIDE" ]; then + + apt-get update && apt-get -y install locales locales-all + + export LANG=$LOCALE_OVERRIDE + export LC_ALL=$LOCALE_OVERRIDE + + python -c "import locale; locale.setlocale(locale.LC_ALL, \"$LOCALE_OVERRIDE\")" || exit 1; + echo "[Adding locale to the first line of pandas/__init__.py]" rm -f pandas/__init__.pyc - sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, '$LOCALE_OVERRIDE')\n" + sedc="3iimport locale\nlocale.setlocale(locale.LC_ALL, \"$LOCALE_OVERRIDE\")\n" sed -i "$sedc" pandas/__init__.py echo "[head -4 pandas/__init__.py]" head -4 pandas/__init__.py @@ -62,25 +59,23 @@ if [ -n "$LOCALE_OVERRIDE" ]; then fi # create envbuild deps -echo "[create env: ${REQ_BUILD}]" -time conda create -n pandas -q --file=${REQ_BUILD} || exit 1 -time conda install -n pandas pytest>=3.1.0 || exit 1 +echo "[create env]" +time conda env create -q -n pandas --file="${ENV_FILE}" || exit 1 source activate pandas -time pip install moto || exit 1 + +# remove any installed pandas package +# w/o removing anything else +echo +echo "[removing installed pandas]" +conda remove pandas -y --force +pip uninstall -y pandas # build but don't install echo "[build em]" time python setup.py build_ext --inplace || exit 1 -# we may have run installations -echo "[conda installs: ${REQ_RUN}]" -if [ -e ${REQ_RUN} ]; then - time conda install -q --file=${REQ_RUN} || exit 1 -fi +echo +echo "[show environment]" -# we may have additional pip installs -echo "[pip installs: ${REQ_PIP}]" -if [ -e ${REQ_PIP} ]; then - pip install -r $REQ_PIP -fi +conda list diff --git a/ci/install_db_circle.sh b/ci/install_db_circle.sh deleted file mode 100755 index a00f74f009f548..00000000000000 --- a/ci/install_db_circle.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -echo "installing dbs" -mysql -e 'create database pandas_nosetest;' -psql -c 'create database pandas_nosetest;' -U postgres - -echo "done" -exit 0 diff --git a/ci/install_travis.sh b/ci/install_travis.sh index b85263daa1eaca..fd4a36f86db6cc 100755 --- 
a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -34,9 +34,9 @@ fi # install miniconda if [ "${TRAVIS_OS_NAME}" == "osx" ]; then - time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh || exit 1 + time wget http://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -q -O miniconda.sh || exit 1 else - time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh || exit 1 + time wget http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -q -O miniconda.sh || exit 1 fi time bash miniconda.sh -b -p "$MINICONDA_DIR" || exit 1 @@ -50,16 +50,6 @@ conda config --set ssl_verify false || exit 1 conda config --set quiet true --set always_yes true --set changeps1 false || exit 1 conda update -q conda -echo -echo "[add channels]" -conda config --remove channels defaults || exit 1 -conda config --add channels defaults || exit 1 - -if [ "$CONDA_FORGE" ]; then - # add conda-forge channel as priority - conda config --add channels conda-forge || exit 1 -fi - # Useful for debugging any issues with conda conda info -a || exit 1 @@ -90,64 +80,10 @@ echo echo "[create env]" # create our environment -REQ="ci/requirements-${JOB}.build" -time conda create -n pandas --file=${REQ} || exit 1 +time conda env create -q -n pandas --file="${ENV_FILE}" || exit 1 source activate pandas -# may have addtl installation instructions for this build -echo -echo "[build addtl installs]" -REQ="ci/requirements-${JOB}.build.sh" -if [ -e ${REQ} ]; then - time bash $REQ || exit 1 -fi - -time conda install -n pandas pytest>=3.1.0 -time pip install pytest-xdist moto - -if [ "$LINT" ]; then - conda install flake8 - pip install cpplint -fi - -if [ "$COVERAGE" ]; then - pip install coverage pytest-cov -fi - -echo -if [ -z "$BUILD_TEST" ]; then - - # build but don't install - echo "[build em]" - time python setup.py build_ext --inplace || exit 1 - -fi - -# we may have run installations -echo -echo "[conda installs]" -REQ="ci/requirements-${JOB}.run" -if [ -e ${REQ} ]; then - time conda install -n pandas --file=${REQ} || exit 1 -fi - -# we may have additional pip installs -echo -echo "[pip installs]" -REQ="ci/requirements-${JOB}.pip" -if [ -e ${REQ} ]; then - pip install -r $REQ -fi - -# may have addtl installation instructions for this build -echo -echo "[addtl installs]" -REQ="ci/requirements-${JOB}.sh" -if [ -e ${REQ} ]; then - time bash $REQ || exit 1 -fi - # remove any installed pandas package # w/o removing anything else echo @@ -155,31 +91,18 @@ echo "[removing installed pandas]" conda remove pandas -y --force pip uninstall -y pandas -if [ "$BUILD_TEST" ]; then - - # remove any installation - pip uninstall -y pandas - conda list pandas - pip list --format columns |grep pandas - - # build & install testing - echo ["building release"] - bash scripts/build_dist_for_release.sh - conda uninstall -y cython - time pip install dist/*tar.gz || exit 1 - -else - - # install our pandas - echo - echo "[running setup.py develop]" - python setup.py develop || exit 1 +echo +echo "[no installed pandas]" +conda list pandas +pip list --format columns |grep pandas -fi +# build and install +echo "[running setup.py develop]" +python setup.py develop || exit 1 echo -echo "[show pandas]" -conda list pandas +echo "[show environment]" +conda list echo echo "[done]" diff --git a/ci/lint.sh b/ci/lint.sh index 43d6ea0c118b02..533e1d18d8e0ed 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -8,23 +8,59 @@ RET=0 if [ "$LINT" ]; then + # We're ignoring the following 
codes across the board + #E402, # module level import not at top of file + #E731, # do not assign a lambda expression, use a def + #E741, # do not use variables named 'l', 'O', or 'I' + #W503, # line break before binary operator + #C406, # Unnecessary (list/tuple) literal - rewrite as a dict literal. + #C408, # Unnecessary (dict/list/tuple) call - rewrite as a literal. + #C409, # Unnecessary (list/tuple) passed to tuple() - (remove the outer call to tuple()/rewrite as a tuple literal). + #C410 # Unnecessary (list/tuple) passed to list() - (remove the outer call to list()/rewrite as a list literal). + # pandas/_libs/src is C code, so no need to search there. - echo "Linting *.py" - flake8 pandas --filename=*.py --exclude pandas/_libs/src + echo "Linting *.py" + flake8 pandas --filename=*.py --exclude pandas/_libs/src --ignore=C406,C408,C409,E402,E731,E741,W503 + if [ $? -ne "0" ]; then + RET=1 + fi + + flake8 scripts/tests --filename=*.py if [ $? -ne "0" ]; then RET=1 fi echo "Linting *.py DONE" echo "Linting setup.py" - flake8 setup.py + flake8 setup.py --ignore=E402,E731,E741,W503 if [ $? -ne "0" ]; then RET=1 fi echo "Linting setup.py DONE" + echo "Linting asv_bench/benchmarks/" + flake8 asv_bench/benchmarks/ --exclude=asv_bench/benchmarks/*.py --ignore=F811,C406,C408,C409,C410 + if [ $? -ne "0" ]; then + RET=1 + fi + echo "Linting asv_bench/benchmarks/*.py DONE" + + echo "Linting scripts/*.py" + flake8 scripts --filename=*.py --ignore=C408,E402,E731,E741,W503 + if [ $? -ne "0" ]; then + RET=1 + fi + echo "Linting scripts/*.py DONE" + + echo "Linting doc scripts" + flake8 doc/make.py doc/source/conf.py --ignore=E402,E731,E741,W503 + if [ $? -ne "0" ]; then + RET=1 + fi + echo "Linting doc scripts DONE" + echo "Linting *.pyx" - flake8 pandas --filename=*.pyx --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126,E265,E305,E301,E127,E261,E271,E129,W291,E222,E241,E123,F403 + flake8 pandas --filename=*.pyx --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126,E265,E305,E301,E127,E261,E271,E129,W291,E222,E241,E123,F403,C400,C401,C402,C403,C404,C405,C406,C407,C408,C409,C410,C411 if [ $? -ne "0" ]; then RET=1 fi @@ -38,10 +74,20 @@ if [ "$LINT" ]; then if [ $? -ne "0" ]; then RET=1 fi - done echo "Linting *.pxi.in DONE" + echo "Linting *.pxd" + for path in '_libs' + do + echo "linting -> pandas/$path" + flake8 pandas/$path --filename=*.pxd --select=E501,E302,E203,E111,E114,E221,E303,E231,E126,F403 + if [ $? -ne "0" ]; then + RET=1 + fi + done + echo "Linting *.pxd DONE" + # readability/casting: Warnings about C casting instead of C++ casting # runtime/int: Warnings about using C number types instead of C++ ones # build/include_subdir: Warnings about prefacing included header files with directory @@ -51,7 +97,7 @@ if [ "$LINT" ]; then # this particular codebase (e.g. src/headers, src/klib, src/msgpack). However, # we can lint all header files since they aren't "generated" like C files are. echo "Linting *.c and *.h" - for path in '*.h' 'period_helper.c' 'datetime' 'parser' 'ujson' + for path in '*.h' 'parser' 'ujson' do echo "linting -> pandas/_libs/src/$path" cpplint --quiet --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/_libs/src/$path @@ -59,15 +105,90 @@ if [ "$LINT" ]; then RET=1 fi done + echo "linting -> pandas/_libs/tslibs/src/datetime" + cpplint --quiet --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/_libs/tslibs/src/datetime + if [ $? 
-ne "0" ]; then + RET=1 + fi echo "Linting *.c and *.h DONE" echo "Check for invalid testing" - grep -r -E --include '*.py' --exclude testing.py '(numpy|np)\.testing' pandas + + # Check for the following code in testing: + # + # np.testing + # np.array_equal + grep -r -E --include '*.py' --exclude testing.py '(numpy|np)(\.testing|\.array_equal)' pandas/tests/ + if [ $? = "0" ]; then RET=1 fi + + # Check for pytest.warns + grep -r -E --include '*.py' 'pytest\.warns' pandas/tests/ + + if [ $? = "0" ]; then + RET=1 + fi + + # Check for the following code in the extension array base tests + # tm.assert_frame_equal + # tm.assert_series_equal + grep -r -E --include '*.py' --exclude base.py 'tm.assert_(series|frame)_equal' pandas/tests/extension/base + + if [ $? = "0" ]; then + RET=1 + fi + echo "Check for invalid testing DONE" + # Check for imports from pandas.core.common instead + # of `import pandas.core.common as com` + echo "Check for non-standard imports" + grep -R --include="*.py*" -E "from pandas.core.common import " pandas + if [ $? = "0" ]; then + RET=1 + fi + echo "Check for non-standard imports DONE" + + echo "Check for incorrect sphinx directives" + SPHINX_DIRECTIVES=$(echo \ + "autosummary|contents|currentmodule|deprecated|function|image|"\ + "important|include|ipython|literalinclude|math|module|note|raw|"\ + "seealso|toctree|versionadded|versionchanged|warning" | tr -d "[:space:]") + for path in './pandas' './doc/source' + do + grep -R --include="*.py" --include="*.pyx" --include="*.rst" -E "\.\. ($SPHINX_DIRECTIVES):[^:]" $path + if [ $? = "0" ]; then + RET=1 + fi + done + echo "Check for incorrect sphinx directives DONE" + + echo "Check for deprecated messages without sphinx directive" + grep -R --include="*.py" --include="*.pyx" -E "(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.)" pandas + + if [ $? = "0" ]; then + RET=1 + fi + echo "Check for deprecated messages without sphinx directive DONE" + + echo "Check for old-style classes" + grep -R --include="*.py" -E "class\s\S*[^)]:" pandas scripts + + if [ $? = "0" ]; then + RET=1 + fi + echo "Check for old-style classes DONE" + + echo "Check for backticks incorrectly rendering because of missing spaces" + grep -R --include="*.rst" -E "[a-zA-Z0-9]\`\`?[a-zA-Z0-9]" doc/source/ + + if [ $? 
= "0" ]; then + RET=1 + fi + echo "Check for backticks incorrectly rendering because of missing spaces DONE" + else echo "NOT Linting" fi diff --git a/ci/print_skipped.py b/ci/print_skipped.py index dd2180f6eeb193..67bc7b556cd439 100755 --- a/ci/print_skipped.py +++ b/ci/print_skipped.py @@ -10,7 +10,7 @@ def parse_results(filename): root = tree.getroot() skipped = [] - current_class = old_class = '' + current_class = '' i = 1 assert i - 1 == len(skipped) for el in root.findall('testcase'): @@ -24,7 +24,9 @@ def parse_results(filename): out = '' if old_class != current_class: ndigits = int(math.log(i, 10) + 1) - out += ('-' * (len(name + msg) + 4 + ndigits) + '\n') # 4 for : + space + # + space + + # 4 for : + space + # + space + out += ('-' * (len(name + msg) + 4 + ndigits) + '\n') out += '#{i} {name}: {msg}'.format(i=i, name=name, msg=msg) skipped.append(out) i += 1 diff --git a/ci/requirements-2.7.build b/ci/requirements-2.7.build deleted file mode 100644 index 415df13179fcf9..00000000000000 --- a/ci/requirements-2.7.build +++ /dev/null @@ -1,6 +0,0 @@ -python=2.7* -python-dateutil=2.4.1 -pytz=2013b -nomkl -numpy -cython=0.23 diff --git a/ci/requirements-2.7.pip b/ci/requirements-2.7.pip deleted file mode 100644 index 876d9e978fa849..00000000000000 --- a/ci/requirements-2.7.pip +++ /dev/null @@ -1,10 +0,0 @@ -blosc -pandas-gbq -html5lib -beautifulsoup4 -pathlib -backports.lzma -py -PyCrypto -mock -ipython diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run deleted file mode 100644 index a68e1d256058df..00000000000000 --- a/ci/requirements-2.7.run +++ /dev/null @@ -1,20 +0,0 @@ -python-dateutil=2.4.1 -pytz=2013b -numpy -xlwt=0.7.5 -numexpr -pytables -matplotlib -openpyxl=1.6.2 -xlrd=0.9.2 -sqlalchemy=0.9.6 -lxml -scipy -xlsxwriter=0.5.2 -s3fs -bottleneck -psycopg2 -patsy -pymysql=0.6.3 -jinja2=2.8 -xarray=0.8.0 diff --git a/ci/requirements-2.7.sh b/ci/requirements-2.7.sh deleted file mode 100644 index e3bd5e46026c54..00000000000000 --- a/ci/requirements-2.7.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -source activate pandas - -echo "install 27" - -conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 fastparquet diff --git a/ci/requirements-2.7_BUILD_TEST.build b/ci/requirements-2.7_BUILD_TEST.build deleted file mode 100644 index aadec00cb7ebf2..00000000000000 --- a/ci/requirements-2.7_BUILD_TEST.build +++ /dev/null @@ -1,6 +0,0 @@ -python=2.7* -dateutil -pytz -nomkl -numpy -cython diff --git a/ci/requirements-2.7_BUILD_TEST.pip b/ci/requirements-2.7_BUILD_TEST.pip deleted file mode 100644 index a0fc77c40bc00d..00000000000000 --- a/ci/requirements-2.7_BUILD_TEST.pip +++ /dev/null @@ -1,7 +0,0 @@ -xarray -geopandas -seaborn -pandas_gbq -pandas_datareader -statsmodels -scikit-learn diff --git a/ci/requirements-2.7_BUILD_TEST.sh b/ci/requirements-2.7_BUILD_TEST.sh deleted file mode 100755 index 78941fd0944e57..00000000000000 --- a/ci/requirements-2.7_BUILD_TEST.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -source activate pandas - -echo "install 27 BUILD_TEST" - -conda install -n pandas -c conda-forge pyarrow dask diff --git a/ci/requirements-2.7_COMPAT.build b/ci/requirements-2.7_COMPAT.build deleted file mode 100644 index d9c932daa110ba..00000000000000 --- a/ci/requirements-2.7_COMPAT.build +++ /dev/null @@ -1,5 +0,0 @@ -python=2.7* -numpy=1.9.2 -cython=0.23 -dateutil=1.5 -pytz=2013b diff --git a/ci/requirements-2.7_COMPAT.pip b/ci/requirements-2.7_COMPAT.pip deleted file mode 100644 index 13cd35a923124f..00000000000000 --- a/ci/requirements-2.7_COMPAT.pip 
+++ /dev/null @@ -1,4 +0,0 @@ -html5lib==1.0b2 -beautifulsoup4==4.2.0 -openpyxl -argparse diff --git a/ci/requirements-2.7_COMPAT.run b/ci/requirements-2.7_COMPAT.run deleted file mode 100644 index 39bf7201407333..00000000000000 --- a/ci/requirements-2.7_COMPAT.run +++ /dev/null @@ -1,14 +0,0 @@ -numpy=1.9.2 -dateutil=1.5 -pytz=2013b -scipy=0.14.0 -xlwt=0.7.5 -xlrd=0.9.2 -bottleneck=1.0.0 -numexpr=2.4.4 # we test that we correctly don't use an unsupported numexpr -pytables=3.2.2 -psycopg2 -pymysql=0.6.0 -sqlalchemy=0.7.8 -xlsxwriter=0.5.2 -jinja2=2.8 diff --git a/ci/requirements-2.7_LOCALE.build b/ci/requirements-2.7_LOCALE.build deleted file mode 100644 index 96cb184ec2665e..00000000000000 --- a/ci/requirements-2.7_LOCALE.build +++ /dev/null @@ -1,5 +0,0 @@ -python=2.7* -python-dateutil -pytz=2013b -numpy=1.9.2 -cython=0.23 diff --git a/ci/requirements-2.7_LOCALE.pip b/ci/requirements-2.7_LOCALE.pip deleted file mode 100644 index 1b825bbf492ca9..00000000000000 --- a/ci/requirements-2.7_LOCALE.pip +++ /dev/null @@ -1,3 +0,0 @@ -html5lib==1.0b2 -beautifulsoup4==4.2.1 -blosc diff --git a/ci/requirements-2.7_LOCALE.run b/ci/requirements-2.7_LOCALE.run deleted file mode 100644 index 978bbf6a051c51..00000000000000 --- a/ci/requirements-2.7_LOCALE.run +++ /dev/null @@ -1,12 +0,0 @@ -python-dateutil -pytz=2013b -numpy=1.9.2 -xlwt=0.7.5 -openpyxl=1.6.2 -xlsxwriter=0.5.2 -xlrd=0.9.2 -bottleneck=1.0.0 -matplotlib=1.4.3 -sqlalchemy=0.8.1 -lxml -scipy diff --git a/ci/requirements-2.7_SLOW.build b/ci/requirements-2.7_SLOW.build deleted file mode 100644 index a665ab9edd5850..00000000000000 --- a/ci/requirements-2.7_SLOW.build +++ /dev/null @@ -1,5 +0,0 @@ -python=2.7* -python-dateutil -pytz -numpy=1.10* -cython diff --git a/ci/requirements-2.7_SLOW.run b/ci/requirements-2.7_SLOW.run deleted file mode 100644 index db95a6ccb23140..00000000000000 --- a/ci/requirements-2.7_SLOW.run +++ /dev/null @@ -1,19 +0,0 @@ -python-dateutil -pytz -numpy=1.10* -matplotlib=1.4.3 -scipy -patsy -xlwt -openpyxl -xlsxwriter -xlrd -numexpr -pytables -sqlalchemy -lxml -s3fs -psycopg2 -pymysql -html5lib -beautifulsoup4 diff --git a/ci/requirements-2.7_WIN.run b/ci/requirements-2.7_WIN.run deleted file mode 100644 index c4ca7fc736bb17..00000000000000 --- a/ci/requirements-2.7_WIN.run +++ /dev/null @@ -1,18 +0,0 @@ -dateutil -pytz -numpy=1.10* -xlwt -numexpr -pytables==3.2.2 -matplotlib -openpyxl -xlrd -sqlalchemy -lxml -scipy -xlsxwriter -s3fs -bottleneck -html5lib -beautifulsoup4 -jinja2=2.8 diff --git a/ci/requirements-3.5.build b/ci/requirements-3.5.build deleted file mode 100644 index 76227e106e1fd4..00000000000000 --- a/ci/requirements-3.5.build +++ /dev/null @@ -1,6 +0,0 @@ -python=3.5* -python-dateutil -pytz -nomkl -numpy=1.11.3 -cython diff --git a/ci/requirements-3.5.pip b/ci/requirements-3.5.pip deleted file mode 100644 index 6e4f7b65f97286..00000000000000 --- a/ci/requirements-3.5.pip +++ /dev/null @@ -1,2 +0,0 @@ -xarray==0.9.1 -pandas-gbq diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run deleted file mode 100644 index 52828b52209971..00000000000000 --- a/ci/requirements-3.5.run +++ /dev/null @@ -1,20 +0,0 @@ -pytz -numpy=1.11.3 -openpyxl -xlsxwriter -xlrd -xlwt -scipy -numexpr -pytables -html5lib -lxml -matplotlib -jinja2 -bottleneck -sqlalchemy -pymysql -psycopg2 -s3fs -beautifulsoup4 -ipython diff --git a/ci/requirements-3.5.sh b/ci/requirements-3.5.sh deleted file mode 100644 index d694ad3679ac12..00000000000000 --- a/ci/requirements-3.5.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -source 
activate pandas - -echo "install 35" - -# pip install python-dateutil to get latest -conda remove -n pandas python-dateutil --force -pip install python-dateutil - -conda install -n pandas -c conda-forge feather-format pyarrow=0.5.0 diff --git a/ci/requirements-3.5_ASCII.build b/ci/requirements-3.5_ASCII.build deleted file mode 100644 index f7befe3b318650..00000000000000 --- a/ci/requirements-3.5_ASCII.build +++ /dev/null @@ -1,6 +0,0 @@ -python=3.5* -python-dateutil -pytz -nomkl -numpy -cython diff --git a/ci/requirements-3.5_ASCII.run b/ci/requirements-3.5_ASCII.run deleted file mode 100644 index b9d543f557d06e..00000000000000 --- a/ci/requirements-3.5_ASCII.run +++ /dev/null @@ -1,3 +0,0 @@ -python-dateutil -pytz -numpy diff --git a/ci/requirements-3.5_OSX.build b/ci/requirements-3.5_OSX.build deleted file mode 100644 index f5bc01b67a20ab..00000000000000 --- a/ci/requirements-3.5_OSX.build +++ /dev/null @@ -1,4 +0,0 @@ -python=3.5* -nomkl -numpy=1.10.4 -cython diff --git a/ci/requirements-3.5_OSX.pip b/ci/requirements-3.5_OSX.pip deleted file mode 100644 index d1fc1fe24a079f..00000000000000 --- a/ci/requirements-3.5_OSX.pip +++ /dev/null @@ -1 +0,0 @@ -python-dateutil==2.5.3 diff --git a/ci/requirements-3.5_OSX.run b/ci/requirements-3.5_OSX.run deleted file mode 100644 index 1d83474d10f2fc..00000000000000 --- a/ci/requirements-3.5_OSX.run +++ /dev/null @@ -1,16 +0,0 @@ -pytz -numpy=1.10.4 -openpyxl -xlsxwriter -xlrd -xlwt -numexpr -pytables -html5lib -lxml -matplotlib -jinja2 -bottleneck -xarray -s3fs -beautifulsoup4 diff --git a/ci/requirements-3.5_OSX.sh b/ci/requirements-3.5_OSX.sh deleted file mode 100644 index c2978b175968c5..00000000000000 --- a/ci/requirements-3.5_OSX.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -source activate pandas - -echo "install 35_OSX" - -conda install -n pandas -c conda-forge feather-format==0.3.1 fastparquet diff --git a/ci/requirements-3.6.build b/ci/requirements-3.6.build deleted file mode 100644 index 1c4b46aea3865d..00000000000000 --- a/ci/requirements-3.6.build +++ /dev/null @@ -1,6 +0,0 @@ -python=3.6* -python-dateutil -pytz -nomkl -numpy -cython diff --git a/ci/requirements-3.6.pip b/ci/requirements-3.6.pip deleted file mode 100644 index 753a60d6c119ac..00000000000000 --- a/ci/requirements-3.6.pip +++ /dev/null @@ -1 +0,0 @@ -brotlipy diff --git a/ci/requirements-3.6.run b/ci/requirements-3.6.run deleted file mode 100644 index 822144a80bc9a0..00000000000000 --- a/ci/requirements-3.6.run +++ /dev/null @@ -1,25 +0,0 @@ -python-dateutil -pytz -numpy -scipy -openpyxl -xlsxwriter -xlrd -xlwt -numexpr -pytables -matplotlib -lxml -html5lib -jinja2 -sqlalchemy -pymysql -feather-format -pyarrow -psycopg2 -python-snappy -fastparquet -beautifulsoup4 -s3fs -xarray -ipython diff --git a/ci/requirements-3.6_DOC.build b/ci/requirements-3.6_DOC.build deleted file mode 100644 index bdcfe28105866e..00000000000000 --- a/ci/requirements-3.6_DOC.build +++ /dev/null @@ -1,5 +0,0 @@ -python=3.6* -python-dateutil -pytz -numpy -cython diff --git a/ci/requirements-3.6_DOC.run b/ci/requirements-3.6_DOC.run deleted file mode 100644 index 6c45e3371e9cfa..00000000000000 --- a/ci/requirements-3.6_DOC.run +++ /dev/null @@ -1,25 +0,0 @@ -ipython -ipykernel -ipywidgets -sphinx=1.5* -nbconvert -nbformat -notebook -matplotlib -seaborn -scipy -lxml -beautifulsoup4 -html5lib -pytables -python-snappy -openpyxl -xlrd -xlwt -xlsxwriter -sqlalchemy -numexpr -bottleneck -statsmodels -xarray -pyqt diff --git a/ci/requirements-3.6_DOC.sh b/ci/requirements-3.6_DOC.sh deleted file mode 
100644 index aec0f62148622d..00000000000000 --- a/ci/requirements-3.6_DOC.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -source activate pandas - -echo "[install DOC_BUILD deps]" - -pip install pandas-gbq - -conda install -n pandas -c conda-forge feather-format pyarrow nbsphinx pandoc fastparquet - -conda install -n pandas -c r r rpy2 --yes diff --git a/ci/requirements-3.6_LOCALE.build b/ci/requirements-3.6_LOCALE.build deleted file mode 100644 index 1c4b46aea3865d..00000000000000 --- a/ci/requirements-3.6_LOCALE.build +++ /dev/null @@ -1,6 +0,0 @@ -python=3.6* -python-dateutil -pytz -nomkl -numpy -cython diff --git a/ci/requirements-3.6_LOCALE.run b/ci/requirements-3.6_LOCALE.run deleted file mode 100644 index ad54284c6f7e3d..00000000000000 --- a/ci/requirements-3.6_LOCALE.run +++ /dev/null @@ -1,22 +0,0 @@ -python-dateutil -pytz -numpy -scipy -openpyxl -xlsxwriter -xlrd -xlwt -numexpr -pytables -matplotlib -lxml -html5lib -jinja2 -sqlalchemy -pymysql -# feather-format (not available on defaults ATM) -psycopg2 -beautifulsoup4 -s3fs -xarray -ipython diff --git a/ci/requirements-3.6_LOCALE_SLOW.build b/ci/requirements-3.6_LOCALE_SLOW.build deleted file mode 100644 index 1c4b46aea3865d..00000000000000 --- a/ci/requirements-3.6_LOCALE_SLOW.build +++ /dev/null @@ -1,6 +0,0 @@ -python=3.6* -python-dateutil -pytz -nomkl -numpy -cython diff --git a/ci/requirements-3.6_LOCALE_SLOW.run b/ci/requirements-3.6_LOCALE_SLOW.run deleted file mode 100644 index ad54284c6f7e3d..00000000000000 --- a/ci/requirements-3.6_LOCALE_SLOW.run +++ /dev/null @@ -1,22 +0,0 @@ -python-dateutil -pytz -numpy -scipy -openpyxl -xlsxwriter -xlrd -xlwt -numexpr -pytables -matplotlib -lxml -html5lib -jinja2 -sqlalchemy -pymysql -# feather-format (not available on defaults ATM) -psycopg2 -beautifulsoup4 -s3fs -xarray -ipython diff --git a/ci/requirements-3.6_NUMPY_DEV.build b/ci/requirements-3.6_NUMPY_DEV.build deleted file mode 100644 index 336fbe86b57d88..00000000000000 --- a/ci/requirements-3.6_NUMPY_DEV.build +++ /dev/null @@ -1,2 +0,0 @@ -python=3.6* -pytz diff --git a/ci/requirements-3.6_NUMPY_DEV.build.sh b/ci/requirements-3.6_NUMPY_DEV.build.sh deleted file mode 100644 index fd79142c5cebbe..00000000000000 --- a/ci/requirements-3.6_NUMPY_DEV.build.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -source activate pandas - -echo "install numpy master wheel" - -# remove the system installed numpy -pip uninstall numpy -y - -# install numpy wheel from master -PRE_WHEELS="https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com" -pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy - -# install dateutil from master -pip install -U git+git://github.com/dateutil/dateutil.git - -# cython via pip -pip install cython - -true diff --git a/ci/requirements-3.6_NUMPY_DEV.run b/ci/requirements-3.6_NUMPY_DEV.run deleted file mode 100644 index af44f198c687e2..00000000000000 --- a/ci/requirements-3.6_NUMPY_DEV.run +++ /dev/null @@ -1 +0,0 @@ -pytz diff --git a/ci/requirements-3.6_WIN.run b/ci/requirements-3.6_WIN.run deleted file mode 100644 index 5d6c074ec1f856..00000000000000 --- a/ci/requirements-3.6_WIN.run +++ /dev/null @@ -1,16 +0,0 @@ -python-dateutil -pytz -numpy=1.12* -bottleneck -openpyxl -xlsxwriter -xlrd -xlwt -# scipy -feather-format -numexpr -pytables -matplotlib -blosc -fastparquet -pyarrow diff --git a/ci/requirements-optional-conda.txt b/ci/requirements-optional-conda.txt index 6edb8d17337e40..376fdb1e14e3ac 100644 --- a/ci/requirements-optional-conda.txt +++ 
b/ci/requirements-optional-conda.txt @@ -1,17 +1,18 @@ -beautifulsoup4 +beautifulsoup4>=4.2.1 blosc bottleneck fastparquet feather-format +gcsfs html5lib -ipython +ipython>=5.6.0 ipykernel jinja2 lxml matplotlib nbsphinx numexpr -openpyxl +openpyxl=2.5.5 pyarrow pymysql pytables @@ -21,6 +22,7 @@ s3fs scipy seaborn sqlalchemy +statsmodels xarray xlrd xlsxwriter diff --git a/ci/requirements-optional-pip.txt b/ci/requirements-optional-pip.txt index 06b22bd8f2c63f..2e1bf0ca22bcf0 100644 --- a/ci/requirements-optional-pip.txt +++ b/ci/requirements-optional-pip.txt @@ -1,17 +1,20 @@ # This file was autogenerated by scripts/convert_deps.py -# Do not modify directlybeautifulsoup4 +# Do not modify directly +beautifulsoup4>=4.2.1 blosc bottleneck fastparquet feather-format +gcsfs html5lib -ipython +ipython>=5.6.0 +ipykernel jinja2 lxml matplotlib nbsphinx numexpr -openpyxl +openpyxl==2.5.5 pyarrow pymysql tables @@ -21,7 +24,8 @@ s3fs scipy seaborn sqlalchemy +statsmodels xarray xlrd xlsxwriter -xlwt \ No newline at end of file +xlwt diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt index 2fb36b7cd70d8b..68fffe5d0df09c 100644 --- a/ci/requirements_dev.txt +++ b/ci/requirements_dev.txt @@ -1,10 +1,14 @@ # This file was autogenerated by scripts/convert_deps.py # Do not modify directly -Cython +Cython>=0.28.2 NumPy +flake8 +flake8-comprehensions +hypothesis>=3.58.0 moto -pytest -python-dateutil +pytest>=3.6 +python-dateutil>=2.5.0 pytz -setuptools -sphinx \ No newline at end of file +setuptools>=24.2.0 +sphinx +sphinxcontrib-spelling \ No newline at end of file diff --git a/ci/run_circle.sh b/ci/run_circle.sh index 0e46d28ab6fc4d..fc2a8b849a3548 100755 --- a/ci/run_circle.sh +++ b/ci/run_circle.sh @@ -5,5 +5,5 @@ export PATH="$MINICONDA_DIR/bin:$PATH" source activate pandas -echo "pytest --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas" -pytest --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas +echo "pytest --strict --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas" +pytest --strict --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml $@ pandas diff --git a/ci/script_multi.sh b/ci/script_multi.sh index ee9fbcaad5ef5f..2b2d4d5488b91f 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash -e echo "[script multi]" @@ -12,38 +12,32 @@ if [ -n "$LOCALE_OVERRIDE" ]; then python -c "$pycmd" fi +# Enforce absent network during testing by faking a proxy +if echo "$TEST_ARGS" | grep -e --skip-network -q; then + export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; +fi + # Workaround for pytest-xdist flaky collection order # https://github.com/pytest-dev/pytest/issues/920 # https://github.com/pytest-dev/pytest/issues/1075 export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') echo PYTHONHASHSEED=$PYTHONHASHSEED -if [ "$BUILD_TEST" ]; then - echo "[build-test]" - - echo "[env]" - pip list --format columns |grep pandas - - echo "[running]" - cd /tmp - unset PYTHONPATH - python -c 'import pandas; pandas.test(["-n 2", "--skip-slow", "--skip-network", "-r xX", "-m not single"])' - -elif [ "$DOC" ]; then +if [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then - echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas - pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml $TEST_ARGS
pandas + echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas elif [ "$SLOW" ]; then TEST_ARGS="--only-slow --skip-network" - echo pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml $TEST_ARGS pandas - pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + echo pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + pytest -r xX -m "not single and slow" -v --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas else - echo pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas - pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas # TODO: doctest + echo pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas + pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml --strict $TEST_ARGS pandas # TODO: doctest fi diff --git a/ci/script_single.sh b/ci/script_single.sh index 375e9879e950fd..ed12ee35b91513 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -16,19 +16,23 @@ if [ "$SLOW" ]; then TEST_ARGS="--only-slow --skip-network" fi -if [ "$BUILD_TEST" ]; then - echo "We are not running pytest as this is a build test." +# Enforce absent network during testing by faking a proxy +if echo "$TEST_ARGS" | grep -e --skip-network -q; then + export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; +fi -elif [ "$DOC" ]; then +if [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + echo pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + echo pytest -s -r xXs --strict scripts + pytest -s -r xXs --strict scripts else - echo pytest -m "single" -r xX --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -m "single" -r xX --junitxml=/tmp/single.xml $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas + pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest fi diff --git a/ci/travis-27-locale.yaml b/ci/travis-27-locale.yaml new file mode 100644 index 00000000000000..aca65f27d41876 --- /dev/null +++ b/ci/travis-27-locale.yaml @@ -0,0 +1,28 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - bottleneck=1.0.0 + - cython=0.28.2 + - lxml + - matplotlib=1.4.3 + - numpy=1.9.3 + - openpyxl=2.4.0 + - python-dateutil + - python-blosc + - python=2.7 + - pytz + - pytz=2013b + - scipy + - sqlalchemy=0.8.1 + - xlrd=0.9.2 + - xlsxwriter=0.5.2 + - xlwt=0.7.5 + # universal + - pytest + - pytest-xdist + - hypothesis>=3.58.0 + - pip: + - html5lib==1.0b2 + - beautifulsoup4==4.2.1 diff --git a/ci/travis-27.yaml b/ci/travis-27.yaml new file mode 100644 index 00000000000000..a921bcb46dba4f --- /dev/null +++ b/ci/travis-27.yaml @@ -0,0 +1,53 @@ +name: pandas +channels: + 
- defaults + - conda-forge +dependencies: + - beautifulsoup4 + - bottleneck + - cython=0.28.2 + - fastparquet + - feather-format + - flake8=3.4.1 + - flake8-comprehensions + - gcsfs + - html5lib + - ipython + - jemalloc=4.5.0.post + - jinja2=2.8 + - lxml + - matplotlib + - mock + - nomkl + - numexpr + - numpy=1.13* + - openpyxl=2.4.0 + - patsy + - psycopg2 + - py + - pyarrow=0.4.1 + - PyCrypto + - pymysql=0.6.3 + - pytables + - blosc=1.14.3 + - python-blosc + - python-dateutil=2.5.0 + - python=2.7* + - pytz=2013b + - s3fs + - scipy + - sqlalchemy=0.9.6 + - xarray=0.8.0 + - xlrd=0.9.2 + - xlsxwriter=0.5.2 + - xlwt=0.7.5 + # universal + - pytest + - pytest-xdist + - moto + - hypothesis>=3.58.0 + - pip: + - backports.lzma + - cpplint + - pandas-gbq + - pathlib diff --git a/ci/travis-35-osx.yaml b/ci/travis-35-osx.yaml new file mode 100644 index 00000000000000..a36f748ded8129 --- /dev/null +++ b/ci/travis-35-osx.yaml @@ -0,0 +1,28 @@ +name: pandas +channels: + - defaults +dependencies: + - beautifulsoup4 + - bottleneck + - cython>=0.28.2 + - html5lib + - jinja2 + - lxml + - matplotlib + - nomkl + - numexpr + - numpy=1.10.4 + - openpyxl=2.5.5 + - pytables + - python=3.5* + - pytz + - xarray + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest + - pytest-xdist + - pip: + - python-dateutil==2.5.3 + - hypothesis>=3.58.0 diff --git a/ci/travis-36-doc.yaml b/ci/travis-36-doc.yaml new file mode 100644 index 00000000000000..50626088d5bc49 --- /dev/null +++ b/ci/travis-36-doc.yaml @@ -0,0 +1,47 @@ +name: pandas +channels: + - defaults + - conda-forge + - r +dependencies: + - beautifulsoup4 + - bottleneck + - cython>=0.28.2 + - fastparquet + - feather-format + - html5lib + - hypothesis>=3.58.0 + - ipykernel + - ipython + - ipywidgets + - lxml + - matplotlib + - nbconvert + - nbformat + - nbsphinx + - notebook + - numexpr + - numpy=1.13* + - openpyxl=2.5.5 + - pandoc + - pyqt + - pytables + - python-dateutil + - python-snappy + - python=3.6* + - pytz + - r + - rpy2 + - scipy + - seaborn + - sphinx + - sqlalchemy + - statsmodels + - tzlocal + - xarray + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest + - pytest-xdist diff --git a/ci/travis-36-slow.yaml b/ci/travis-36-slow.yaml new file mode 100644 index 00000000000000..1a7bc53e1b74be --- /dev/null +++ b/ci/travis-36-slow.yaml @@ -0,0 +1,31 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - cython>=0.28.2 + - html5lib + - lxml + - matplotlib + - numexpr + - numpy + - openpyxl=2.5.5 + - patsy + - psycopg2 + - pymysql + - pytables + - python-dateutil + - python=3.6* + - pytz + - s3fs + - scipy + - sqlalchemy + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest + - pytest-xdist + - moto + - hypothesis>=3.58.0 diff --git a/ci/travis-36.yaml b/ci/travis-36.yaml new file mode 100644 index 00000000000000..3c9daa5f8b73c8 --- /dev/null +++ b/ci/travis-36.yaml @@ -0,0 +1,49 @@ +name: pandas +channels: + - defaults + - conda-forge +dependencies: + - beautifulsoup4 + - cython>=0.28.2 + - dask + - fastparquet + - feather-format + - gcsfs + - geopandas + - html5lib + - ipython + - jinja2 + - lxml + - matplotlib + - nomkl + - numexpr + - numpy + - openpyxl=2.5.5 + - psycopg2 + - pyarrow + - pymysql + - pytables + - python-snappy + - python=3.6* + - pytz + - s3fs + - scikit-learn + - scipy + - seaborn + - sqlalchemy + - statsmodels + - xarray + - xlrd + - xlsxwriter + - xlwt + # universal + - pytest + - pytest-xdist + - pytest-cov + - moto + - hypothesis>=3.58.0 + - pip: + - brotlipy + - coverage + - 
pandas-datareader + - python-dateutil diff --git a/ci/travis-37-numpydev.yaml b/ci/travis-37-numpydev.yaml new file mode 100644 index 00000000000000..82c75b7c91b1f4 --- /dev/null +++ b/ci/travis-37-numpydev.yaml @@ -0,0 +1,17 @@ +name: pandas +channels: + - defaults +dependencies: + - python=3.7* + - pytz + - Cython>=0.28.2 + # universal + - pytest + - pytest-xdist + - hypothesis>=3.58.0 + - pip: + - "git+git://github.com/dateutil/dateutil.git" + - "-f https://7933911d6844c6c53a7d-47bd50c35cd79bd838daf386af554a83.ssl.cf2.rackcdn.com" + - "--pre" + - "numpy" + - "scipy" diff --git a/ci/travis-37.yaml b/ci/travis-37.yaml new file mode 100644 index 00000000000000..4f2138d8555e34 --- /dev/null +++ b/ci/travis-37.yaml @@ -0,0 +1,15 @@ +name: pandas +channels: + - defaults + - conda-forge + - c3i_test +dependencies: + - python=3.7 + - cython>=0.28.2 + - numpy + - python-dateutil + - nomkl + - pytz + - pytest + - pytest-xdist + - hypothesis>=3.58.0 diff --git a/circle.yml b/circle.yml deleted file mode 100644 index 9d49145af54e33..00000000000000 --- a/circle.yml +++ /dev/null @@ -1,38 +0,0 @@ -machine: - environment: - # these are globally set - MINICONDA_DIR: /home/ubuntu/miniconda3 - - -database: - override: - - ./ci/install_db_circle.sh - - -checkout: - post: - # since circleci does a shallow fetch - # we need to populate our tags - - git fetch --depth=1000 - - -dependencies: - override: - - > - case $CIRCLE_NODE_INDEX in - 0) - sudo apt-get install language-pack-it && ./ci/install_circle.sh JOB="2.7_COMPAT" LOCALE_OVERRIDE="it_IT.UTF-8" ;; - 1) - sudo apt-get install language-pack-zh-hans && ./ci/install_circle.sh JOB="3.6_LOCALE" LOCALE_OVERRIDE="zh_CN.UTF-8" ;; - 2) - sudo apt-get install language-pack-zh-hans && ./ci/install_circle.sh JOB="3.6_LOCALE_SLOW" LOCALE_OVERRIDE="zh_CN.UTF-8" ;; - 3) - ./ci/install_circle.sh JOB="3.5_ASCII" LOCALE_OVERRIDE="C" ;; - esac - - ./ci/show_circle.sh - - -test: - override: - - case $CIRCLE_NODE_INDEX in 0) ./ci/run_circle.sh --skip-slow --skip-network ;; 1) ./ci/run_circle.sh --only-slow --skip-network ;; 2) ./ci/run_circle.sh --skip-slow --skip-network ;; 3) ./ci/run_circle.sh --skip-slow --skip-network ;; esac: - parallel: true diff --git a/conda.recipe/meta.yaml b/conda.recipe/meta.yaml index 2aee11772896f8..f92090fecccf35 100644 --- a/conda.recipe/meta.yaml +++ b/conda.recipe/meta.yaml @@ -1,9 +1,9 @@ package: name: pandas - version: {{ GIT_DESCRIBE_TAG|replace("v","") }} + version: {{ environ.get('GIT_DESCRIBE_TAG','').replace('v', '', 1) }} build: - number: {{ GIT_DESCRIBE_NUMBER|int }} + number: {{ environ.get('GIT_DESCRIBE_NUMBER', 0) }} {% if GIT_DESCRIBE_NUMBER|int == 0 %}string: np{{ CONDA_NPY }}py{{ CONDA_PY }}_0 {% else %}string: np{{ CONDA_NPY }}py{{ CONDA_PY }}_{{ GIT_BUILD_STR }}{% endif %} @@ -12,22 +12,28 @@ source: requirements: build: + - {{ compiler('c') }} + - {{ compiler('cxx') }} + host: - python + - pip - cython - - numpy x.x - - setuptools + - numpy + - setuptools >=3.3 + - python-dateutil >=2.5.0 - pytz - - python-dateutil - run: - - python - - numpy x.x - - python-dateutil + - python {{ python }} + - {{ pin_compatible('numpy') }} + - python-dateutil >=2.5.0 - pytz test: - imports: - - pandas + requires: + - pytest + commands: + - python -c "import pandas; pandas.test()" + about: home: http://pandas.pydata.org diff --git a/doc/README.rst b/doc/README.rst index b2c66611b68bbb..12950d323f5d34 100644 --- a/doc/README.rst +++ b/doc/README.rst @@ -42,7 +42,7 @@ Some other important things to know about the docs: - The 
docstrings follow the **Numpy Docstring Standard** which is used widely in the Scientific Python community. This standard specifies the format of the different sections of the docstring. See `this document - `_ + `_ for a detailed explanation, or look at some of the existing functions to extend it in a similar manner. @@ -160,8 +160,8 @@ Where to start? There are a number of issues listed under `Docs `_ -and `Good as first PR -`_ +and `good first issue +`_ where you could start out. Or maybe you have an idea of your own, by using pandas, looking for something diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf index 0492805a1408b1..696ed288cf7a60 100644 Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf and b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx index 6cca9ac4647f7e..f8b98a6f1f8e4a 100644 Binary files a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx and b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_JP.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet_JP.pdf new file mode 100644 index 00000000000000..746d1b6c980fed Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet_JP.pdf differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet_JP.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet_JP.pptx new file mode 100644 index 00000000000000..f8b98a6f1f8e4a Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet_JP.pptx differ diff --git a/doc/make.py b/doc/make.py index acef563f301e42..d85747458148d5 100755 --- a/doc/make.py +++ b/doc/make.py @@ -1,129 +1,44 @@ #!/usr/bin/env python - """ Python script for building documentation. To build the docs you must have all optional dependencies for pandas installed. See the installation instructions for a list of these. -Note: currently latex builds do not work because of table formats that are not -supported in the latex generation. - -2014-01-30: Latex has some issues but 'latex_forced' works ok for 0.13.0-400 or so - Usage ----- -python make.py clean -python make.py html + $ python make.py clean + $ python make.py html + $ python make.py latex """ -from __future__ import print_function - -import io -import glob # noqa +import importlib +import sys import os import shutil -import sys -from contextlib import contextmanager - -import sphinx # noqa +# import subprocess import argparse -import jinja2 # noqa - -os.environ['PYTHONPATH'] = '..' - -SPHINX_BUILD = 'sphinxbuild' +from contextlib import contextmanager +import webbrowser +import jinja2 -def _process_user(user): - if user is None or user is False: - user = '' - else: - user = user + '@' - return user - - -def upload_dev(user=None): - 'push a copy to the pydata dev directory' - user = _process_user(user) - if os.system('cd build/html; rsync -avz . {0}pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/dev/ -essh'.format(user)): - raise SystemExit('Upload to Pydata Dev failed') - - -def upload_dev_pdf(user=None): - 'push a copy to the pydata dev directory' - user = _process_user(user) - if os.system('cd build/latex; scp pandas.pdf {0}pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/dev/'.format(user)): - raise SystemExit('PDF upload to Pydata Dev failed') - - -def upload_stable(user=None): - 'push a copy to the pydata stable directory' - user = _process_user(user) - if os.system('cd build/html; rsync -avz . 
{0}pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/stable/ -essh'.format(user)): - raise SystemExit('Upload to stable failed') - - -def upload_stable_pdf(user=None): - 'push a copy to the pydata dev directory' - user = _process_user(user) - if os.system('cd build/latex; scp pandas.pdf {0}pandas.pydata.org' - ':/usr/share/nginx/pandas/pandas-docs/stable/'.format(user)): - raise SystemExit('PDF upload to stable failed') - - -def upload_prev(ver, doc_root='./', user=None): - 'push a copy of older release to appropriate version directory' - user = _process_user(user) - local_dir = doc_root + 'build/html' - remote_dir = '/usr/share/nginx/pandas/pandas-docs/version/%s/' % ver - cmd = 'cd %s; rsync -avz . %spandas.pydata.org:%s -essh' - cmd = cmd % (local_dir, user, remote_dir) - print(cmd) - if os.system(cmd): - raise SystemExit( - 'Upload to %s from %s failed' % (remote_dir, local_dir)) - - local_dir = doc_root + 'build/latex' - pdf_cmd = 'cd %s; scp pandas.pdf %spandas.pydata.org:%s' - pdf_cmd = pdf_cmd % (local_dir, user, remote_dir) - if os.system(pdf_cmd): - raise SystemExit('Upload PDF to %s from %s failed' % (ver, doc_root)) - -def build_pandas(): - os.chdir('..') - os.system('python setup.py clean') - os.system('python setup.py build_ext --inplace') - os.chdir('doc') - -def build_prev(ver): - if os.system('git checkout v%s' % ver) != 1: - os.chdir('..') - os.system('python setup.py clean') - os.system('python setup.py build_ext --inplace') - os.chdir('doc') - os.system('python make.py clean') - os.system('python make.py html') - os.system('python make.py latex') - os.system('git checkout master') - - -def clean(): - if os.path.exists('build'): - shutil.rmtree('build') - - if os.path.exists('source/generated'): - shutil.rmtree('source/generated') +DOC_PATH = os.path.dirname(os.path.abspath(__file__)) +SOURCE_PATH = os.path.join(DOC_PATH, 'source') +BUILD_PATH = os.path.join(DOC_PATH, 'build') +BUILD_DIRS = ['doctrees', 'html', 'latex', 'plots', '_static', '_templates'] @contextmanager -def maybe_exclude_notebooks(): - """ - Skip building the notebooks if pandoc is not installed. +def _maybe_exclude_notebooks(): + """Skip building the notebooks if pandoc is not installed. + This assumes that nbsphinx is installed. + + Skip notebook conversion if: + 1. nbconvert isn't installed, or + 2. nbconvert is installed, but pandoc isn't """ + # TODO move to exclude_pattern base = os.path.dirname(__file__) notebooks = [os.path.join(base, 'source', nb) for nb in ['style.ipynb']] @@ -135,304 +50,327 @@ def _remove_notebooks(): contents[nb] = f.read() os.remove(nb) - # Skip notebook conversion if - # 1. nbconvert isn't installed, or - # 2. nbconvert is installed, but pandoc isn't try: import nbconvert except ImportError: - print("Warning: nbconvert not installed. Skipping notebooks.") + sys.stderr.write('Warning: nbconvert not installed. ' + 'Skipping notebooks.\n') _remove_notebooks() else: try: nbconvert.utils.pandoc.get_pandoc_version() except nbconvert.utils.pandoc.PandocMissing: - print("Warning: Pandoc is not installed. Skipping notebooks.") + sys.stderr.write('Warning: Pandoc is not installed. 
' + 'Skipping notebooks.\n') _remove_notebooks() yield + for nb, content in contents.items(): with open(nb, 'wt') as f: f.write(content) -def html(): - check_build() - - with maybe_exclude_notebooks(): - if os.system('sphinx-build -P -b html -d build/doctrees ' - 'source build/html'): - raise SystemExit("Building HTML failed.") - try: - # remove stale file - os.remove('build/html/pandas.zip') - except: - pass - - -def zip_html(): - try: - print("\nZipping up HTML docs...") - # just in case the wonky build box doesn't have zip - # don't fail this. - os.system('cd build; rm -f html/pandas.zip; zip html/pandas.zip -r -q html/* ') - print("\n") - except: - pass - -def latex(): - check_build() - if sys.platform != 'win32': - # LaTeX format. - if os.system('sphinx-build -j 2 -b latex -d build/doctrees ' - 'source build/latex'): - raise SystemExit("Building LaTeX failed.") - # Produce pdf. - - os.chdir('build/latex') - - # Call the makefile produced by sphinx... - if os.system('make'): - print("Rendering LaTeX failed.") - print("You may still be able to get a usable PDF file by going into 'build/latex'") - print("and executing 'pdflatex pandas.tex' for the requisite number of passes.") - print("Or using the 'latex_forced' target") - raise SystemExit - - os.chdir('../..') - else: - print('latex build has not been tested on windows') - -def latex_forced(): - check_build() - if sys.platform != 'win32': - # LaTeX format. - if os.system('sphinx-build -j 2 -b latex -d build/doctrees ' - 'source build/latex'): - raise SystemExit("Building LaTeX failed.") - # Produce pdf. - - os.chdir('build/latex') - - # Manually call pdflatex, 3 passes should ensure latex fixes up - # all the required cross-references and such. - os.system('pdflatex -interaction=nonstopmode pandas.tex') - os.system('pdflatex -interaction=nonstopmode pandas.tex') - os.system('pdflatex -interaction=nonstopmode pandas.tex') - raise SystemExit("You should check the file 'build/latex/pandas.pdf' for problems.") - - os.chdir('../..') - else: - print('latex build has not been tested on windows') +class DocBuilder: + """Class to wrap the different commands of this script. + All public methods of this class can be called as parameters of the + script. 
+ """ + def __init__(self, num_jobs=1, include_api=True, single_doc=None, + verbosity=0): + self.num_jobs = num_jobs + self.include_api = include_api + self.verbosity = verbosity + self.single_doc = None + self.single_doc_type = None + if single_doc is not None: + self._process_single_doc(single_doc) + self.exclude_patterns = self._exclude_patterns + + self._generate_index() + if self.single_doc_type == 'docstring': + self._run_os('sphinx-autogen', '-o', + 'source/generated_single', 'source/index.rst') + + @property + def _exclude_patterns(self): + """Docs source files that will be excluded from building.""" + # TODO move maybe_exclude_notebooks here + if self.single_doc is not None: + rst_files = [f for f in os.listdir(SOURCE_PATH) + if ((f.endswith('.rst') or f.endswith('.ipynb')) + and (f != 'index.rst') + and (f != '{0}.rst'.format(self.single_doc)))] + if self.single_doc_type != 'api': + rst_files += ['generated/*.rst'] + elif not self.include_api: + rst_files = ['api.rst', 'generated/*.rst'] + else: + rst_files = ['generated_single/*.rst'] + + exclude_patterns = ','.join( + '{!r}'.format(i) for i in ['**.ipynb_checkpoints'] + rst_files) + + return exclude_patterns + + def _process_single_doc(self, single_doc): + """Extract self.single_doc (base name) and self.single_doc_type from + passed single_doc kwarg. + + """ + self.include_api = False + + if single_doc == 'api.rst' or single_doc == 'api': + self.single_doc_type = 'api' + self.single_doc = 'api' + elif os.path.exists(os.path.join(SOURCE_PATH, single_doc)): + self.single_doc_type = 'rst' + self.single_doc = os.path.splitext(os.path.basename(single_doc))[0] + elif os.path.exists( + os.path.join(SOURCE_PATH, '{}.rst'.format(single_doc))): + self.single_doc_type = 'rst' + self.single_doc = single_doc + elif single_doc is not None: + try: + obj = pandas # noqa: F821 + for name in single_doc.split('.'): + obj = getattr(obj, name) + except AttributeError: + raise ValueError('Single document not understood, it should ' + 'be a file in doc/source/*.rst (e.g. ' + '"contributing.rst" or a pandas function or ' + 'method (e.g. "pandas.DataFrame.head")') + else: + self.single_doc_type = 'docstring' + if single_doc.startswith('pandas.'): + self.single_doc = single_doc[len('pandas.'):] + else: + self.single_doc = single_doc + + def _copy_generated_docstring(self): + """Copy existing generated (from api.rst) docstring page because + this is more correct in certain cases (where a custom autodoc + template is used). 
+ + """ + fname = os.path.join(SOURCE_PATH, 'generated', + 'pandas.{}.rst'.format(self.single_doc)) + temp_dir = os.path.join(SOURCE_PATH, 'generated_single') -def check_build(): - build_dirs = [ - 'build', 'build/doctrees', 'build/html', - 'build/latex', 'build/plots', 'build/_static', - 'build/_templates'] - for d in build_dirs: try: - os.mkdir(d) + os.makedirs(temp_dir) except OSError: pass + if os.path.exists(fname): + try: + # copying to make sure sphinx always thinks it is new + # and needs to be re-generated (to pick source code changes) + shutil.copy(fname, temp_dir) + except: # noqa + pass + + def _generate_index(self): + """Create index.rst file with the specified sections.""" + if self.single_doc_type == 'docstring': + self._copy_generated_docstring() + + with open(os.path.join(SOURCE_PATH, 'index.rst.template')) as f: + t = jinja2.Template(f.read()) + + with open(os.path.join(SOURCE_PATH, 'index.rst'), 'w') as f: + f.write(t.render(include_api=self.include_api, + single_doc=self.single_doc, + single_doc_type=self.single_doc_type)) + + @staticmethod + def _create_build_structure(): + """Create directories required to build documentation.""" + for dirname in BUILD_DIRS: + try: + os.makedirs(os.path.join(BUILD_PATH, dirname)) + except OSError: + pass + + @staticmethod + def _run_os(*args): + """Execute a command as a OS terminal. + + Parameters + ---------- + *args : list of str + Command and parameters to be executed + + Examples + -------- + >>> DocBuilder()._run_os('python', '--version') + """ + # TODO check_call should be more safe, but it fails with + # exclude patterns, needs investigation + # subprocess.check_call(args, stderr=subprocess.STDOUT) + os.system(' '.join(args)) + + def _sphinx_build(self, kind): + """Call sphinx to build documentation. + + Attribute `num_jobs` from the class is used. 
+    def _sphinx_build(self, kind):
+        """Call sphinx to build documentation.
+
+        Attribute `num_jobs` from the class is used.
+
+        Parameters
+        ----------
+        kind : {'html', 'latex', 'spelling'}
+
+        Examples
+        --------
+        >>> DocBuilder(num_jobs=4)._sphinx_build('html')
+        """
+        if kind not in ('html', 'latex', 'spelling'):
+            raise ValueError('kind must be html, latex or '
+                             'spelling, not {}'.format(kind))
+
+        self._run_os('sphinx-build',
+                     '-j{}'.format(self.num_jobs),
+                     '-b{}'.format(kind),
+                     '-{}'.format(
+                         'v' * self.verbosity) if self.verbosity else '',
+                     '-d{}'.format(os.path.join(BUILD_PATH, 'doctrees')),
+                     '-Dexclude_patterns={}'.format(self.exclude_patterns),
+                     SOURCE_PATH,
+                     os.path.join(BUILD_PATH, kind))
+
+    def _open_browser(self):
+        base_url = os.path.join('file://', DOC_PATH, 'build', 'html')
+        if self.single_doc_type == 'docstring':
+            url = os.path.join(
+                base_url,
+                'generated_single', 'pandas.{}.html'.format(self.single_doc))
+        else:
+            url = os.path.join(base_url, '{}.html'.format(self.single_doc))
+        webbrowser.open(url, new=2)
+
+    def html(self):
+        """Build HTML documentation."""
+        self._create_build_structure()
+        with _maybe_exclude_notebooks():
+            self._sphinx_build('html')
+            zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip')
+            if os.path.exists(zip_fname):
+                os.remove(zip_fname)
+
+        if self.single_doc is not None:
+            self._open_browser()
+            shutil.rmtree(os.path.join(SOURCE_PATH, 'generated_single'),
+                          ignore_errors=True)
+
+    def latex(self, force=False):
+        """Build PDF documentation."""
+        self._create_build_structure()
+        if sys.platform == 'win32':
+            sys.stderr.write('latex build has not been tested on windows\n')
+        else:
+            self._sphinx_build('latex')
+            os.chdir(os.path.join(BUILD_PATH, 'latex'))
+            if force:
+                for i in range(3):
+                    self._run_os('pdflatex',
+                                 '-interaction=nonstopmode',
+                                 'pandas.tex')
+                raise SystemExit('You should check the file '
+                                 '"build/latex/pandas.pdf" for problems.')
+            else:
+                self._run_os('make')
+
+    def latex_forced(self):
+        """Build PDF documentation with retries to find missing references."""
+        self.latex(force=True)
+
+    @staticmethod
+    def clean():
+        """Clean documentation generated files."""
+        shutil.rmtree(BUILD_PATH, ignore_errors=True)
+        shutil.rmtree(os.path.join(SOURCE_PATH, 'generated'),
+                      ignore_errors=True)
+
+    def zip_html(self):
+        """Compress HTML documentation into a zip file."""
+        zip_fname = os.path.join(BUILD_PATH, 'html', 'pandas.zip')
+        if os.path.exists(zip_fname):
+            os.remove(zip_fname)
+        dirname = os.path.join(BUILD_PATH, 'html')
+        fnames = os.listdir(dirname)
+        os.chdir(dirname)
+        self._run_os('zip',
+                     zip_fname,
+                     '-r',
+                     '-q',
+                     *fnames)
+
+    def spellcheck(self):
+        """Spell check the documentation."""
+        self._sphinx_build('spelling')
+        output_location = os.path.join('build', 'spelling', 'output.txt')
+        with open(output_location) as output:
+            lines = output.readlines()
+            if lines:
+                raise SyntaxError(
+                    'Found misspelled words.'
+                    ' Check pandas/doc/build/spelling/output.txt'
+                    ' for more details.')
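Putting the pieces together, typical programmatic use of the class looks roughly like this (a sketch; the 'indexing' target is only an example):

    # Build a single page without the API reference, using four sphinx jobs:
    builder = DocBuilder(num_jobs=4, include_api=False, single_doc='indexing')
    builder.html()      # creates the build tree and runs sphinx-build -b html
    builder.zip_html()  # optionally bundle build/html into pandas.zip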
-def all():
-    # clean()
-    html()
-
-
-def auto_dev_build(debug=False):
-    msg = ''
-    try:
-        step = 'clean'
-        clean()
-        step = 'html'
-        html()
-        step = 'upload dev'
-        upload_dev()
-        if not debug:
-            sendmail(step)
-
-        step = 'latex'
-        latex()
-        step = 'upload pdf'
-        upload_dev_pdf()
-        if not debug:
-            sendmail(step)
-    except (Exception, SystemExit) as inst:
-        msg = str(inst) + '\n'
-        sendmail(step, '[ERROR] ' + msg)
-
-
-def sendmail(step=None, err_msg=None):
-    from_name, to_name = _get_config()
-
-    if step is None:
-        step = ''
-
-    if err_msg is None or '[ERROR]' not in err_msg:
-        msgstr = 'Daily docs %s completed successfully' % step
-        subject = "DOC: %s successful" % step
-    else:
-        msgstr = err_msg
-        subject = "DOC: %s failed" % step
-
-    import smtplib
-    from email.MIMEText import MIMEText
-    msg = MIMEText(msgstr)
-    msg['Subject'] = subject
-    msg['From'] = from_name
-    msg['To'] = to_name
-
-    server_str, port, login, pwd = _get_credentials()
-    server = smtplib.SMTP(server_str, port)
-    server.ehlo()
-    server.starttls()
-    server.ehlo()
-
-    server.login(login, pwd)
-    try:
-        server.sendmail(from_name, to_name, msg.as_string())
-    finally:
-        server.close()
-
-
-def _get_dir(subdir=None):
-    import getpass
-    USERNAME = getpass.getuser()
-    if sys.platform == 'darwin':
-        HOME = '/Users/%s' % USERNAME
-    else:
-        HOME = '/home/%s' % USERNAME
-
-    if subdir is None:
-        subdir = '/code/scripts/config'
-    conf_dir = '%s/%s' % (HOME, subdir)
-    return conf_dir
-
-
-def _get_credentials():
-    tmp_dir = _get_dir()
-    cred = '%s/credentials' % tmp_dir
-    with open(cred, 'r') as fh:
-        server, port, un, domain = fh.read().split(',')
-    port = int(port)
-    login = un + '@' + domain + '.com'
-
-    import base64
-    with open('%s/cron_email_pwd' % tmp_dir, 'r') as fh:
-        pwd = base64.b64decode(fh.read())
-
-    return server, port, login, pwd
-
-
-def _get_config():
-    tmp_dir = _get_dir()
-    with open('%s/addresses' % tmp_dir, 'r') as fh:
-        from_name, to_name = fh.read().split(',')
-    return from_name, to_name
-
-funcd = {
-    'html': html,
-    'zip_html': zip_html,
-    'upload_dev': upload_dev,
-    'upload_stable': upload_stable,
-    'upload_dev_pdf': upload_dev_pdf,
-    'upload_stable_pdf': upload_stable_pdf,
-    'latex': latex,
-    'latex_forced': latex_forced,
-    'clean': clean,
-    'auto_dev': auto_dev_build,
-    'auto_debug': lambda: auto_dev_build(True),
-    'build_pandas': build_pandas,
-    'all': all,
-}
-
-small_docs = False
-
-# current_dir = os.getcwd()
-# os.chdir(os.path.dirname(os.path.join(current_dir, __file__)))
-
-import argparse
-argparser = argparse.ArgumentParser(description="""
-pandas documentation builder
-""".strip())
-
-# argparser.add_argument('-arg_name', '--arg_name',
-#                        metavar='label for arg help',
-#                        type=str|etc,
-#                        nargs='N|*|?|+|argparse.REMAINDER',
-#                        required=False,
-#                        #choices='abc',
-#                        help='help string',
-#                        action='store|store_true')
-
-# args = argparser.parse_args()
-
-#print args.accumulate(args.integers)
-
-def generate_index(api=True, single=False, **kwds):
-    from jinja2 import Template
-    with open("source/index.rst.template") as f:
-        t = Template(f.read())
-
-    with open("source/index.rst","w") as f:
-        f.write(t.render(api=api,single=single,**kwds))
-
-import argparse
-argparser = argparse.ArgumentParser(description="pandas documentation builder",
-                                    epilog="Targets : %s" % funcd.keys())
-
-argparser.add_argument('--no-api',
-                       default=False,
-                       help='Ommit api and autosummary',
-                       action='store_true')
-argparser.add_argument('--single',
-                       metavar='FILENAME',
-                       type=str,
-                       default=False,
-                       help='filename of section to compile, e.g. "indexing"')
-argparser.add_argument('--user',
-                       type=str,
-                       default=False,
-                       help='Username to connect to the pydata server')
-
-
 def main():
-    args, unknown = argparser.parse_known_args()
-    sys.argv = [sys.argv[0]] + unknown
-    if args.single:
-        args.single = os.path.basename(args.single).split(".rst")[0]
-
-    if 'clean' in unknown:
-        args.single=False
-
-    generate_index(api=not args.no_api and not args.single, single=args.single)
-
-    if len(sys.argv) > 2:
-        ftype = sys.argv[1]
-        ver = sys.argv[2]
-
-        if ftype == 'build_previous':
-            build_prev(ver, user=args.user)
-        if ftype == 'upload_previous':
-            upload_prev(ver, user=args.user)
-    elif len(sys.argv) == 2:
-        for arg in sys.argv[1:]:
-            func = funcd.get(arg)
-            if func is None:
-                raise SystemExit('Do not know how to handle %s; valid args are %s' % (
-                    arg, list(funcd.keys())))
-            if args.user:
-                func(user=args.user)
-            else:
-                func()
-    else:
-        small_docs = False
-        all()
-# os.chdir(current_dir)
+    cmds = [method for method in dir(DocBuilder) if not method.startswith('_')]
+
+    argparser = argparse.ArgumentParser(
+        description='pandas documentation builder',
+        epilog='Commands: {}'.format(', '.join(cmds)))
+    argparser.add_argument('command',
+                           nargs='?',
+                           default='html',
+                           help='command to run: {}'.format(', '.join(cmds)))
+    argparser.add_argument('--num-jobs',
+                           type=int,
+                           default=1,
+                           help='number of jobs used by sphinx-build')
+    argparser.add_argument('--no-api',
+                           default=False,
+                           help='omit api and autosummary',
+                           action='store_true')
+    argparser.add_argument('--single',
+                           metavar='FILENAME',
+                           type=str,
+                           default=None,
+                           help=('filename of section or method name to '
+                                 'compile, e.g. "indexing", "DataFrame.join"'))
+    argparser.add_argument('--python-path',
+                           type=str,
+                           default=os.path.dirname(DOC_PATH),
+                           help='path where the pandas library is found')
+    argparser.add_argument('-v', action='count', dest='verbosity', default=0,
+                           help=('increase verbosity (can be repeated), '
+                                 'passed to the sphinx build command'))
+    args = argparser.parse_args()
+
+    if args.command not in cmds:
+        raise ValueError('Unknown command {}. Available options: {}'.format(
+            args.command, ', '.join(cmds)))
+
+    # Below we update both os.environ and sys.path. The former is used by
+    # external libraries (namely Sphinx) to compile this module and resolve
+    # the import of `python_path` correctly. The latter is used to resolve
+    # the import within the module, injecting it into the global namespace.
+    os.environ['PYTHONPATH'] = args.python_path
+    sys.path.append(args.python_path)
+    globals()['pandas'] = importlib.import_module('pandas')
+
+    # Set the matplotlib backend to the non-interactive Agg backend for all
+    # child processes.
+    os.environ['MPLBACKEND'] = 'module://matplotlib.backends.backend_agg'
+
+    builder = DocBuilder(args.num_jobs, not args.no_api, args.single,
+                         args.verbosity)
+    getattr(builder, args.command)()
+
 if __name__ == '__main__':
-    import sys
     sys.exit(main())
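The resulting command-line interface accepts invocations along these lines (illustrative only, derived from the argparse definitions above):

    # python make.py                    # 'html' is the default command
    # python make.py clean
    # python make.py html --single indexing --no-api
    # python make.py html --single pandas.DataFrame.head
    # python make.py latex_forced --num-jobs 4 -vv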
diff --git a/doc/plots/stats/moment_plots.py b/doc/plots/stats/moment_plots.py
deleted file mode 100644
index 9e3a902592c6bf..00000000000000
--- a/doc/plots/stats/moment_plots.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import numpy as np
-
-import matplotlib.pyplot as plt
-import pandas.util.testing as t
-import pandas.stats.moments as m
-
-
-def test_series(n=1000):
-    t.N = n
-    s = t.makeTimeSeries()
-    return s
-
-
-def plot_timeseries(*args, **kwds):
-    n = len(args)
-
-    fig, axes = plt.subplots(n, 1, figsize=kwds.get('size', (10, 5)),
-                             sharex=True)
-    titles = kwds.get('titles', None)
-
-    for k in range(1, n + 1):
-        ax = axes[k - 1]
-        ts = args[k - 1]
-        ax.plot(ts.index, ts.values)
-
-        if titles:
-            ax.set_title(titles[k - 1])
-
-    fig.autofmt_xdate()
-    fig.subplots_adjust(bottom=0.10, top=0.95)
diff --git a/doc/plots/stats/moments_ewma.py b/doc/plots/stats/moments_ewma.py
deleted file mode 100644
index 3e521ed60bb8f5..00000000000000
--- a/doc/plots/stats/moments_ewma.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import matplotlib.pyplot as plt
-import pandas.util.testing as t
-import pandas.stats.moments as m
-
-t.N = 200
-s = t.makeTimeSeries().cumsum()
-
-plt.figure(figsize=(10, 5))
-plt.plot(s.index, s.values)
-plt.plot(s.index, m.ewma(s, 20, min_periods=1).values)
-f = plt.gcf()
-f.autofmt_xdate()
-
-plt.show()
-plt.close('all')
diff --git a/doc/plots/stats/moments_ewmvol.py b/doc/plots/stats/moments_ewmvol.py
deleted file mode 100644
index 093f62868fc4e3..00000000000000
--- a/doc/plots/stats/moments_ewmvol.py
+++ /dev/null
@@ -1,23 +0,0 @@
-import matplotlib.pyplot as plt
-import pandas.util.testing as t
-import pandas.stats.moments as m
-
-t.N = 500
-ts = t.makeTimeSeries()
-ts[::100] = 20
-
-s = ts.cumsum()
-
-
-plt.figure(figsize=(10, 5))
-plt.plot(s.index, m.ewmvol(s, span=50, min_periods=1).values, color='b')
-plt.plot(s.index, m.rolling_std(s, 50, min_periods=1).values, color='r')
-
-plt.title('Exp-weighted std with shocks')
-plt.legend(('Exp-weighted', 'Equal-weighted'))
-
-f = plt.gcf()
-f.autofmt_xdate()
-
-plt.show()
-plt.close('all')
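The doc/plots/stats scripts being deleted here were written against the pandas.stats.moments module, which was deprecated and later removed. For reference, equivalent results come from the .rolling and .ewm accessors (a sketch assuming a modern pandas release; not part of this patch):

    import numpy as np
    import pandas as pd

    s = pd.Series(np.random.randn(500)).cumsum()
    ewma = s.ewm(span=50, min_periods=1).mean()   # was m.ewma(s, span=50, ...)
    estd = s.ewm(span=50, min_periods=1).std()    # was m.ewmstd(s, span=50, ...)
    rmean = s.rolling(50, min_periods=10).mean()  # was m.rolling_mean(s, 50, ...)
    rstd = s.rolling(50, min_periods=10).std()    # was m.rolling_std(s, 50, ...)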
diff --git a/doc/plots/stats/moments_expw.py b/doc/plots/stats/moments_expw.py
deleted file mode 100644
index 5fff419b3a9405..00000000000000
--- a/doc/plots/stats/moments_expw.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from moment_plots import *
-
-np.random.seed(1)
-
-ts = test_series(500) * 10
-
-# ts[::100] = 20
-
-s = ts.cumsum()
-
-fig, axes = plt.subplots(3, 1, figsize=(8, 10), sharex=True)
-
-ax0, ax1, ax2 = axes
-
-ax0.plot(s.index, s.values)
-ax0.set_title('time series')
-
-ax1.plot(s.index, m.ewma(s, span=50, min_periods=1).values, color='b')
-ax1.plot(s.index, m.rolling_mean(s, 50, min_periods=1).values, color='r')
-ax1.set_title('rolling_mean vs. ewma')
-
-line1 = ax2.plot(
-    s.index, m.ewmstd(s, span=50, min_periods=1).values, color='b')
-line2 = ax2.plot(
-    s.index, m.rolling_std(s, 50, min_periods=1).values, color='r')
-ax2.set_title('rolling_std vs. ewmstd')
-
-fig.legend((line1, line2),
-           ('Exp-weighted', 'Equal-weighted'),
-           loc='upper right')
-fig.autofmt_xdate()
-fig.subplots_adjust(bottom=0.10, top=0.95)
-
-plt.show()
-plt.close('all')
diff --git a/doc/plots/stats/moments_rolling.py b/doc/plots/stats/moments_rolling.py
deleted file mode 100644
index 30a6c5f53e20cf..00000000000000
--- a/doc/plots/stats/moments_rolling.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from moment_plots import *
-
-ts = test_series()
-s = ts.cumsum()
-
-s[20:50] = np.NaN
-s[120:150] = np.NaN
-plot_timeseries(s,
-                m.rolling_count(s, 50),
-                m.rolling_sum(s, 50, min_periods=10),
-                m.rolling_mean(s, 50, min_periods=10),
-                m.rolling_std(s, 50, min_periods=10),
-                m.rolling_skew(s, 50, min_periods=10),
-                m.rolling_kurt(s, 50, min_periods=10),
-                size=(10, 12),
-                titles=('time series',
-                        'rolling_count',
-                        'rolling_sum',
-                        'rolling_mean',
-                        'rolling_std',
-                        'rolling_skew',
-                        'rolling_kurt'))
-plt.show()
-plt.close('all')
diff --git a/doc/plots/stats/moments_rolling_binary.py b/doc/plots/stats/moments_rolling_binary.py
deleted file mode 100644
index ab6b7b1c8ff490..00000000000000
--- a/doc/plots/stats/moments_rolling_binary.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from moment_plots import *
-
-np.random.seed(1)
-
-ts = test_series()
-s = ts.cumsum()
-ts2 = test_series()
-s2 = ts2.cumsum()
-
-s[20:50] = np.NaN
-s[120:150] = np.NaN
-fig, axes = plt.subplots(3, 1, figsize=(8, 10), sharex=True)
-
-ax0, ax1, ax2 = axes
-
-ax0.plot(s.index, s.values)
-ax0.plot(s2.index, s2.values)
-ax0.set_title('time series')
-
-ax1.plot(s.index, m.rolling_corr(s, s2, 50, min_periods=1).values)
-ax1.set_title('rolling_corr')
-
-ax2.plot(s.index, m.rolling_cov(s, s2, 50, min_periods=1).values)
-ax2.set_title('rolling_cov')
-
-fig.autofmt_xdate()
-fig.subplots_adjust(bottom=0.10, top=0.95)
-
-plt.show()
-plt.close('all')
diff --git a/doc/source/10min.rst b/doc/source/10min.rst
index 49142311ff0576..fbbe94a72c71e5 100644
--- a/doc/source/10min.rst
+++ b/doc/source/10min.rst
@@ -25,7 +25,7 @@
 ********************
 
 This is a short introduction to pandas, geared mainly for new users.
-You can see more complex recipes in the :ref:`Cookbook`
+You can see more complex recipes in the :ref:`Cookbook`.
 
 Customarily, we import as follows:
 
@@ -38,7 +38,7 @@ Customarily, we import as follows:
 Object Creation
 ---------------
 
-See the :ref:`Data Structure Intro section `
+See the :ref:`Data Structure Intro section `.
 
 Creating a :class:`Series` by passing a list of values, letting pandas create
 a default integer index:
 
@@ -48,7 +48,7 @@ a default integer index:
 
    s = pd.Series([1,3,5,np.nan,6,8])
    s
 
-Creating a :class:`DataFrame` by passing a numpy array, with a datetime index
+Creating a :class:`DataFrame` by passing a NumPy array, with a datetime index
 and labeled columns:
 
 .. ipython:: python
 
@@ -70,7 +70,8 @@ Creating a ``DataFrame`` by passing a dict of objects that can be converted to s
                         'F' : 'foo' })
    df2
 
-Having specific :ref:`dtypes `
+The columns of the resulting ``DataFrame`` have different
+:ref:`dtypes `.
 
 .. ipython:: python
 
@@ -104,16 +105,16 @@ truncated for brevity.
 Viewing Data
 ------------
 
-See the :ref:`Basics section `
+See the :ref:`Basics section `.
 
-See the top & bottom rows of the frame
+Here is how to view the top and bottom rows of the frame:
 
 .. ipython:: python
 
    df.head()
   df.tail(3)
 
-Display the index, columns, and the underlying numpy data
+Display the index, columns, and the underlying NumPy data:
 
 .. ipython:: python
@@ -121,25 +122,25 @@ Display the index, columns, and the underlying numpy data
 
    df.columns
    df.values
 
-Describe shows a quick statistic summary of your data
+:func:`~DataFrame.describe` shows a quick statistic summary of your data:
 
 .. ipython:: python
 
    df.describe()
 
-Transposing your data
+Transposing your data:
 
 .. ipython:: python
 
    df.T
 
-Sorting by an axis
+Sorting by an axis:
 
 .. ipython:: python
 
    df.sort_index(axis=1, ascending=False)
 
-Sorting by values
+Sorting by values:
 
 .. ipython:: python
 
@@ -153,15 +154,15 @@ Selection
 
    While standard Python / Numpy expressions for selecting and setting are
   intuitive and come in handy for interactive work, for production code, we
   recommend the optimized pandas data access methods, ``.at``, ``.iat``,
-   ``.loc``, ``.iloc`` and ``.ix``.
+   ``.loc`` and ``.iloc``.
 
-See the indexing documentation :ref:`Indexing and Selecting Data ` and :ref:`MultiIndex / Advanced Indexing `
+See the indexing documentation :ref:`Indexing and Selecting Data ` and :ref:`MultiIndex / Advanced Indexing `.
 
 Getting
 ~~~~~~~
 
 Selecting a single column, which yields a ``Series``,
-equivalent to ``df.A``
+equivalent to ``df.A``:
 
 .. ipython:: python
 
@@ -177,39 +178,39 @@ Selecting via ``[]``, which slices the rows.
 
 Selection by Label
 ~~~~~~~~~~~~~~~~~~
 
-See more in :ref:`Selection by Label `
+See more in :ref:`Selection by Label `.
 
-For getting a cross section using a label
+For getting a cross section using a label:
 
 .. ipython:: python
 
    df.loc[dates[0]]
 
-Selecting on a multi-axis by label
+Selecting on a multi-axis by label:
 
 .. ipython:: python
 
    df.loc[:,['A','B']]
 
-Showing label slicing, both endpoints are *included*
+Showing label slicing, both endpoints are *included*:
 
 .. ipython:: python
 
    df.loc['20130102':'20130104',['A','B']]
 
-Reduction in the dimensions of the returned object
+Reduction in the dimensions of the returned object:
 
 .. ipython:: python
 
    df.loc['20130102',['A','B']]
 
-For getting a scalar value
+For getting a scalar value:
 
 .. ipython:: python
 
    df.loc[dates[0],'A']
 
-For getting fast access to a scalar (equiv to the prior method)
+For getting fast access to a scalar (equivalent to the prior method):
 
 .. ipython:: python
 
@@ -218,45 +219,45 @@ For getting fast access to a scalar (equiv to the prior method)
 Selection by Position
 ~~~~~~~~~~~~~~~~~~~~~
 
-See more in :ref:`Selection by Position `
+See more in :ref:`Selection by Position `.
 
-Select via the position of the passed integers
+Select via the position of the passed integers:
 
 .. ipython:: python
 
    df.iloc[3]
 
-By integer slices, acting similar to numpy/python
+By integer slices, acting similar to numpy/python:
 
 .. ipython:: python
 
    df.iloc[3:5,0:2]
 
-By lists of integer position locations, similar to the numpy/python style
+By lists of integer position locations, similar to the numpy/python style:
 
 .. ipython:: python
 
    df.iloc[[1,2,4],[0,2]]
 
-For slicing rows explicitly
+For slicing rows explicitly:
 
 .. ipython:: python
 
    df.iloc[1:3,:]
 
-For slicing columns explicitly
+For slicing columns explicitly:
 
 .. ipython:: python
 
    df.iloc[:,1:3]
 
-For getting a value explicitly
+For getting a value explicitly:
 
 .. ipython:: python
 
    df.iloc[1,1]
 
-For getting fast access to a scalar (equiv to the prior method)
+For getting fast access to a scalar (equivalent to the prior method):
 
 .. ipython:: python
 
@@ -290,7 +291,7 @@ Setting
 ~~~~~~~
 
 Setting a new column automatically aligns the data
-by the indexes
+by the indexes.
 
 .. ipython:: python
@@ -298,25 +299,25 @@ by the indexes
 
    s1
    df['F'] = s1
 
-Setting values by label
+Setting values by label:
 
 .. ipython:: python
 
    df.at[dates[0],'A'] = 0
 
-Setting values by position
+Setting values by position:
 
 .. ipython:: python
 
    df.iat[0,1] = 0
 
-Setting by assigning with a numpy array
+Setting by assigning with a NumPy array:
 
 .. ipython:: python
 
    df.loc[:,'D'] = np.array([5] * len(df))
 
-The result of the prior setting operations
+The result of the prior setting operations.
 
 .. ipython:: python
 
@@ -336,7 +337,7 @@ Missing Data
 
 pandas primarily uses the value ``np.nan`` to represent missing data. It is by
 default not included in computations. See the :ref:`Missing Data section
-`
+`.
 
 Reindexing allows you to change/add/delete the index on a specified
 axis. This returns a copy of the data.
 
@@ -353,13 +354,13 @@ To drop any rows that have missing data.
 
    df1.dropna(how='any')
 
-Filling missing data
+Filling missing data.
 
 .. ipython:: python
 
    df1.fillna(value=5)
 
-To get the boolean mask where values are ``nan``
+To get the boolean mask where values are ``nan``.
 
 .. ipython:: python
 
@@ -369,20 +370,20 @@ To get the boolean mask where values are ``nan``
 Operations
 ----------
 
-See the :ref:`Basic section on Binary Ops `
+See the :ref:`Basic section on Binary Ops `.
 
 Stats
 ~~~~~
 
 Operations in general *exclude* missing data.
 
-Performing a descriptive statistic
+Performing a descriptive statistic:
 
 .. ipython:: python
 
    df.mean()
 
-Same operation on the other axis
+Same operation on the other axis:
 
 .. ipython:: python
 
@@ -401,7 +402,7 @@ In addition, pandas automatically broadcasts along the specified dimension.
 Apply
 ~~~~~
 
-Applying functions to the data
+Applying functions to the data:
 
 .. ipython:: python
 
@@ -411,7 +412,7 @@ Applying functions to the data
 Histogramming
 ~~~~~~~~~~~~~
 
-See more at :ref:`Histogramming and Discretization `
+See more at :ref:`Histogramming and Discretization `.
 
 .. ipython:: python
 
@@ -425,7 +426,7 @@ String Methods
 
 Series is equipped with a set of string processing methods in the `str`
 attribute that make it easy to operate on each element of the array, as in the
 code snippet below. Note that pattern-matching in `str` generally uses `regular
-expressions `__ by default (and in
+expressions `__ by default (and in
 some cases always uses them). See more at :ref:`Vectorized String Methods
 `.
 
@@ -445,7 +446,7 @@ DataFrame, and Panel objects with various kinds of set logic for the indexes
 and relational algebra functionality in the case of join / merge-type
 operations.
 
-See the :ref:`Merging section `
+See the :ref:`Merging section `.
 
 Concatenating pandas objects together with :func:`concat`:
 
@@ -462,7 +463,7 @@ Concatenating pandas objects together with :func:`concat`:
 Join
 ~~~~
 
-SQL style merges. See the :ref:`Database style joining `
+SQL style merges. See the :ref:`Database style joining ` section.
 
 .. ipython:: python
 
@@ -486,7 +487,8 @@ Another example that can be given is:
 Append
 ~~~~~~
 
-Append rows to a dataframe. See the :ref:`Appending `
+Append rows to a dataframe. See the :ref:`Appending `
+section.
 
 .. ipython:: python
 
@@ -500,13 +502,13 @@ Grouping
 --------
 
 By "group by" we are referring to a process involving one or more of the
-following steps
+following steps:
 
  - **Splitting** the data into groups based on some criteria
  - **Applying** a function to each group independently
  - **Combining** the results into a data structure
 
-See the :ref:`Grouping section `
+See the :ref:`Grouping section `.
 
 .. ipython:: python
@@ -518,14 +520,15 @@ See the :ref:`Grouping section `
                           'D' : np.random.randn(8)})
    df
 
-Grouping and then applying a function ``sum`` to the resulting groups.
+Grouping and then applying the :meth:`~DataFrame.sum` function to the resulting
+groups.
 
 .. ipython:: python
 
    df.groupby('A').sum()
 
-Grouping by multiple columns forms a hierarchical index, which we then apply
-the function.
+Grouping by multiple columns forms a hierarchical index, and again we can
+apply the ``sum`` function.
 
 .. ipython:: python
 
@@ -595,7 +598,7 @@ Time Series
 pandas has simple, powerful, and efficient functionality for performing
 resampling operations during frequency conversion (e.g., converting secondly
 data into 5-minutely data). This is extremely common in, but not limited to,
-financial applications. See the :ref:`Time Series section `
+financial applications. See the :ref:`Time Series section `.
 
 .. ipython:: python
 
@@ -603,7 +606,7 @@ financial applications. See the :ref:`Time Series section `
 
    ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
    ts.resample('5Min').sum()
 
-Time zone representation
+Time zone representation:
 
 .. ipython:: python
 
@@ -613,13 +616,13 @@ Time zone representation
 
    ts_utc = ts.tz_localize('UTC')
    ts_utc
 
-Convert to another time zone
+Converting to another time zone:
 
 .. ipython:: python
 
    ts_utc.tz_convert('US/Eastern')
 
-Converting between time span representations
+Converting between time span representations:
 
 .. ipython:: python
 
@@ -659,14 +662,15 @@ Convert the raw grades to a categorical data type.
 
    df["grade"] = df["raw_grade"].astype("category")
    df["grade"]
 
-Rename the categories to more meaningful names (assigning to ``Series.cat.categories`` is inplace!)
+Rename the categories to more meaningful names (assigning to
+``Series.cat.categories`` is inplace!).
 
 .. ipython:: python
 
    df["grade"].cat.categories = ["very good", "good", "very bad"]
 
 Reorder the categories and simultaneously add the missing categories (methods under ``Series
-.cat`` return a new ``Series`` per default).
+.cat`` return a new ``Series`` by default).
 
 .. ipython:: python
 
@@ -679,7 +683,7 @@ Sorting is per order in the categories, not lexical order.
 
    df.sort_values(by="grade")
 
-Grouping by a categorical column shows also empty categories.
+Grouping by a categorical column also shows empty categories.
 
 .. ipython:: python
 
@@ -689,7 +693,7 @@ Grouping by a categorical column shows also empty categories.
 Plotting
 --------
 
-:ref:`Plotting ` docs.
+See the :ref:`Plotting ` docs.
 
 .. ipython:: python
    :suppress:
 
@@ -705,8 +709,8 @@ Plotting
 
    @savefig series_plot_basic.png
    ts.plot()
 
-On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the
-columns with labels:
+On a DataFrame, the :meth:`~DataFrame.plot` method is a convenience to plot all
+of the columns with labels:
 
 .. ipython:: python
 
@@ -723,13 +727,13 @@ Getting Data In/Out
 CSV
 ~~~
 
-:ref:`Writing to a csv file `
+:ref:`Writing to a csv file. `
 
 .. ipython:: python
 
    df.to_csv('foo.csv')
 
-:ref:`Reading from a csv file `
+:ref:`Reading from a csv file. `
 
 .. ipython:: python
 
@@ -743,15 +747,15 @@ CSV
 HDF5
 ~~~~
 
-Reading and writing to :ref:`HDFStores `
+Reading and writing to :ref:`HDFStores `.
 
-Writing to a HDF5 Store
+Writing to a HDF5 Store.
 
 .. ipython:: python
 
    df.to_hdf('foo.h5','df')
 
-Reading from a HDF5 Store
+Reading from a HDF5 Store.
 
 .. ipython:: python
 
@@ -765,15 +769,15 @@ Reading from a HDF5 Store
 Excel
 ~~~~~
 
-Reading and writing to :ref:`MS Excel `
+Reading and writing to :ref:`MS Excel `.
-Writing to an excel file
+Writing to an excel file.
 
 .. ipython:: python
 
    df.to_excel('foo.xlsx', sheet_name='Sheet1')
 
-Reading from an excel file
+Reading from an excel file.
 
 .. ipython:: python
 
@@ -787,7 +791,7 @@ Reading from an excel file
 Gotchas
 -------
 
-If you are trying an operation and you see an exception like:
+If you are attempting to perform an operation, you might see an exception like:
 
 .. code-block:: python
diff --git a/doc/source/_static/banklist.html b/doc/source/_static/banklist.html
index 8ec1561f8c3941..cbcce5a2d49ffe 100644
--- a/doc/source/_static/banklist.html
+++ b/doc/source/_static/banklist.html
@@ -7,7 +7,7 @@
[hunk body not recoverable: the HTML markup was stripped during extraction]
@@ -4849,7 +4849,7 @@
[hunk body not recoverable: HTML markup stripped; the only surviving context text is "Failed Bank List"]